
intel_iommu=on (Intel 기준) 추가. iommu=pt nouveau.modeset=0 rd.driver.pre=bnxt_en,ixgbe rd.driver.post=tg3,igb modprobe.blacklist=nvidiafb,nouveau
드라이버 설정 시 3가지 정보가 필요함. 정보는 lspci 명령어를 통해 확인 가능.
[root@v100-gpu1 ~]# lspci -nn | grep -i nvidia
18:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
3b:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
86:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
af:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
위 예시에서 PCI bus ID는 18:00.0, Vendor ID는 10de, Product ID는 1db5 임.
아무 설정을 하지 않은 상태에서 GPU의 커널 Driver를 확인해보면 CentOS에서 기본적으로 제공하는 비디오 Driver인 nouveau가 활성화 되어 있을 것이다. GPU의 기본 드라이버로 비디오 드라이버가 바인딩 되어 있으면 GPU passthrough가 정상 동작하지 않으므로 적절한 드라이버를 재설정 해줘야 한다.
[root@v100-gpu1 default]# lspci -s af:00.0 -nk
af:00.0 0302: 10de:1db5 (rev a1)
Subsystem: 10de:1249
Kernel driver in use: nouveau
Kernel modules: nouveau
[root@v100-gpu1 ~]# cat /etc/modprobe.d/blacklist-nouveau.conf
blacklist nouveau
options nouveau modeset=0
[root@v100-gpu1 ~]# cat /etc/modprobe.d/vfio.conf
options vfio-pci ids=10de:1db5
[root@v100-gpu1 ~]# cat /etc/modules-load.d/vfio-pci.conf
vfio-pci
[root@v100-gpu1 ~]# cat /etc/nova/nova.conf
...
# Add GPU PCI Address whitelist V100
[pci]
passthrough_whitelist = { "vendor_id": "10de", "product_id": "1db5" }
PciPassthroughFilter, IsolatedHostsFilter 추가
[root@controller1 ~]# cat /etc/nova/nova.conf
...
[filter_scheduler]
...
enabled_filters = AvailabilityZoneFilter,ComputeFilter,AggregateNumInstancesFilter,AggregateIoOpsFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,ServerGroupAntiAffinityFilter,ServerGroupAffinityFilter,NUMATopologyFilter,PciPassthroughFilter,IsolatedHostsFilter
host_subset_size = 10
tracks_instance_changes = True
# GPU Isolated Hosts
isolated_hosts = gov-t4-gpu1, gpu-t4-gpu2, gpu-t4-gpu3, gpu-t4-gpu4, gpu-t4-gpu5, gpu-t4-gpu6, gpu-t4-gpu7, gpu-t4-gpu8, gpu-t4-gpu9, gpu-t4-gpu10, gov-t4-gpu11, gov-t4-gpu12,v100-gpu1
isolated_images = 5029452f-4cb1-4cb9-bb05-8494b7e84223, a838c826-a9a4-40bd-a1d2-a92397d8a9ed
[pci]
alias = { "vendor_id":"10de", "product_id":"1eb8", "device_type":"type-PF", "name":"T4" }
alias = { "vendor_id":"10de", "product_id":"1db5", "device_type":"type-PCI", "name":"V100" }
...
[root@v100-gpu1 ~]# dmesg -T| grep -i dmar | grep -i iommu | grep -i enable
[Mon Aug 1 14:30:59 2022] DMAR: IOMMU enabled
or
if compgen -G "/sys/kernel/iommu_groups/*/devices/*" > /dev/null; then
echo "AMD's IOMMU / Intel's VT-D is enabled in the BIOS/UEFI."
else
echo "AMD's IOMMU / Intel's VT-D is not enabled in the BIOS/UEFI"
fi
or
[root@v100-gpu1 ~]# virt-host-validate | grep -i iomm
QEMU: Checking for device assignment IOMMU support : PASS
QEMU: Checking if IOMMU is enabled by kernel : PASS
[root@v100-gpu1 ~]# lsmod | grep -i vfio_pci
vfio_pci 41993 4
irqbypass 13503 10 kvm,vfio_pci
vfio 32657 12 vfio_iommu_type1,vfio_pci
[root@v100-gpu1 ~]# lsmod | grep -i nouveau
[root@v100-gpu1 ~]# lspci -nn | grep -i nvidia
18:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
3b:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
86:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
af:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
# lspci -s [pci bus ID] -nnk 로 확인
[root@v100-gpu1 ~]# lspci -s 18:00.0 -nnk
18:00.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:1249]
Kernel driver in use: vfio-pci
Kernel modules: nouveau
[root@controller1 ~]# openstack flavor create \
> --vcpus 16 \
> --ram 65536 \
> --disk 100 \
> --property "pci_passthrough:alias"="V100:2" \
> v100_test_flavor
+----------------------------+--------------------------------------+
| Field | Value |
+----------------------------+--------------------------------------+
| OS-FLV-DISABLED:disabled | False |
| OS-FLV-EXT-DATA:ephemeral | 0 |
| disk | 100 |
| id | 8f5f540e-1000-4311-846c-aa89ae56b8e0 |
| name | v100_test_flavor |
| os-flavor-access:is_public | True |
| properties | pci_passthrough:alias='V100:2' |
| ram | 65536 |
| rxtx_factor | 1.0 |
| swap | |
| vcpus | 16 |
+----------------------------+--------------------------------------+
root@v100-test2:~# lspci -nn | grep -i nvidia
00:06.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
00:07.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
root@v100-test2:~# lspci -s 00:06.0 -nnk
00:06.0 3D controller [0302]: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] [10de:1db5] (rev a1)
Subsystem: NVIDIA Corporation Device [10de:1249]
Kernel modules: nvidiafb