Infra : NKS
k8s VERSION : v1.27.9
OS-IMAGE : Ubuntu 20.04.3 LTS
KERNEL-VERSION : 5.4.0-163-generic
CONTAINER-RUNTIME: containerd://1.6.28
node01@host-01:~$ kubectl describe node | grep nvidia
nvidia.com/cuda.driver.major=470
nvidia.com/cuda.driver.minor=199
nvidia.com/cuda.driver.rev=02
nvidia.com/cuda.runtime.major=11
nvidia.com/cuda.runtime.minor=4
nvidia.com/gfd.timestamp=1713229473
nvidia.com/gpu-driver-upgrade-state=upgrade-done
nvidia.com/gpu.compute.major=7
nvidia.com/gpu.compute.minor=5
nvidia.com/gpu.count=1
nvidia.com/gpu.deploy.container-toolkit=true
nvidia.com/gpu.deploy.dcgm=true
nvidia.com/gpu.deploy.dcgm-exporter=true
nvidia.com/gpu.deploy.device-plugin=true
nvidia.com/gpu.deploy.driver=pre-installed
nvidia.com/gpu.deploy.gpu-feature-discovery=true
nvidia.com/gpu.deploy.node-status-exporter=true
nvidia.com/gpu.deploy.operator-validator=true
nvidia.com/gpu.family=turing
nvidia.com/gpu.machine=HVM-domU
nvidia.com/gpu.memory=16127
nvidia.com/gpu.present=true
nvidia.com/gpu.product=Tesla-T4-SHARED
nvidia.com/gpu.replicas=4
nvidia.com/mig.capable=false
nvidia.com/mig.strategy=single
nvidia.com/gpu-driver-upgrade-enabled: true
혹시 드라이버 재설치가 필요하다면
https://guide.ncloud-docs.com/docs/server-gpuserver-vpc
드라이버 설치 파일은 아래처럼 버전별 URL에서 내려받는다.
470.199.02
https://kr.download.nvidia.com/tesla/470.199.02/NVIDIA-Linux-x86_64-470.199.02.run
550.54.15
https://kr.download.nvidia.com/tesla/550.54.15/NVIDIA-Linux-x86_64-550.54.15.run
Capacity:
cpu: 1
ephemeral-storage: 51288544Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 20462056Ki
nvidia.com/gpu: 1
pods: 110
Allocatable:
cpu: 3920m
ephemeral-storage: 47267522073
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 17413608Ki
nvidia.com/gpu: 1
pods: 110
# ConfigMap holding NVIDIA device-plugin time-slicing profiles.
# Each key under `data` is one named profile whose value is a complete
# device-plugin config document stored as a literal block scalar. A profile is
# selected by name, e.g. through ClusterPolicy spec.devicePlugin.config.default.
# `replicas` is the number of schedulable nvidia.com/gpu slices advertised per
# physical GPU (the in-scalar note "쪼갤만큼" means "as many as you want to
# split into").
apiVersion: v1
data:
  a100-80gb: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 4 # 쪼갤만큼
  tesla-t4: |-
    version: v1
    sharing:
      timeSlicing:
        resources:
        - name: nvidia.com/gpu
          replicas: 4 # 쪼갤만큼
kind: ConfigMap
metadata:
  name: time-slicing-config
  namespace: gpu-operator
# Point the GPU Operator device plugin at the time-slicing-config ConfigMap and
# make its "tesla-t4" entry the default profile.
kubectl patch clusterpolicy/cluster-policy \
  --namespace gpu-operator \
  --type=merge \
  --patch '{"spec": {"devicePlugin": {"config": {"name": "time-slicing-config", "default": "tesla-t4"}}}}'
Capacity:
cpu: 1
ephemeral-storage: 51288544Ki
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 20462056Ki
nvidia.com/gpu: 4
pods: 110
Allocatable:
cpu: 3920m
ephemeral-storage: 47267522073
hugepages-1Gi: 0
hugepages-2Mi: 0
memory: 17413608Ki
nvidia.com/gpu: 4
pods: 110
참고자료
GPU 쪼개기