GPU监控
环境: ubuntu22.04 nvidia 4090
安装DCGM
# NVIDIA 数据中心 GPU 管理器 (DCGM)
# 设置 CUDA 网络存储库元数据和 GPG 密钥:
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
apt update
CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p')
sudo apt-get install --yes \
    --install-recommends \
    datacenter-gpu-manager-4-cuda${CUDA_VERSION}
systemctl --now enable nvidia-dcgm
# 查看日志
cat /var/log/nv-hostengine.log
配置远程访问
vi /lib/systemd/system/nvidia-dcgm.service
# 将 ExecStart 一行修改为(-b ALL 表示监听所有网卡地址):
ExecStart=/usr/bin/nv-hostengine -n --service-account nvidia-dcgm -b ALL
systemctl daemon-reload
systemctl restart nvidia-dcgm
dcgmi discovery --host localhost -l
DCGM-Exporter
# DCGM-Exporter 监控多台 GPU
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN --name dcgm00 -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.4-ubuntu22.04 # 本机 GPU
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN --name dcgm01 -p 9401:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.4-ubuntu22.04 -r 192.168.22.7:5555
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN --name dcgm02 -p 9402:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.4-ubuntu22.04 -r 192.168.22.24:5555
curl 192.168.22.11:9402/metrics
prometheus配置
vi gpu.yml
#id
- targets: ['192.168.22.11:9400']
  labels:
    instance: gaojinbo-22.11
- targets: ['192.168.22.11:9401']
  labels:
    instance: lixiaolong-22.7
- targets: ['192.168.22.11:9402']
  labels:
    instance: liubang-22.24
curl -XPOST 127.0.0.1:9090/-/reload
#grafana导入面板ID 12239
修改grafana面板
将 "GPU Framebuffer Mem Used" 面板的 Metrics browser 查询修改为:
DCGM_FI_DEV_FB_USED{instance=~"${instance}", gpu=~"${gpu}"}
Legend修改为
{{instance}} GPU {{gpu}}