Skip to content

GPU监控

环境: Ubuntu 22.04, NVIDIA RTX 4090

安装DCGM

# NVIDIA Data Center GPU Manager (DCGM)
# Register the CUDA network repository metadata and GPG key.
# The cuda-keyring package installs both the signing key and the apt source
# entry, so no separate add-apt-repository call is made here: a second,
# unsigned entry for the same URL can make "apt update" fail with a
# "Conflicting values set for option Signed-By" error.
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
# Major CUDA version printed by nvidia-smi; it selects the matching DCGM package.
CUDA_VERSION=$(nvidia-smi | sed -E -n 's/.*CUDA Version: ([0-9]+)[.].*/\1/p')
# ${CUDA_VERSION:?...} aborts the install when nvidia-smi is missing or printed
# no CUDA version, instead of requesting a package name with no CUDA suffix.
sudo apt-get install --yes \
  --install-recommends \
  "datacenter-gpu-manager-4-cuda${CUDA_VERSION:?nvidia-smi did not report a CUDA version}"
sudo systemctl --now enable nvidia-dcgm
# Inspect the host engine log
cat /var/log/nv-hostengine.log

配置远程访问

# Make nv-hostengine listen on all interfaces (-b ALL) so it can be reached
# from other hosts. A drop-in override is used instead of editing
# /lib/systemd/system/nvidia-dcgm.service in place, because the packaged unit
# file is replaced on package upgrades.
# NOTE(review): this exposes the host engine port to the network - confirm it
# is restricted by a firewall.
sudo mkdir -p /etc/systemd/system/nvidia-dcgm.service.d
sudo tee /etc/systemd/system/nvidia-dcgm.service.d/override.conf >/dev/null <<'EOF'
[Service]
# The empty ExecStart= clears the packaged command before the new one is set.
ExecStart=
ExecStart=/usr/bin/nv-hostengine -n --service-account nvidia-dcgm -b ALL
EOF
sudo systemctl daemon-reload
sudo systemctl restart nvidia-dcgm
# Verify that the host engine answers and lists its GPUs.
dcgmi discovery --host localhost -l

DCGM-Exporter

# DCGM-Exporter: monitor several GPU hosts, one exporter container per host.
# dcgm00 reads the local GPUs; dcgm01/dcgm02 are pointed at a remote host
# with -r <host>:5555. Each container is published on its own host port.
DCGM_EXPORTER_IMAGE=nvcr.io/nvidia/k8s/dcgm-exporter:4.1.1-4.0.4-ubuntu22.04

# Local GPUs -> host port 9400
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN \
  --name dcgm00 -p 9400:9400 \
  "${DCGM_EXPORTER_IMAGE}"

# 192.168.22.7 -> host port 9401
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN \
  --name dcgm01 -p 9401:9400 \
  "${DCGM_EXPORTER_IMAGE}" -r 192.168.22.7:5555

# 192.168.22.24 -> host port 9402
sudo docker run -it -d --gpus all --cap-add SYS_ADMIN \
  --name dcgm02 -p 9402:9400 \
  "${DCGM_EXPORTER_IMAGE}" -r 192.168.22.24:5555

# Spot-check that an exporter is serving metrics.
curl 192.168.22.11:9402/metrics

prometheus配置

vi gpu.yml
#id
# One entry per dcgm-exporter container. The labels mapping must be nested
# under its list item, otherwise the file is not valid YAML for a target list.
- targets: ['192.168.22.11:9400']
  labels:
    instance: gaojinbo-22.11
- targets: ['192.168.22.11:9401']
  labels:
    instance: lixiaolong-22.7
- targets: ['192.168.22.11:9402']
  labels:
    instance: liubang-22.24
# Ask the local Prometheus to reload its configuration.
# NOTE: the /-/reload endpoint is only served when Prometheus was started
# with --web.enable-lifecycle.
curl -XPOST 127.0.0.1:9090/-/reload
#在 Grafana 中导入面板 (Dashboards → Import)
面板 ID: 12239

修改grafana面板

将 “GPU Framebuffer Mem Used” 面板的查询 (Metrics browser) 修改为:

DCGM_FI_DEV_FB_USED{instance=~"${instance}", gpu=~"${gpu}"}

Legend修改为

{{instance}} GPU {{gpu}}