From 34c5d447e10c0d110683104a6e4c7ba660cbf5d7 Mon Sep 17 00:00:00 2001 From: bobtiji Date: Sat, 22 Jan 2022 07:49:09 -0500 Subject: [PATCH 1/5] first --- .../Nvidia DGCM exporter/docker-compose.yml | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 docker-compose/prometheus/exporters/Nvidia DGCM exporter/docker-compose.yml diff --git a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/docker-compose.yml b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/docker-compose.yml new file mode 100644 index 0000000..874136e --- /dev/null +++ b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/docker-compose.yml @@ -0,0 +1,21 @@ +--- +version: '3' +services: + + nvidia_exporter: # Exports data from DCGM on the host; the host needs DCGM installed at a version equal to or newer than the one in this container: https://github.com/NVIDIA/DCGM + image: nvcr.io/nvidia/k8s/dcgm-exporter:2.3.2-2.6.2-ubuntu20.04 + container_name: nvidia_exporter + runtime: nvidia + cap_add: + - SYS_ADMIN + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=all + ports: + - "9400:9400" + restart: unless-stopped + + # NVIDIA Data Center GPU Manager (DCGM): to export data from the DCGM host to Prometheus, the host needs both DCGM and the NVIDIA Container Toolkit installed. + # DCGM: https://github.com/NVIDIA/DCGM + # NVIDIA Container Toolkit install guide: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide + # dcgm-exporter: https://github.com/NVIDIA/dcgm-exporter and https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html \ No newline at end of file From cf288ca19d22c400d51a31f8f779c356cfba57f8 Mon Sep 17 00:00:00 2001 From: bobtiji Date: Sat, 22 Jan 2022 08:14:33 -0500 Subject: [PATCH 2/5] readme for first commit --- .../exporters/Nvidia DGCM exporter/README.md | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md diff --git 
a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md new file mode 100644 index 0000000..f6c214d --- /dev/null +++ b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md @@ -0,0 +1,38 @@ +# Prerequisite + + NVIDIA container toolkit + sudo apt -y install build-essential nvidia-cuda-toolkit nvidia-headless-495 nvidia-utils-495 libnvidia-encode-495 \ + && distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ + && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \ + && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list \ + && sudo apt update \ + && sudo apt -y install nvidia-container-toolkit nvidia-container-runtime nvidia-docker2 + + + DCGM on host machine running nvidia GPU + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \ + && sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 \ + && sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \ + && sudo add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /" \ + && sudo apt update \ + && sudo apt install -y datacenter-gpu-manager \ + && sudo systemctl --now enable nvidia-dcgm + +## Deployment + +1. Modify the prometheus configuration template `/etc/prometheus/prometheus.yml` location. +2. 
# job for nvidia DCGM exporter + - job_name: 'nvidia_exporter' + static_configs: + - targets: ['nvidia_exporter:9400'] # if nvidia_exporter container is not on same docker network , change this line to "- targets: ['whichever ip your host is:9400']" + +## Configuration + +None + +# Additional References +[Official DCGM Documentations](https://github.com/NVIDIA/DCGM) +[Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide) +[Nvidia DCGM exporter Documentation](https://github.com/NVIDIA/dcgm-exporter) and (https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html) +[Official Prometheus Documentation](https://prometheus.io/docs/introduction/overview/) +[Some grafana dashboard, not perfect, old, but configurable](https://grafana.com/grafana/dashboards/11578) \ No newline at end of file From 8f75f764903cd5dfcbfc47771524aa2343aa1c62 Mon Sep 17 00:00:00 2001 From: bobtiji Date: Sat, 22 Jan 2022 08:17:32 -0500 Subject: [PATCH 3/5] Typos --- .../prometheus/exporters/Nvidia DGCM exporter/README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md index f6c214d..fccafab 100644 --- a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md +++ b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md @@ -9,7 +9,7 @@ && sudo apt -y install nvidia-container-toolkit nvidia-container-runtime nvidia-docker2 - DCGM on host machine running nvidia GPU + DCGM on host machine running Nvidia GPU wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin \ && sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600 \ && sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub \ @@ -21,15 +21,11 @@ ## 
Deployment 1. Modify the prometheus configuration template `/etc/prometheus/prometheus.yml` location. -2. # job for nvidia DCGM exporter +# job for nvidia DCGM exporter - job_name: 'nvidia_exporter' static_configs: - targets: ['nvidia_exporter:9400'] # if nvidia_exporter container is not on same docker network , change this line to "- targets: ['whichever ip your host is:9400']" -## Configuration - -None - # Additional References [Official DCGM Documentations](https://github.com/NVIDIA/DCGM) [Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide) From ccd4833bf41edc760956e5a214a92390df9bf245 Mon Sep 17 00:00:00 2001 From: bobtiji Date: Sat, 22 Jan 2022 08:20:11 -0500 Subject: [PATCH 4/5] References changed --- .../prometheus/exporters/Nvidia DGCM exporter/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md index fccafab..d2d54c7 100644 --- a/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md +++ b/docker-compose/prometheus/exporters/Nvidia DGCM exporter/README.md @@ -21,7 +21,7 @@ ## Deployment 1. Modify the prometheus configuration template `/etc/prometheus/prometheus.yml` location. 
-# job for nvidia DCGM exporter +# Job for Nvidia DCGM exporter in prometheus config file - job_name: 'nvidia_exporter' static_configs: - targets: ['nvidia_exporter:9400'] # if nvidia_exporter container is not on same docker network , change this line to "- targets: ['whichever ip your host is:9400']" @@ -29,6 +29,7 @@ # Additional References [Official DCGM Documentations](https://github.com/NVIDIA/DCGM) [Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide) -[Nvidia DCGM exporter Documentation](https://github.com/NVIDIA/dcgm-exporter) and (https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html) +[Nvidia DCGM exporter Documentation](https://github.com/NVIDIA/dcgm-exporter) +[Nvidia DCGM exporter Documentation-2](https://docs.nvidia.com/datacenter/cloud-native/gpu-telemetry/dcgm-exporter.html) [Official Prometheus Documentation](https://prometheus.io/docs/introduction/overview/) [Some grafana dashboard, not perfect, old, but configurable](https://grafana.com/grafana/dashboards/11578) \ No newline at end of file From 6e93d47604fe1894c5fbfa815b6e39b8eea02db1 Mon Sep 17 00:00:00 2001 From: bobtiji Date: Sat, 22 Jan 2022 11:54:59 -0500 Subject: [PATCH 5/5] Added Nvidia_smi compose and readme --- .../exporters/Nvidia_smi_exporter/README.md | 23 +++++++++++++++++++ .../Nvidia_smi_exporter/docker-compose.yml | 19 +++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 docker-compose/prometheus/exporters/Nvidia_smi_exporter/README.md create mode 100644 docker-compose/prometheus/exporters/Nvidia_smi_exporter/docker-compose.yml diff --git a/docker-compose/prometheus/exporters/Nvidia_smi_exporter/README.md b/docker-compose/prometheus/exporters/Nvidia_smi_exporter/README.md new file mode 100644 index 0000000..1a12ef5 --- /dev/null +++ b/docker-compose/prometheus/exporters/Nvidia_smi_exporter/README.md @@ -0,0 +1,23 @@ +# Prerequisite + + NVIDIA container toolkit + 
sudo apt -y install build-essential nvidia-cuda-toolkit nvidia-headless-495 nvidia-utils-495 libnvidia-encode-495 \ + && distribution=$(. /etc/os-release;echo $ID$VERSION_ID) \ + && curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - \ + && curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list \ + && sudo apt update \ + && sudo apt -y install nvidia-container-toolkit nvidia-container-runtime nvidia-docker2 + +## Deployment + +1. Modify the prometheus configuration template `/etc/prometheus/prometheus.yml` location. +# Job for Nvidia SMI exporter in prometheus config file + - job_name: 'nvidia_smi_exporter' + static_configs: + - targets: ['nvidia_smi_exporter:9835'] # if the nvidia_smi_exporter container is not on the same Docker network, change this line to "- targets: ['whichever ip your host is:9835']" + +# Additional References +[Nvidia container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#install-guide) +[Nvidia GPU exporter Documentation](https://github.com/utkuozdemir/nvidia_gpu_exporter) +[Official Prometheus Documentation](https://prometheus.io/docs/introduction/overview/) +[Some grafana dashboard, not perfect, old, but configurable](https://grafana.com/grafana/dashboards/14574) \ No newline at end of file diff --git a/docker-compose/prometheus/exporters/Nvidia_smi_exporter/docker-compose.yml b/docker-compose/prometheus/exporters/Nvidia_smi_exporter/docker-compose.yml new file mode 100644 index 0000000..795abcf --- /dev/null +++ b/docker-compose/prometheus/exporters/Nvidia_smi_exporter/docker-compose.yml @@ -0,0 +1,19 @@ +--- +version: '3' +services: + nvidia_smi_exporter: # Exports data from nvidia-smi; requires nvidia-smi and nvidia-container-toolkit installed on the host. 
+ image: utkuozdemir/nvidia_gpu_exporter:0.3.0 + container_name: nvidia_smi_exporter + runtime: nvidia + environment: + - NVIDIA_VISIBLE_DEVICES=all + - NVIDIA_DRIVER_CAPABILITIES=all + ports: + - "9835:9835" + volumes: + - /usr/bin/nvidia-smi:/usr/bin/nvidia-smi + - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so + - /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1:/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 + restart: unless-stopped + +#Mount points for volume work on Ubuntu 20.04 \ No newline at end of file