From aeba7b0f91046f6498ed00aa374c2cf90ffcb6a6 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 15 Oct 2020 14:12:15 -0400 Subject: [PATCH] Integrate RDC with Grafana A new Grafana dashboard file rdc_grafana_dashboard_example.json has been added to the folder python_binding. User can import this dashboard to monitor multiple compute nodes. To display the host name only in the dashboard, the rdc_prometheus_example.yml is also changed to create a new label short_instance which will not have the port number. Change-Id: I9ab91838006d59c8dcb5fea01decb8c799484e1d --- .../rdc_grafana_dashboard_example.json | 991 ++++++++++++++++++ python_binding/rdc_prometheus_example.yml | 6 + 2 files changed, 997 insertions(+) create mode 100644 python_binding/rdc_grafana_dashboard_example.json diff --git a/python_binding/rdc_grafana_dashboard_example.json b/python_binding/rdc_grafana_dashboard_example.json new file mode 100644 index 0000000000..08ecd22acf --- /dev/null +++ b/python_binding/rdc_grafana_dashboard_example.json @@ -0,0 +1,991 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Dashboard to monitor AMD GPUs using RDC", + "editable": true, + "gnetId": 11756, + "graphTooltip": 0, + "id": 4, + "iteration": 1599146807681, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 26, + "interval": "1s", + "legend": { + "avg": false, + "current": true, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pluginVersion": "6.7.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"0\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{short_instance}}:gpu0", + "metric": "", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"1\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu1", + "refId": "B" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"2\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2", + "refId": "C" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"3\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3", + "refId": "D" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"4\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4", + "refId": "E" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"5\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5", + "refId": "F" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"6\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6", + "refId": "G" + }, + { + "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"7\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7", + "refId": "H" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average GPU Package Power (Watt)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 45, + "interval": "1s", + "legend": { + "avg": false, + "current": false, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"1\"}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{short_instance}}:gpu1", + "metric": "", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"2\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2", + "refId": "B" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"3\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3", + "refId": "C" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"4\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4", + "refId": "D" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"5\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5", + "refId": "E" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"6\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6", + "refId": "F" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"7\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7", + "refId": "G" + }, + { + "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"0\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu0", + "refId": "H" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Usage (%)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "hiddenSeries": false, + "id": 27, + "interval": "1s", + "legend": { + "avg": false, + "current": false, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"1\"}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{short_instance}}:gpu1", + "metric": "", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"2\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2", + "refId": "B" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"3\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3", + "refId": "C" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"4\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4", + "refId": "D" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"5\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5", + "refId": "E" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"6\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6", + "refId": "F" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"7\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7", + "refId": "G" + }, + { + "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"0\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu0", + "refId": "H" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Clock Speed (MHz)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "description": "The GPU temperature in degree", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 86, + "interval": "1s", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"0\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu0 - Allocated", + "refId": "I" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"7\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7 - Allocated", + "refId": "J" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"6\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6 - Allocated", + "refId": "K" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"5\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5 - Allocated", + "refId": "L" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"4\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4 - Allocated", + "refId": "M" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"3\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3 - Allocated", + "refId": "N" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"2\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2 - Allocated", + "refId": "O" + }, + { + "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"1\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu1 - Allocated", + "refId": "P" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Temperature (Celsius)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "description": "the amount of total available and allocated VRAM", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "hiddenSeries": false, + "id": 65, + "interval": "1s", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu0 - Allocated", + "refId": "I" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7 - Allocated", + "refId": "J" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6 - Allocated", + "refId": "K" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5 - Allocated", + "refId": "L" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4 - Allocated", + "refId": "M" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3 - Allocated", + "refId": "N" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2 - Allocated", + "refId": "O" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"} / 1024", + "interval": "", + "legendFormat": "{{short_instance}}:gpu1 - Allocated", + "refId": "P" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Memory Allocation (GB)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "description": "indicate how busy the respective mem blocks are", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 31 + }, + "hiddenSeries": false, + "id": 64, + "interval": "1s", + "legend": { + "avg": false, + "current": false, + "max": true, + "min": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxPerRow": 6, + "nullPointMode": "connected", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": "node", + "repeatDirection": "h", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"1\"}", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{short_instance}}:gpu1", + "metric": "", + "refId": "A", + "step": 1200, + "target": "" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"2\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu2", + "refId": "B" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"3\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu3", + "refId": "C" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"4\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu4", + "refId": "D" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"5\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu5", + "refId": "E" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"6\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu6", + "refId": "F" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"7\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu7", + "refId": "G" + }, + { + "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"0\"}", + "interval": "", + "legendFormat": "{{short_instance}}:gpu0", + "refId": "H" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "GPU Memory Activity Level (%)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 25, + "style": "dark", + "tags": [ + "IB" + ], + "templating": { + "list": [ + { + "allFormat": "glob", + "allValue": null, + "current": { + "selected": true, + "tags": [], + "text": "", + "value": [] + }, + "datasource": "prometheus", + "definition": "label_values(instance)", + "hide": 0, + "includeAll": false, + "label": "Host", + "multi": true, + "multiFormat": "regex values", + "name": "node", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "/(.*):.*/", + "skipUrlSync": false, + "sort": 1, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "1m", + "2m", + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "ROCm Data Center tool V1.0", + "uid": "thisIsAuniqueID", + "version": 21 +} diff --git a/python_binding/rdc_prometheus_example.yml b/python_binding/rdc_prometheus_example.yml index 2f63443398..44e7c665e8 100644 --- a/python_binding/rdc_prometheus_example.yml +++ b/python_binding/rdc_prometheus_example.yml @@ -12,6 +12,12 @@ scrape_configs: # metrics_path defaults to '/metrics' # scheme defaults to 'http'. + # Remove the port for display + relabel_configs: + - source_labels: [__address__] + regex: '([^:]+):\d+' + target_label: short_instance + file_sd_configs: - files: - 'prometheus_targets.json'