{ "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Dashboard to monitor AMD GPUs using RDC", "editable": true, "gnetId": 11756, "graphTooltip": 0, "id": 4, "iteration": 1599146807681, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 0 }, "hiddenSeries": false, "id": 26, "interval": "1s", "legend": { "avg": false, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 6, "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pluginVersion": "6.7.3", "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": true, "targets": [ { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"0\"}", "interval": "", "intervalFactor": 1, "legendFormat": "{{short_instance}}:gpu0", "metric": "", "refId": "A", "step": 1200, "target": "" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"1\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu1", "refId": "B" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"2\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu2", "refId": "C" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"3\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu3", "refId": "D" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"4\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu4", "refId": "E" }, { "expr": 
"power_usage{instance=~\"$node.*\",gpu_index=\"5\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu5", "refId": "F" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"6\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu6", "refId": "G" }, { "expr": "power_usage{instance=~\"$node.*\",gpu_index=\"7\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu7", "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Average GPU Package Power (Watt)", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 7 }, "hiddenSeries": false, "id": 45, "interval": "1s", "legend": { "avg": false, "current": false, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 6, "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"1\"}", "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{short_instance}}:gpu1", "metric": "", "refId": "A", "step": 1200, "target": "" }, { "expr": 
"gpu_util{instance=~\"$node.*\",gpu_index=\"2\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu2", "refId": "B" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"3\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu3", "refId": "C" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"4\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu4", "refId": "D" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"5\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu5", "refId": "E" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"6\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu6", "refId": "F" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"7\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu7", "refId": "G" }, { "expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"0\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu0", "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Usage (%)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, "hiddenSeries": false, "id": 27, "interval": "1s", "legend": { "avg": false, "current": false, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"maxPerRow": 6, "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"1\"}", "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{short_instance}}:gpu1", "metric": "", "refId": "A", "step": 1200, "target": "" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"2\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu2", "refId": "B" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"3\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu3", "refId": "C" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"4\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu4", "refId": "D" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"5\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu5", "refId": "E" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"6\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu6", "refId": "F" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"7\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu7", "refId": "G" }, { "expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"0\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu0", "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Clock Speed (MHz)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null 
} }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "description": "The GPU temperature in degrees Celsius", "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, "hiddenSeries": false, "id": 86, "interval": "1s", "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 6, "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"0\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu0", "refId": "I" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"7\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu7", "refId": "J" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"6\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu6", "refId": "K" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"5\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu5", "refId": "L" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"4\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu4", "refId": "M" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"3\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu3", "refId": "N" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"2\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu2", "refId": "O" }, { "expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"1\"}", "interval": "", "legendFormat": 
"{{short_instance}}:gpu1", "refId": "P" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Temperature (Celsius)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "description": "The amount of allocated VRAM per GPU", "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 23 }, "hiddenSeries": false, "id": 65, "interval": "1s", "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 6, "nullPointMode": "connected", "options": { "dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu0 - Allocated", "refId": "I" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu7 - Allocated", "refId": "J" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu6 - Allocated", "refId": "K" }, { "expr": 
"gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu5 - Allocated", "refId": "L" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu4 - Allocated", "refId": "M" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu3 - Allocated", "refId": "N" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu2 - Allocated", "refId": "O" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"} / 1024", "interval": "", "legendFormat": "{{short_instance}}:gpu1 - Allocated", "refId": "P" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GPU Memory Allocation (GB)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "prometheus", "decimals": 0, "description": "Indicates how busy the respective memory blocks are, computed as allocated memory as a percentage of total memory", "editable": true, "error": false, "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, "fill": 0, "fillGradient": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 31 }, "hiddenSeries": false, "id": 64, "interval": "1s", "legend": { "avg": false, "current": false, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 6, "nullPointMode": "connected", "options": { 
"dataLinks": [] }, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "node", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"1\"}", "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{short_instance}}:gpu1", "metric": "", "refId": "A", "step": 1200, "target": "" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"2\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu2", "refId": "B" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"3\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu3", "refId": "C" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"4\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu4", "refId": "D" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"5\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu5", "refId": "E" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"6\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu6", "refId": "F" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"7\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu7", "refId": "G" }, { "expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"0\"}", "interval": "", "legendFormat": "{{short_instance}}:gpu0", "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], 
"timeShift": null, "title": "GPU Memory Activity Level (%)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 25, "style": "dark", "tags": [ "IB" ], "templating": { "list": [ { "allFormat": "glob", "allValue": null, "current": { "selected": true, "tags": [], "text": "", "value": [] }, "datasource": "prometheus", "definition": "label_values(instance)", "hide": 0, "includeAll": false, "label": "Host", "multi": true, "multiFormat": "regex values", "name": "node", "options": [], "query": "label_values(instance)", "refresh": 1, "regex": "/(.*):.*/", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-30m", "to": "now" }, "timepicker": { "now": true, "refresh_intervals": [ "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "1m", "2m", "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "ROCm Data Center tool V1.0", "uid": "thisIsAuniqueID", "version": 21 }