ファイル
Bill(Shuzhou) Liu 1602a481cf Integrate RDC with Grafana
A new Grafana dashboard file, rdc_grafana_dashboard_example.json,
has been added to the python_binding folder. Users can import
this dashboard to monitor multiple compute nodes.

To display only the host name in the dashboard,
rdc_prometheus_example.yml is also changed to create a new label,
short_instance, which omits the port number.

Change-Id: I9ab91838006d59c8dcb5fea01decb8c799484e1d


[ROCm/rdc commit: aeba7b0f91]
2020-10-15 14:12:15 -04:00

992 行
24 KiB
JSON

{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Dashboard to monitor AMD GPUs using RDC",
"editable": true,
"gnetId": 11756,
"graphTooltip": 0,
"id": 4,
"iteration": 1599146807681,
"links": [],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 26,
"interval": "1s",
"legend": {
"avg": false,
"current": true,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pluginVersion": "6.7.3",
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": true,
"targets": [
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"0\"}",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{short_instance}}:gpu0",
"metric": "",
"refId": "A",
"step": 1200,
"target": ""
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"1\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu1",
"refId": "B"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"2\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2",
"refId": "C"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"3\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3",
"refId": "D"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"4\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4",
"refId": "E"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"5\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5",
"refId": "F"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"6\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6",
"refId": "G"
},
{
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"7\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7",
"refId": "H"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Average GPU Package Power (Watt)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 45,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"1\"}",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{short_instance}}:gpu1",
"metric": "",
"refId": "A",
"step": 1200,
"target": ""
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"2\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2",
"refId": "B"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"3\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3",
"refId": "C"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"4\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4",
"refId": "D"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"5\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5",
"refId": "E"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"6\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6",
"refId": "F"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"7\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7",
"refId": "G"
},
{
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"0\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu0",
"refId": "H"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Usage (%)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 15
},
"hiddenSeries": false,
"id": 27,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"1\"}",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{short_instance}}:gpu1",
"metric": "",
"refId": "A",
"step": 1200,
"target": ""
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"2\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2",
"refId": "B"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"3\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3",
"refId": "C"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"4\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4",
"refId": "D"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"5\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5",
"refId": "E"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"6\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6",
"refId": "F"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"7\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7",
"refId": "G"
},
{
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"0\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu0",
"refId": "H"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Clock Speed (MHz)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"description": "The GPU temperature in degrees Celsius",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 23
},
"hiddenSeries": false,
"id": 86,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"0\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu0",
"refId": "I"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"7\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7",
"refId": "J"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"6\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6",
"refId": "K"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"5\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5",
"refId": "L"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"4\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4",
"refId": "M"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"3\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3",
"refId": "N"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"2\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2",
"refId": "O"
},
{
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"1\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu1",
"refId": "P"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Temperature (Celsius)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"description": "The amount of VRAM currently allocated on each GPU",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 23
},
"hiddenSeries": false,
"id": 65,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu0 - Allocated",
"refId": "I"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7 - Allocated",
"refId": "J"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6 - Allocated",
"refId": "K"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5 - Allocated",
"refId": "L"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4 - Allocated",
"refId": "M"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3 - Allocated",
"refId": "N"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2 - Allocated",
"refId": "O"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"} / 1024",
"interval": "",
"legendFormat": "{{short_instance}}:gpu1 - Allocated",
"refId": "P"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Memory Allocation (GB)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "prometheus",
"decimals": 0,
"description": "Percentage of each GPU's total memory that is currently in use (gpu_memory_usage / gpu_memory_total)",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {}
},
"overrides": []
},
"fill": 0,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 31
},
"hiddenSeries": false,
"id": 64,
"interval": "1s",
"legend": {
"avg": false,
"current": false,
"max": true,
"min": true,
"show": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"maxPerRow": 6,
"nullPointMode": "connected",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": "node",
"repeatDirection": "h",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"1\"}",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{short_instance}}:gpu1",
"metric": "",
"refId": "A",
"step": 1200,
"target": ""
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"2\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu2",
"refId": "B"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"3\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu3",
"refId": "C"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"4\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu4",
"refId": "D"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"5\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu5",
"refId": "E"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"6\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu6",
"refId": "F"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"7\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu7",
"refId": "G"
},
{
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"0\"}",
"interval": "",
"legendFormat": "{{short_instance}}:gpu0",
"refId": "H"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GPU Memory Activity Level (%)",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 0,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": false,
"schemaVersion": 25,
"style": "dark",
"tags": [
"IB"
],
"templating": {
"list": [
{
"allFormat": "glob",
"allValue": null,
"current": {
"selected": true,
"tags": [],
"text": "",
"value": []
},
"datasource": "prometheus",
"definition": "label_values(instance)",
"hide": 0,
"includeAll": false,
"label": "Host",
"multi": true,
"multiFormat": "regex values",
"name": "node",
"options": [],
"query": "label_values(instance)",
"refresh": 1,
"regex": "/(.*):.*/",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-30m",
"to": "now"
},
"timepicker": {
"now": true,
"refresh_intervals": [
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"1m",
"2m",
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "ROCm Data Center tool V1.0",
"uid": "thisIsAuniqueID",
"version": 21
}