1602a481cf
A new Grafana dashboard file, rdc_grafana_dashboard_example.json,
has been added to the python_binding folder. Users can import
this dashboard to monitor multiple compute nodes.
To display only the host name in the dashboard,
rdc_prometheus_example.yml has also been changed to create a new label,
short_instance, which does not include the port number.
Change-Id: I9ab91838006d59c8dcb5fea01decb8c799484e1d
[ROCm/rdc commit: aeba7b0f91]
992 wiersze
24 KiB
JSON
992 wiersze
24 KiB
JSON
{
|
|
"annotations": {
|
|
"list": [
|
|
{
|
|
"builtIn": 1,
|
|
"datasource": "-- Grafana --",
|
|
"enable": true,
|
|
"hide": true,
|
|
"iconColor": "rgba(0, 211, 255, 1)",
|
|
"name": "Annotations & Alerts",
|
|
"type": "dashboard"
|
|
}
|
|
]
|
|
},
|
|
"description": "Dashboard to monitor AMD GPUs using RDC",
|
|
"editable": true,
|
|
"gnetId": 11756,
|
|
"graphTooltip": 0,
|
|
"id": 4,
|
|
"iteration": 1599146807681,
|
|
"links": [],
|
|
"panels": [
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 7,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 0
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 26,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": true,
|
|
"max": true,
|
|
"min": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pluginVersion": "6.7.3",
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": true,
|
|
"targets": [
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"0\"}",
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"legendFormat": "{{short_instance}}:gpu0",
|
|
"metric": "",
|
|
"refId": "A",
|
|
"step": 1200,
|
|
"target": ""
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"1\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu1",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"2\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2",
|
|
"refId": "C"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"3\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3",
|
|
"refId": "D"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"4\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4",
|
|
"refId": "E"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"5\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5",
|
|
"refId": "F"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"6\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6",
|
|
"refId": "G"
|
|
},
|
|
{
|
|
"expr": "power_usage{instance=~\"$node.*\",gpu_index=\"7\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7",
|
|
"refId": "H"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "Average GPU Package Power (Watt)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 2,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 7
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 45,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": true,
|
|
"min": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"1\"}",
|
|
"instant": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"legendFormat": "{{short_instance}}:gpu1",
|
|
"metric": "",
|
|
"refId": "A",
|
|
"step": 1200,
|
|
"target": ""
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"2\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"3\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3",
|
|
"refId": "C"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"4\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4",
|
|
"refId": "D"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"5\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5",
|
|
"refId": "E"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"6\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6",
|
|
"refId": "F"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"7\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7",
|
|
"refId": "G"
|
|
},
|
|
{
|
|
"expr": "gpu_util{instance=~\"$node.*\",gpu_index=\"0\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu0",
|
|
"refId": "H"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Usage (%)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 15
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 27,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": true,
|
|
"min": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"1\"}",
|
|
"instant": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"legendFormat": "{{short_instance}}:gpu1",
|
|
"metric": "",
|
|
"refId": "A",
|
|
"step": 1200,
|
|
"target": ""
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"2\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"3\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3",
|
|
"refId": "C"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"4\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4",
|
|
"refId": "D"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"5\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5",
|
|
"refId": "E"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"6\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6",
|
|
"refId": "F"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"7\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7",
|
|
"refId": "G"
|
|
},
|
|
{
|
|
"expr": "gpu_clock{instance=~\"$node.*\",gpu_index=\"0\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu0",
|
|
"refId": "H"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Clock Speed (MHz)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"description": "The GPU temperature in degrees Celsius",
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 23
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 86,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"0\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu0",
|
|
"refId": "I"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"7\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7",
|
|
"refId": "J"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"6\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6",
|
|
"refId": "K"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"5\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5",
|
|
"refId": "L"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"4\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4",
|
|
"refId": "M"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"3\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3",
|
|
"refId": "N"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"2\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2",
|
|
"refId": "O"
|
|
},
|
|
{
|
|
"expr": "gpu_temp{instance=~\"$node.*\",gpu_index=\"1\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu1",
|
|
"refId": "P"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Temperature (Celsius)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"description": "The amount of VRAM currently allocated on each GPU",
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 23
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 65,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": false,
|
|
"min": false,
|
|
"show": true,
|
|
"total": false,
|
|
"values": false
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu0 - Allocated",
|
|
"refId": "I"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7 - Allocated",
|
|
"refId": "J"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6 - Allocated",
|
|
"refId": "K"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5 - Allocated",
|
|
"refId": "L"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4 - Allocated",
|
|
"refId": "M"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3 - Allocated",
|
|
"refId": "N"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2 - Allocated",
|
|
"refId": "O"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"} / 1024",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu1 - Allocated",
|
|
"refId": "P"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Memory Allocation (GB)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
},
|
|
{
|
|
"aliasColors": {},
|
|
"bars": false,
|
|
"dashLength": 10,
|
|
"dashes": false,
|
|
"datasource": "prometheus",
|
|
"decimals": 0,
|
|
"description": "Percentage of each GPU's total memory that is currently in use",
|
|
"editable": true,
|
|
"error": false,
|
|
"fieldConfig": {
|
|
"defaults": {
|
|
"custom": {}
|
|
},
|
|
"overrides": []
|
|
},
|
|
"fill": 0,
|
|
"fillGradient": 0,
|
|
"grid": {},
|
|
"gridPos": {
|
|
"h": 8,
|
|
"w": 12,
|
|
"x": 0,
|
|
"y": 31
|
|
},
|
|
"hiddenSeries": false,
|
|
"id": 64,
|
|
"interval": "1s",
|
|
"legend": {
|
|
"avg": false,
|
|
"current": false,
|
|
"max": true,
|
|
"min": true,
|
|
"show": true,
|
|
"total": false,
|
|
"values": true
|
|
},
|
|
"lines": true,
|
|
"linewidth": 2,
|
|
"links": [],
|
|
"maxPerRow": 6,
|
|
"nullPointMode": "connected",
|
|
"options": {
|
|
"dataLinks": []
|
|
},
|
|
"percentage": false,
|
|
"pointradius": 5,
|
|
"points": false,
|
|
"renderer": "flot",
|
|
"repeat": "node",
|
|
"repeatDirection": "h",
|
|
"seriesOverrides": [],
|
|
"spaceLength": 10,
|
|
"stack": false,
|
|
"steppedLine": false,
|
|
"targets": [
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"1\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"1\"}",
|
|
"instant": false,
|
|
"interval": "",
|
|
"intervalFactor": 1,
|
|
"legendFormat": "{{short_instance}}:gpu1",
|
|
"metric": "",
|
|
"refId": "A",
|
|
"step": 1200,
|
|
"target": ""
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"2\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"2\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu2",
|
|
"refId": "B"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"3\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"3\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu3",
|
|
"refId": "C"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"4\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"4\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu4",
|
|
"refId": "D"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"5\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"5\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu5",
|
|
"refId": "E"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"6\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"6\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu6",
|
|
"refId": "F"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"7\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"7\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu7",
|
|
"refId": "G"
|
|
},
|
|
{
|
|
"expr": "gpu_memory_usage{instance=~\"$node.*\",gpu_index=\"0\"}*100/gpu_memory_total{instance=~\"$node.*\",gpu_index=\"0\"}",
|
|
"interval": "",
|
|
"legendFormat": "{{short_instance}}:gpu0",
|
|
"refId": "H"
|
|
}
|
|
],
|
|
"thresholds": [],
|
|
"timeFrom": null,
|
|
"timeRegions": [],
|
|
"timeShift": null,
|
|
"title": "GPU Memory Activity Level (%)",
|
|
"tooltip": {
|
|
"msResolution": false,
|
|
"shared": true,
|
|
"sort": 0,
|
|
"value_type": "cumulative"
|
|
},
|
|
"type": "graph",
|
|
"xaxis": {
|
|
"buckets": null,
|
|
"mode": "time",
|
|
"name": null,
|
|
"show": true,
|
|
"values": []
|
|
},
|
|
"yaxes": [
|
|
{
|
|
"format": "short",
|
|
"label": "",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
},
|
|
{
|
|
"format": "short",
|
|
"logBase": 1,
|
|
"max": null,
|
|
"min": null,
|
|
"show": true
|
|
}
|
|
],
|
|
"yaxis": {
|
|
"align": false,
|
|
"alignLevel": null
|
|
}
|
|
}
|
|
],
|
|
"refresh": false,
|
|
"schemaVersion": 25,
|
|
"style": "dark",
|
|
"tags": [
|
|
"IB"
|
|
],
|
|
"templating": {
|
|
"list": [
|
|
{
|
|
"allFormat": "glob",
|
|
"allValue": null,
|
|
"current": {
|
|
"selected": true,
|
|
"tags": [],
|
|
"text": "",
|
|
"value": []
|
|
},
|
|
"datasource": "prometheus",
|
|
"definition": "label_values(instance)",
|
|
"hide": 0,
|
|
"includeAll": false,
|
|
"label": "Host",
|
|
"multi": true,
|
|
"multiFormat": "regex values",
|
|
"name": "node",
|
|
"options": [],
|
|
"query": "label_values(instance)",
|
|
"refresh": 1,
|
|
"regex": "/(.*):.*/",
|
|
"skipUrlSync": false,
|
|
"sort": 1,
|
|
"tagValuesQuery": "",
|
|
"tags": [],
|
|
"tagsQuery": "",
|
|
"type": "query",
|
|
"useTags": false
|
|
}
|
|
]
|
|
},
|
|
"time": {
|
|
"from": "now-30m",
|
|
"to": "now"
|
|
},
|
|
"timepicker": {
|
|
"now": true,
|
|
"refresh_intervals": [
|
|
"10s",
|
|
"30s",
|
|
"1m",
|
|
"5m",
|
|
"15m",
|
|
"30m",
|
|
"1h",
|
|
"2h",
|
|
"1d"
|
|
],
|
|
"time_options": [
|
|
"1m",
|
|
"2m",
|
|
"5m",
|
|
"15m",
|
|
"1h",
|
|
"6h",
|
|
"12h",
|
|
"24h",
|
|
"2d",
|
|
"7d",
|
|
"30d"
|
|
]
|
|
},
|
|
"timezone": "browser",
|
|
"title": "ROCm Data Center tool V1.0",
|
|
"uid": "thisIsAuniqueID",
|
|
"version": 21
|
|
}
|