SWDEV-504130 Add temperature violation status to amd-smi monitor (#2)

Added boolean temperature violation status to amd-smi monitor.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
This commit is contained in:
Kanangot Balakrishnan, Bindhiya
2025-01-08 16:35:53 -06:00
committed by GitHub
parent 129ad8ffad
commit d0e770ffbc
4 changed files with 42 additions and 5 deletions
+20
View File
@@ -7,6 +7,26 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Added
- **Added TVIOL_ACTIVE to `amd-smi monitor`**.
Added a status field to `amd-smi monitor` that indicates whether a temperature violation is currently active. TVIOL_ACTIVE is displayed as follows:
- True if a temperature violation is active
- False if no temperature violation is active
- N/A if not supported
Example CLI output:
```shell
$ amd-smi monitor --viol
GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL
0 100 % 1 % True 0 % 0 % 0 %
1 100 % 0 % False 0 % 0 % 0 %
2 100 % 0 % False 0 % 0 % 0 %
3 100 % 0 % False 0 % 0 % 0 %
4 100 % 0 % False 0 % 0 % 0 %
5 100 % 3 % True 0 % 0 % 0 %
6 100 % 0 % False 0 % 0 % 0 %
7 100 % 0 % False 0 % 0 % 0 %
```
- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth:
- `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s)
+11 -4
View File
@@ -5270,6 +5270,7 @@ class AMDSMICommands():
violation_status = {
"pviol": "N/A",
"tviol": "N/A",
"tviol_active": "N/A",
"phot_tviol": "N/A",
"vr_tviol": "N/A",
"hbm_tviol": "N/A",
@@ -5278,26 +5279,31 @@ class AMDSMICommands():
violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
violation_status['pviol'] = violations['per_ppt_pwr']
violation_status['tviol'] = violations['per_socket_thrm']
violation_status['tviol_active'] = violations['active_socket_thrm']
violation_status['phot_tviol'] = violations['per_prochot_thrm']
violation_status['vr_tviol'] = violations['per_vr_thrm']
violation_status['hbm_tviol'] = violations['per_hbm_thrm']
except amdsmi_exception.AmdSmiLibraryException as e:
monitor_values['pviol'] = violation_status['pviol']
monitor_values['tviol'] = violation_status['tviol']
monitor_values['tviol_active'] = violation_status['tviol_active']
monitor_values['phot_tviol'] = violation_status['phot_tviol']
monitor_values['vr_tviol'] = violation_status['vr_tviol']
monitor_values['hbm_tviol'] = violation_status['hbm_tviol']
logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info())
violation_status_unit = "%"
kTVIOL_MAX_WIDTH = 10
kPVIOL_MAX_WIDTH = 10
kTVIOL_MAX_WIDTH = 7
kTVIOL_ACTIVE_MAX_WIDTH = 14
kPVIOL_MAX_WIDTH = 7
kPHOT_MAX_WIDTH = 12
kVR_MAX_WIDTH = 10
kHBM_MAX_WIDTH = 11
for key, value in violation_status.items():
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
if key == "tviol_active":
monitor_values[key] = value
elif key != "tviol_active":
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
if self.logger.is_human_readable_format():
monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ')
monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ')
@@ -5306,6 +5312,7 @@ class AMDSMICommands():
monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ')
self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ')
self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ')
self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ')
self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ')
self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ')
+10
View File
@@ -170,6 +170,16 @@ class AMDSMILogger():
table_values += string_value.ljust(18)
elif key == "RW":
table_values += string_value.ljust(57)
elif key in ('pviol', 'tviol'):
table_values += string_value.rjust(7)
elif key == "tviol_active":
table_values += string_value.rjust(14)
elif key == "phot_tviol":
table_values += string_value.rjust(12)
elif key == "vr_tviol":
table_values += string_value.rjust(10)
elif key == "hbm_tviol":
table_values += string_value.rjust(11)
elif key == "process_list":
#Add an additional padding between the first instance of GPU and NAME
table_values += ' '
+1 -1
View File
@@ -794,7 +794,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
ss << __PRETTY_FUNCTION__ << " | "
<< "ENTERED socket_thm_residency_acc | per_socket_thrm: " << std::dec
<< violation_status->per_socket_thrm
<< "%; active_ppt_pwr = " << std::dec
<< "%; active_socket_thrm = " << std::dec
<< violation_status->active_socket_thrm << "\n";
LOG_DEBUG(ss);
}