diff --git a/CHANGELOG.md b/CHANGELOG.md index e4df3d0189..22768bd1cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,26 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added +- **Added TVIOL_ACTIVE to `amd-smi monitor`**. +Added a temperature violation active status to `amd-smi monitor`. TVIOL_ACTIVE is displayed as follows: + - True if active + - False if not active + - N/A if not supported. + + Example CLI output: +```shell +$ amd-smi monitor --viol +GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL + 0 100 % 1 % True 0 % 0 % 0 % + 1 100 % 0 % False 0 % 0 % 0 % + 2 100 % 0 % False 0 % 0 % 0 % + 3 100 % 0 % False 0 % 0 % 0 % + 4 100 % 0 % False 0 % 0 % 0 % + 5 100 % 3 % True 0 % 0 % 0 % + 6 100 % 0 % False 0 % 0 % 0 % + 7 100 % 0 % False 0 % 0 % 0 % +``` + - **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`** Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth: - `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index f88258e149..e2534c7b49 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -5270,6 +5270,7 @@ class AMDSMICommands(): violation_status = { "pviol": "N/A", "tviol": "N/A", + "tviol_active": "N/A", "phot_tviol": "N/A", "vr_tviol": "N/A", "hbm_tviol": "N/A", @@ -5278,26 +5279,31 @@ class AMDSMICommands(): violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu) violation_status['pviol'] = violations['per_ppt_pwr'] violation_status['tviol'] = violations['per_socket_thrm'] + violation_status['tviol_active'] = violations['active_socket_thrm'] violation_status['phot_tviol'] = violations['per_prochot_thrm'] violation_status['vr_tviol'] = violations['per_vr_thrm'] violation_status['hbm_tviol'] = 
violations['per_hbm_thrm'] except amdsmi_exception.AmdSmiLibraryException as e: monitor_values['pviol'] = violation_status['pviol'] monitor_values['tviol'] = violation_status['tviol'] + monitor_values['tviol_active'] = violation_status['tviol_active'] monitor_values['phot_tviol'] = violation_status['phot_tviol'] monitor_values['vr_tviol'] = violation_status['vr_tviol'] monitor_values['hbm_tviol'] = violation_status['hbm_tviol'] logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info()) violation_status_unit = "%" - kTVIOL_MAX_WIDTH = 10 - kPVIOL_MAX_WIDTH = 10 + kTVIOL_MAX_WIDTH = 7 + kTVIOL_ACTIVE_MAX_WIDTH = 14 + kPVIOL_MAX_WIDTH = 7 kPHOT_MAX_WIDTH = 12 kVR_MAX_WIDTH = 10 kHBM_MAX_WIDTH = 11 for key, value in violation_status.items(): - monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) - + if key == "tviol_active": + monitor_values[key] = value + elif key != "tviol_active": + monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit) if self.logger.is_human_readable_format(): monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ') monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ') @@ -5306,6 +5312,7 @@ class AMDSMICommands(): monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ') self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ') self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ') + self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ') self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ') self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ') self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ') diff --git a/amdsmi_cli/amdsmi_logger.py b/amdsmi_cli/amdsmi_logger.py index 4295a7b489..38633f46ed 100644 --- a/amdsmi_cli/amdsmi_logger.py +++ 
b/amdsmi_cli/amdsmi_logger.py @@ -170,6 +170,16 @@ class AMDSMILogger(): table_values += string_value.ljust(18) elif key == "RW": table_values += string_value.ljust(57) + elif key in ('pviol', 'tviol'): + table_values += string_value.rjust(7) + elif key == "tviol_active": + table_values += string_value.rjust(14) + elif key == "phot_tviol": + table_values += string_value.rjust(12) + elif key == "vr_tviol": + table_values += string_value.rjust(10) + elif key == "hbm_tviol": + table_values += string_value.rjust(11) elif key == "process_list": #Add an additional padding between the first instance of GPU and NAME table_values += ' ' diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 69f049f0af..5c418b3e10 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -794,7 +794,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha ss << __PRETTY_FUNCTION__ << " | " << "ENTERED socket_thm_residency_acc | per_socket_thrm: " << std::dec << violation_status->per_socket_thrm - << "%; active_ppt_pwr = " << std::dec + << "%; active_socket_thrm = " << std::dec << violation_status->active_socket_thrm << "\n"; LOG_DEBUG(ss); }