SWDEV-504130 Add temperature violation status to amd-smi monitor (#2)
Added boolean temperature violation status to amd-smi monitor. Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
This commit is contained in:
committed by
GitHub
parent
129ad8ffad
commit
d0e770ffbc
@@ -7,6 +7,26 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
|
||||
|
||||
### Added
|
||||
|
||||
- **Added TVIOL_ACTIVE to `amd-smi monitor`**.
|
||||
Added a status field to `amd-smi monitor` that reports whether a temperature violation is currently active. TVIOL_ACTIVE is displayed as follows:
|
||||
- True if active
|
||||
- False if not active
|
||||
- N/A if not supported.
|
||||
|
||||
Example CLI output:
|
||||
```shell
|
||||
$ amd-smi monitor --viol
|
||||
GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL
|
||||
0 100 % 1 % True 0 % 0 % 0 %
|
||||
1 100 % 0 % False 0 % 0 % 0 %
|
||||
2 100 % 0 % False 0 % 0 % 0 %
|
||||
3 100 % 0 % False 0 % 0 % 0 %
|
||||
4 100 % 0 % False 0 % 0 % 0 %
|
||||
5 100 % 3 % True 0 % 0 % 0 %
|
||||
6 100 % 0 % False 0 % 0 % 0 %
|
||||
7 100 % 0 % False 0 % 0 % 0 %
|
||||
```
|
||||
|
||||
- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`**
|
||||
Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth:
|
||||
- `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s)
|
||||
|
||||
@@ -5270,6 +5270,7 @@ class AMDSMICommands():
|
||||
violation_status = {
|
||||
"pviol": "N/A",
|
||||
"tviol": "N/A",
|
||||
"tviol_active": "N/A",
|
||||
"phot_tviol": "N/A",
|
||||
"vr_tviol": "N/A",
|
||||
"hbm_tviol": "N/A",
|
||||
@@ -5278,26 +5279,31 @@ class AMDSMICommands():
|
||||
violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
|
||||
violation_status['pviol'] = violations['per_ppt_pwr']
|
||||
violation_status['tviol'] = violations['per_socket_thrm']
|
||||
violation_status['tviol_active'] = violations['active_socket_thrm']
|
||||
violation_status['phot_tviol'] = violations['per_prochot_thrm']
|
||||
violation_status['vr_tviol'] = violations['per_vr_thrm']
|
||||
violation_status['hbm_tviol'] = violations['per_hbm_thrm']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['pviol'] = violation_status['pviol']
|
||||
monitor_values['tviol'] = violation_status['tviol']
|
||||
monitor_values['tviol_active'] = violation_status['tviol_active']
|
||||
monitor_values['phot_tviol'] = violation_status['phot_tviol']
|
||||
monitor_values['vr_tviol'] = violation_status['vr_tviol']
|
||||
monitor_values['hbm_tviol'] = violation_status['hbm_tviol']
|
||||
logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
violation_status_unit = "%"
|
||||
kTVIOL_MAX_WIDTH = 10
|
||||
kPVIOL_MAX_WIDTH = 10
|
||||
kTVIOL_MAX_WIDTH = 7
|
||||
kTVIOL_ACTIVE_MAX_WIDTH = 14
|
||||
kPVIOL_MAX_WIDTH = 7
|
||||
kPHOT_MAX_WIDTH = 12
|
||||
kVR_MAX_WIDTH = 10
|
||||
kHBM_MAX_WIDTH = 11
|
||||
|
||||
for key, value in violation_status.items():
|
||||
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
|
||||
|
||||
if key == "tviol_active":
|
||||
monitor_values[key] = value
|
||||
elif key != "tviol_active":
|
||||
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
|
||||
if self.logger.is_human_readable_format():
|
||||
monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ')
|
||||
monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ')
|
||||
@@ -5306,6 +5312,7 @@ class AMDSMICommands():
|
||||
monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ')
|
||||
|
||||
@@ -170,6 +170,16 @@ class AMDSMILogger():
|
||||
table_values += string_value.ljust(18)
|
||||
elif key == "RW":
|
||||
table_values += string_value.ljust(57)
|
||||
elif key in ('pviol', 'tviol'):
|
||||
table_values += string_value.rjust(7)
|
||||
elif key == "tviol_active":
|
||||
table_values += string_value.rjust(14)
|
||||
elif key == "phot_tviol":
|
||||
table_values += string_value.rjust(12)
|
||||
elif key == "vr_tviol":
|
||||
table_values += string_value.rjust(10)
|
||||
elif key == "hbm_tviol":
|
||||
table_values += string_value.rjust(11)
|
||||
elif key == "process_list":
|
||||
#Add an additional padding between the first instance of GPU and NAME
|
||||
table_values += ' '
|
||||
|
||||
@@ -794,7 +794,7 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "ENTERED socket_thm_residency_acc | per_socket_thrm: " << std::dec
|
||||
<< violation_status->per_socket_thrm
|
||||
<< "%; active_ppt_pwr = " << std::dec
|
||||
<< "%; active_socket_thrm = " << std::dec
|
||||
<< violation_status->active_socket_thrm << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user