[SWDEV-463406] Update API with fields for gfx_clock_below_host_limit and low_utilization violations
Updated API with fields for gfx_clock_below_host_limit and low_utilization violations
Change-Id: I25647bae6e7b785f44dab024272767658688bcad
---------
Signed-off-by: Scaffidi, Salvatore <Salvatore.Scaffidi@amd.com>
Signed-off-by: Arif, Maisam <Maisam.Arif@amd.com>
Co-authored-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 3793be7735]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
c563c9c8d5
Коммит
6eeb06927d
@@ -135,46 +135,124 @@ GPU: 0
|
||||
|
||||
### Changed
|
||||
|
||||
|
||||
- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**.
|
||||
- Users can only use one set option at a time now.
|
||||
- **All `amd-smi set` and `amd-smi reset` options are now mutually exclusive**.
|
||||
- Users can only use one set option at a time now.
|
||||
|
||||
- **Python API for `amdsmi_get_energy_count()` will change the name for the `power` field to `energy_accumulator`**.
|
||||
|
||||
- **Added violation status output for Graphics Clock Below Host Limit to our API and CLI: `amdsmi_get_violation_status()`, `amd-smi metric --throttle`, and `amd-smi monitor --violation`.**
|
||||
***Only available for MI300+ ASICs.***
|
||||
Users can retrieve violation statuses through either our Python or C++ APIs.
|
||||
Additionally, we have added the capability to view these outputs conveniently through `amd-smi metric --throttle` and `amd-smi monitor --violation`.
|
||||
Example outputs are listed below (for reference only; output is subject to change):
|
||||
|
||||
```shell
|
||||
$ amd-smi monitor --violation
|
||||
GPU PVIOL TVIOL TVIOL_ACTIVE PHOT_TVIOL VR_TVIOL HBM_TVIOL GFX_CLKVIOL
|
||||
0 0 % 0 % False 0 % 0 % 0 % 0 %
|
||||
1 0 % 0 % False 0 % 0 % 0 % 0 %
|
||||
...
|
||||
```
|
||||
|
||||
```shell
|
||||
$ amd-smi metric --throttle
|
||||
GPU: 0
|
||||
THROTTLE:
|
||||
ACCUMULATION_COUNTER: 11240028
|
||||
PROCHOT_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 0
|
||||
SOCKET_THERMAL_ACCUMULATED: 0
|
||||
VR_THERMAL_ACCUMULATED: 0
|
||||
HBM_THERMAL_ACCUMULATED: 0
|
||||
GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: N/A
|
||||
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
|
||||
PPT_VIOLATION_STATUS: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: N/A
|
||||
PROCHOT_VIOLATION_ACTIVITY: 0 %
|
||||
PPT_VIOLATION_ACTIVITY: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 %
|
||||
|
||||
GPU: 1
|
||||
THROTTLE:
|
||||
ACCUMULATION_COUNTER: 11238232
|
||||
PROCHOT_ACCUMULATED: 0
|
||||
PPT_ACCUMULATED: 0
|
||||
SOCKET_THERMAL_ACCUMULATED: 0
|
||||
VR_THERMAL_ACCUMULATED: 0
|
||||
HBM_THERMAL_ACCUMULATED: 0
|
||||
GFX_CLK_BELOW_HOST_LIMIT_ACCUMULATED: 0
|
||||
PROCHOT_VIOLATION_STATUS: NOT ACTIVE
|
||||
PPT_VIOLATION_STATUS: NOT ACTIVE
|
||||
SOCKET_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
VR_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
HBM_THERMAL_VIOLATION_STATUS: NOT ACTIVE
|
||||
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_STATUS: NOT ACTIVE
|
||||
PROCHOT_VIOLATION_ACTIVITY: 0 %
|
||||
PPT_VIOLATION_ACTIVITY: 0 %
|
||||
SOCKET_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
VR_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
HBM_THERMAL_VIOLATION_ACTIVITY: 0 %
|
||||
GFX_CLK_BELOW_HOST_LIMIT_VIOLATION_ACTIVITY: 0 %
|
||||
...
|
||||
```
|
||||
|
||||
- **Updated API `amdsmi_get_violation_status()`, its structure `amdsmi_violation_status_t`, and the CLI to include GFX clock below host limit**
|
||||
Updated structure `amdsmi_violation_status_t`:
|
||||
|
||||
```C
|
||||
typedef struct {
|
||||
...
|
||||
uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported
|
||||
...
|
||||
uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
...
|
||||
uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
...
|
||||
} amdsmi_violation_status_t;
|
||||
```
|
||||
|
||||
- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`**
|
||||
Updated structure `amdsmi_vram_info_t`:
|
||||
```C
|
||||
typedef struct {
|
||||
amdsmi_vram_type_t vram_type;
|
||||
amdsmi_vram_vendor_type_t vram_vendor;
|
||||
uint64_t vram_size;
|
||||
uint32_t vram_bit_width;
|
||||
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
|
||||
uint64_t reserved[4];
|
||||
} amdsmi_vram_info_t;
|
||||
Updated structure `amdsmi_vram_info_t`:
|
||||
|
||||
amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
|
||||
```
|
||||
Example CLI output:
|
||||
```shell
|
||||
$ amd-smi static --vram
|
||||
GPU: 0
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 16368 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
GPU: 1
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 30704 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
...
|
||||
```C
|
||||
typedef struct {
|
||||
amdsmi_vram_type_t vram_type;
|
||||
amdsmi_vram_vendor_type_t vram_vendor;
|
||||
uint64_t vram_size;
|
||||
uint32_t vram_bit_width;
|
||||
uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s)
|
||||
uint64_t reserved[4];
|
||||
} amdsmi_vram_info_t;
|
||||
|
||||
```
|
||||
amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info)
|
||||
```
|
||||
|
||||
Example CLI output:
|
||||
|
||||
```shell
|
||||
$ amd-smi static --vram
|
||||
GPU: 0
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 16368 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
GPU: 1
|
||||
VRAM:
|
||||
TYPE: GDDR6
|
||||
VENDOR: N/A
|
||||
SIZE: 30704 MB
|
||||
BIT_WIDTH: 256
|
||||
MAX_BANDWIDTH: 1555 GB/s
|
||||
...
|
||||
```
|
||||
|
||||
### Removed
|
||||
|
||||
|
||||
@@ -2277,7 +2277,7 @@ class AMDSMICommands():
|
||||
'socket_thermal_accumulated': "N/A",
|
||||
'vr_thermal_accumulated': "N/A",
|
||||
'hbm_thermal_accumulated': "N/A",
|
||||
'gfx_below_host_limit_acc': "N/A",
|
||||
'gfx_clk_below_host_limit_accumulated': "N/A",
|
||||
|
||||
# violation status values - active/not active
|
||||
'prochot_violation_status': "N/A",
|
||||
@@ -2285,13 +2285,15 @@ class AMDSMICommands():
|
||||
'socket_thermal_violation_status': "N/A",
|
||||
'vr_thermal_violation_status': "N/A",
|
||||
'hbm_thermal_violation_status': "N/A",
|
||||
'gfx_clk_below_host_limit_violation_status': "N/A",
|
||||
|
||||
# violation activity values - percent
|
||||
'prochot_violation_activity': "N/A",
|
||||
'ppt_violation_activity': "N/A",
|
||||
'socket_thermal_violation_activity': "N/A",
|
||||
'vr_thermal_violation_activity': "N/A",
|
||||
'hbm_thermal_violation_activity': "N/A"
|
||||
'hbm_thermal_violation_activity': "N/A",
|
||||
'gfx_clk_below_host_limit_violation_activity': "N/A",
|
||||
}
|
||||
|
||||
try:
|
||||
@@ -2302,18 +2304,21 @@ class AMDSMICommands():
|
||||
throttle_status['socket_thermal_accumulated'] = violation_status['acc_socket_thrm']
|
||||
throttle_status['vr_thermal_accumulated'] = violation_status['acc_vr_thrm']
|
||||
throttle_status['hbm_thermal_accumulated'] = violation_status['acc_hbm_thrm']
|
||||
throttle_status['gfx_clk_below_host_limit_accumulated'] = violation_status['acc_gfx_clk_below_host_limit']
|
||||
|
||||
throttle_status['prochot_violation_status'] = violation_status['active_prochot_thrm']
|
||||
throttle_status['ppt_violation_status'] = violation_status['active_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_status'] = violation_status['active_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_status'] = violation_status['active_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_status'] = violation_status['active_hbm_thrm']
|
||||
throttle_status['gfx_clk_below_host_limit_violation_status'] = violation_status['active_gfx_clk_below_host_limit']
|
||||
|
||||
throttle_status['prochot_violation_activity'] = violation_status['per_prochot_thrm']
|
||||
throttle_status['ppt_violation_activity'] = violation_status['per_ppt_pwr']
|
||||
throttle_status['socket_thermal_violation_activity'] = violation_status['per_socket_thrm']
|
||||
throttle_status['vr_thermal_violation_activity'] = violation_status['per_vr_thrm']
|
||||
throttle_status['hbm_thermal_violation_activity'] = violation_status['per_hbm_thrm']
|
||||
throttle_status['gfx_clk_below_host_limit_violation_activity'] = violation_status['per_gfx_clk_below_host_limit']
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
values_dict['throttle'] = throttle_status
|
||||
@@ -5274,6 +5279,7 @@ class AMDSMICommands():
|
||||
"phot_tviol": "N/A",
|
||||
"vr_tviol": "N/A",
|
||||
"hbm_tviol": "N/A",
|
||||
"gfx_clkviol": "N/A",
|
||||
}
|
||||
try:
|
||||
violations = amdsmi_interface.amdsmi_get_violation_status(args.gpu)
|
||||
@@ -5283,6 +5289,7 @@ class AMDSMICommands():
|
||||
violation_status['phot_tviol'] = violations['per_prochot_thrm']
|
||||
violation_status['vr_tviol'] = violations['per_vr_thrm']
|
||||
violation_status['hbm_tviol'] = violations['per_hbm_thrm']
|
||||
violation_status['gfx_clkviol'] = violations['per_gfx_clk_below_host_limit']
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['pviol'] = violation_status['pviol']
|
||||
monitor_values['tviol'] = violation_status['tviol']
|
||||
@@ -5290,32 +5297,40 @@ class AMDSMICommands():
|
||||
monitor_values['phot_tviol'] = violation_status['phot_tviol']
|
||||
monitor_values['vr_tviol'] = violation_status['vr_tviol']
|
||||
monitor_values['hbm_tviol'] = violation_status['hbm_tviol']
|
||||
monitor_values['gfx_clkviol'] = violation_status['gfx_clkviol']
|
||||
logging.debug("Failed to get violation status on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
violation_status_unit = "%"
|
||||
kPVIOL_MAX_WIDTH = 7
|
||||
kTVIOL_MAX_WIDTH = 7
|
||||
kTVIOL_ACTIVE_MAX_WIDTH = 14
|
||||
kPVIOL_MAX_WIDTH = 7
|
||||
kPHOT_MAX_WIDTH = 12
|
||||
kVR_MAX_WIDTH = 10
|
||||
kHBM_MAX_WIDTH = 11
|
||||
kGFXC_MAX_WIDTH = 13
|
||||
|
||||
for key, value in violation_status.items():
|
||||
if key == "tviol_active":
|
||||
monitor_values[key] = value
|
||||
elif key != "tviol_active":
|
||||
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
|
||||
if value != "N/A":
|
||||
if key == "tviol_active":
|
||||
monitor_values[key] = value
|
||||
else:
|
||||
monitor_values[key] = self.helpers.unit_format(self.logger, violation_status[key], violation_status_unit)
|
||||
else:
|
||||
monitor_values[key] = violation_status[key]
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
monitor_values['pviol'] = monitor_values['pviol'].rjust(kPVIOL_MAX_WIDTH, ' ')
|
||||
monitor_values['tviol'] = monitor_values['tviol'].rjust(kTVIOL_MAX_WIDTH, ' ')
|
||||
monitor_values['phot_tviol'] = monitor_values['phot_tviol'].rjust(kPHOT_MAX_WIDTH, ' ')
|
||||
monitor_values['vr_tviol'] = monitor_values['vr_tviol'].rjust(kVR_MAX_WIDTH, ' ')
|
||||
monitor_values['hbm_tviol'] = monitor_values['hbm_tviol'].rjust(kHBM_MAX_WIDTH, ' ')
|
||||
monitor_values['gfx_clkviol'] = monitor_values['gfx_clkviol'].rjust(kGFXC_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'PVIOL'.rjust(kPVIOL_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'TVIOL'.rjust(kTVIOL_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'TVIOL_ACTIVE'.rjust(kTVIOL_ACTIVE_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'PHOT_TVIOL'.rjust(kPHOT_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'VR_TVIOL'.rjust(kVR_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'HBM_TVIOL'.rjust(kHBM_MAX_WIDTH, ' ')
|
||||
self.logger.table_header += 'GFX_CLKVIOL'.rjust(kGFXC_MAX_WIDTH, ' ')
|
||||
|
||||
self.logger.store_output(args.gpu, 'values', monitor_values)
|
||||
|
||||
|
||||
@@ -180,6 +180,8 @@ class AMDSMILogger():
|
||||
table_values += string_value.rjust(10)
|
||||
elif key == "hbm_tviol":
|
||||
table_values += string_value.rjust(11)
|
||||
elif key == "gfx_clkviol":
|
||||
table_values += string_value.rjust(13)
|
||||
elif key == "process_list":
|
||||
#Add an additional padding between the first instance of GPU and NAME
|
||||
table_values += ' '
|
||||
|
||||
@@ -526,18 +526,22 @@ typedef struct {
|
||||
uint64_t acc_socket_thrm; //!< TVIOL; Current accumulated Socket thermal count; Max uint64 means unsupported
|
||||
uint64_t acc_vr_thrm; //!< Current accumulated voltage regulator count; Max uint64 means unsupported
|
||||
uint64_t acc_hbm_thrm; //!< Current accumulated High Bandwidth Memory (HBM) thermal count; Max uint64 means unsupported
|
||||
uint64_t acc_gfx_clk_below_host_limit; //!< Current graphic clock below host limit count; Max uint64 means unsupported
|
||||
uint64_t per_prochot_thrm; //!< Processor hot violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_ppt_pwr; //!< PVIOL; Package Power Tracking (PPT) violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_socket_thrm; //!< TVIOL; Socket thermal violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_vr_thrm; //!< Voltage regulator violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint64_t per_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation % (greater than 0% is a violation); Max uint64 means unsupported
|
||||
uint8_t active_prochot_thrm; //!< Processor hot violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_ppt_pwr; //!< Package Power Tracking (PPT) violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_socket_thrm; //!< Socket thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_vr_thrm; //!< Voltage regulator violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint8_t active_hbm_thrm; //!< High Bandwidth Memory (HBM) thermal violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint64_t reserved[30]; // Reserved for new violation info
|
||||
uint8_t active_gfx_clk_below_host_limit; //!< Graphics clock below host limit violation; 1 = active 0 = not active; Max uint8 means unsupported
|
||||
uint64_t reserved[3]; // Reserved for new violation info
|
||||
} amdsmi_violation_status_t;
|
||||
|
||||
typedef struct {
|
||||
amdsmi_range_t supported_freq_range;
|
||||
amdsmi_range_t current_freq_range;
|
||||
|
||||
@@ -28,6 +28,7 @@ from .amdsmi_exception import *
|
||||
import sys
|
||||
import math
|
||||
from time import localtime, asctime, time
|
||||
import json
|
||||
|
||||
MAX_NUM_PROCESSES = 1024
|
||||
|
||||
@@ -1559,7 +1560,9 @@ def amdsmi_get_hsmp_metrics_table(
|
||||
"mtbl_ppt_residency_acc": mtbl.ppt_residency_acc,
|
||||
"mtbl_socket_thm_residency_acc": mtbl.socket_thm_residency_acc,
|
||||
"mtbl_vr_thm_residency_acc": mtbl.vr_thm_residency_acc,
|
||||
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc
|
||||
"mtbl_hbm_thm_residency_acc": mtbl.hbm_thm_residency_acc,
|
||||
"mtbl_gfx_clk_below_host_residency_acc": mtbl.gfx_clk_below_host_residency_acc,
|
||||
"mtbl_low_utilization_residency_acc": mtbl.low_utilization_residency_acc
|
||||
}
|
||||
|
||||
def amdsmi_first_online_core_on_cpu_socket(
|
||||
@@ -2035,7 +2038,7 @@ def amdsmi_get_violation_status(
|
||||
processor_handle, ctypes.byref(violation_status))
|
||||
)
|
||||
|
||||
return {
|
||||
dict_return = {
|
||||
"reference_timestamp": _validate_if_max_uint(violation_status.reference_timestamp, MaxUIntegerTypes.UINT64_T),
|
||||
"violation_timestamp": _validate_if_max_uint(violation_status.violation_timestamp, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_counter": _validate_if_max_uint(violation_status.acc_counter, MaxUIntegerTypes.UINT64_T),
|
||||
@@ -2044,17 +2047,21 @@ def amdsmi_get_violation_status(
|
||||
"acc_socket_thrm": _validate_if_max_uint(violation_status.acc_socket_thrm, MaxUIntegerTypes.UINT64_T), #TVIOL
|
||||
"acc_vr_thrm": _validate_if_max_uint(violation_status.acc_vr_thrm, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_hbm_thrm": _validate_if_max_uint(violation_status.acc_hbm_thrm, MaxUIntegerTypes.UINT64_T),
|
||||
"acc_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.acc_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T),
|
||||
"per_prochot_thrm": _validate_if_max_uint(violation_status.per_prochot_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
|
||||
"per_ppt_pwr": _validate_if_max_uint(violation_status.per_ppt_pwr, MaxUIntegerTypes.UINT64_T, isActivity=True), #PVIOL
|
||||
"per_socket_thrm": _validate_if_max_uint(violation_status.per_socket_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True), #TVIOL
|
||||
"per_vr_thrm": _validate_if_max_uint(violation_status.per_vr_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
|
||||
"per_hbm_thrm": _validate_if_max_uint(violation_status.per_hbm_thrm, MaxUIntegerTypes.UINT64_T, isActivity=True),
|
||||
"per_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.per_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT64_T, isActivity=True),
|
||||
"active_prochot_thrm": _validate_if_max_uint(violation_status.active_prochot_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
"active_ppt_pwr": _validate_if_max_uint(violation_status.active_ppt_pwr, MaxUIntegerTypes.UINT8_T, isBool=True), #PVIOL
|
||||
"active_socket_thrm": _validate_if_max_uint(violation_status.active_socket_thrm, MaxUIntegerTypes.UINT8_T, isBool=True), #TVIOL
|
||||
"active_vr_thrm": _validate_if_max_uint(violation_status.active_vr_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
"active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True)
|
||||
"active_hbm_thrm": _validate_if_max_uint(violation_status.active_hbm_thrm, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
"active_gfx_clk_below_host_limit": _validate_if_max_uint(violation_status.active_gfx_clk_below_host_limit, MaxUIntegerTypes.UINT8_T, isBool=True),
|
||||
}
|
||||
return dict_return
|
||||
|
||||
def amdsmi_get_gpu_total_ecc_count(
|
||||
processor_handle: amdsmi_wrapper.amdsmi_processor_handle,
|
||||
|
||||
@@ -727,18 +727,21 @@ struct_amdsmi_violation_status_t._fields_ = [
|
||||
('acc_socket_thrm', ctypes.c_uint64),
|
||||
('acc_vr_thrm', ctypes.c_uint64),
|
||||
('acc_hbm_thrm', ctypes.c_uint64),
|
||||
('acc_gfx_clk_below_host_limit', ctypes.c_uint64),
|
||||
('per_prochot_thrm', ctypes.c_uint64),
|
||||
('per_ppt_pwr', ctypes.c_uint64),
|
||||
('per_socket_thrm', ctypes.c_uint64),
|
||||
('per_vr_thrm', ctypes.c_uint64),
|
||||
('per_hbm_thrm', ctypes.c_uint64),
|
||||
('per_gfx_clk_below_host_limit', ctypes.c_uint64),
|
||||
('active_prochot_thrm', ctypes.c_ubyte),
|
||||
('active_ppt_pwr', ctypes.c_ubyte),
|
||||
('active_socket_thrm', ctypes.c_ubyte),
|
||||
('active_vr_thrm', ctypes.c_ubyte),
|
||||
('active_hbm_thrm', ctypes.c_ubyte),
|
||||
('PADDING_0', ctypes.c_ubyte * 3),
|
||||
('reserved', ctypes.c_uint64 * 30),
|
||||
('active_gfx_clk_below_host_limit', ctypes.c_ubyte),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('reserved', ctypes.c_uint64 * 3),
|
||||
]
|
||||
|
||||
amdsmi_violation_status_t = struct_amdsmi_violation_status_t
|
||||
@@ -791,6 +794,19 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum
|
||||
class struct_amdsmi_pcie_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_width', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
class struct_pcie_metric_(Structure):
|
||||
pass
|
||||
|
||||
@@ -811,19 +827,6 @@ struct_pcie_metric_._fields_ = [
|
||||
('reserved', ctypes.c_uint64 * 12),
|
||||
]
|
||||
|
||||
class struct_pcie_static_(Structure):
|
||||
pass
|
||||
|
||||
struct_pcie_static_._pack_ = 1 # source:False
|
||||
struct_pcie_static_._fields_ = [
|
||||
('max_pcie_width', ctypes.c_uint16),
|
||||
('PADDING_0', ctypes.c_ubyte * 2),
|
||||
('max_pcie_speed', ctypes.c_uint32),
|
||||
('pcie_interface_version', ctypes.c_uint32),
|
||||
('slot_type', amdsmi_card_form_factor_t),
|
||||
('reserved', ctypes.c_uint64 * 10),
|
||||
]
|
||||
|
||||
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_pcie_info_t._fields_ = [
|
||||
('pcie_static', struct_pcie_static_),
|
||||
@@ -1120,16 +1123,6 @@ amdsmi_process_handle_t = ctypes.c_uint32
|
||||
class struct_amdsmi_proc_info_t(Structure):
|
||||
pass
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
class struct_memory_usage_(Structure):
|
||||
pass
|
||||
|
||||
@@ -1141,6 +1134,16 @@ struct_memory_usage_._fields_ = [
|
||||
('reserved', ctypes.c_uint32 * 10),
|
||||
]
|
||||
|
||||
class struct_engine_usage_(Structure):
|
||||
pass
|
||||
|
||||
struct_engine_usage_._pack_ = 1 # source:False
|
||||
struct_engine_usage_._fields_ = [
|
||||
('gfx', ctypes.c_uint64),
|
||||
('enc', ctypes.c_uint64),
|
||||
('reserved', ctypes.c_uint32 * 12),
|
||||
]
|
||||
|
||||
struct_amdsmi_proc_info_t._pack_ = 1 # source:False
|
||||
struct_amdsmi_proc_info_t._fields_ = [
|
||||
('name', ctypes.c_char * 256),
|
||||
|
||||
@@ -629,18 +629,21 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
violation_status->acc_socket_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_vr_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_hbm_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->acc_gfx_clk_below_host_limit = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
violation_status->per_prochot_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_ppt_pwr = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_socket_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_vr_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_hbm_thrm = std::numeric_limits<uint64_t>::max();
|
||||
violation_status->per_gfx_clk_below_host_limit = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
violation_status->active_prochot_thrm = std::numeric_limits<uint8_t>::max();
|
||||
violation_status->active_ppt_pwr = std::numeric_limits<uint8_t>::max();
|
||||
violation_status->active_socket_thrm = std::numeric_limits<uint8_t>::max();
|
||||
violation_status->active_vr_thrm = std::numeric_limits<uint8_t>::max();
|
||||
violation_status->active_hbm_thrm = std::numeric_limits<uint8_t>::max();
|
||||
violation_status->active_gfx_clk_below_host_limit = std::numeric_limits<uint8_t>::max();
|
||||
|
||||
const auto p1 = std::chrono::system_clock::now();
|
||||
auto current_time = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||
@@ -664,8 +667,18 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
return r;
|
||||
}
|
||||
|
||||
// default to 0xffffffff as not supported
|
||||
uint32_t partitition_id = std::numeric_limits<uint32_t>::max();
|
||||
auto tmp_partition_id = uint32_t(0);
|
||||
amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &(tmp_partition_id));
|
||||
// Do not return early if this value fails
|
||||
// continue to try getting all info
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
partitition_id = tmp_partition_id;
|
||||
}
|
||||
|
||||
amdsmi_gpu_metrics_t metric_info_a = {};
|
||||
amdsmi_status_t status = amdsmi_get_gpu_metrics_info(
|
||||
status = amdsmi_get_gpu_metrics_info(
|
||||
processor_handle, &metric_info_a);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
std::ostringstream ss;
|
||||
@@ -680,7 +693,9 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
&& metric_info_a.ppt_residency_acc == std::numeric_limits<uint64_t>::max()
|
||||
&& metric_info_a.socket_thm_residency_acc == std::numeric_limits<uint64_t>::max()
|
||||
&& metric_info_a.vr_thm_residency_acc == std::numeric_limits<uint64_t>::max()
|
||||
&& metric_info_a.hbm_thm_residency_acc == std::numeric_limits<uint64_t>::max()) {
|
||||
&& metric_info_a.hbm_thm_residency_acc == std::numeric_limits<uint64_t>::max()
|
||||
&& (metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]
|
||||
== std::numeric_limits<uint64_t>::max())) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ASIC does not support throttle violations!, "
|
||||
<< "returning AMDSMI_STATUS_NOT_SUPPORTED";
|
||||
@@ -705,33 +720,38 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
violation_status->acc_socket_thrm = metric_info_b.socket_thm_residency_acc;
|
||||
violation_status->acc_vr_thrm = metric_info_b.vr_thm_residency_acc;
|
||||
violation_status->acc_hbm_thrm = metric_info_b.hbm_thm_residency_acc;
|
||||
violation_status->acc_gfx_clk_below_host_limit
|
||||
= metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id];
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "[gpu_metrics A] metric_info_a.accumulation_counter: " << std::dec
|
||||
<< metric_info_a.accumulation_counter
|
||||
<< metric_info_a.accumulation_counter << "\n"
|
||||
<< "; metric_info_a.prochot_residency_acc: " << std::dec
|
||||
<< metric_info_a.prochot_residency_acc
|
||||
<< metric_info_a.prochot_residency_acc << "\n"
|
||||
<< "; metric_info_a.ppt_residency_acc (pviol): " << std::dec
|
||||
<< metric_info_a.ppt_residency_acc
|
||||
<< metric_info_a.ppt_residency_acc << "\n"
|
||||
<< "; metric_info_a.socket_thm_residency_acc (tviol): " << std::dec
|
||||
<< metric_info_a.socket_thm_residency_acc
|
||||
<< metric_info_a.socket_thm_residency_acc << "\n"
|
||||
<< "; metric_info_a.vr_thm_residency_acc: " << std::dec
|
||||
<< metric_info_a.vr_thm_residency_acc
|
||||
<< metric_info_a.vr_thm_residency_acc << "\n"
|
||||
<< "; metric_info_a.hbm_thm_residency_acc: " << std::dec
|
||||
<< metric_info_a.hbm_thm_residency_acc << "\n"
|
||||
<< "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: "
|
||||
<< std::dec << metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n"
|
||||
<< " [gpu_metrics B] metric_info_b.accumulation_counter: " << std::dec
|
||||
<< metric_info_b.accumulation_counter
|
||||
<< metric_info_b.accumulation_counter << "\n"
|
||||
<< "; metric_info_b.prochot_residency_acc: " << std::dec
|
||||
<< metric_info_b.prochot_residency_acc
|
||||
<< metric_info_b.prochot_residency_acc << "\n"
|
||||
<< "; metric_info_b.ppt_residency_acc (pviol): " << std::dec
|
||||
<< metric_info_b.ppt_residency_acc
|
||||
<< metric_info_b.ppt_residency_acc << "\n"
|
||||
<< "; metric_info_b.socket_thm_residency_acc (tviol): " << std::dec
|
||||
<< metric_info_b.socket_thm_residency_acc
|
||||
<< metric_info_b.socket_thm_residency_acc << "\n"
|
||||
<< "; metric_info_b.vr_thm_residency_acc: " << std::dec
|
||||
<< metric_info_b.vr_thm_residency_acc
|
||||
<< metric_info_b.vr_thm_residency_acc << "\n"
|
||||
<< "; metric_info_b.hbm_thm_residency_acc: " << std::dec
|
||||
<< metric_info_b.hbm_thm_residency_acc
|
||||
<< "\n";
|
||||
<< metric_info_b.hbm_thm_residency_acc << "\n"
|
||||
<< "; metric_info_b.xcp_stats->gfx_below_host_limit_acc[" << partitition_id << "]: "
|
||||
<< std::dec << metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
if ( (metric_info_b.prochot_residency_acc != std::numeric_limits<uint64_t>::max()
|
||||
@@ -842,6 +862,28 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
<< violation_status->active_hbm_thrm << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
if ( (metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits<uint64_t>::max()
|
||||
|| metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id] != std::numeric_limits<uint64_t>::max())
|
||||
&& (metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] >= metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id])
|
||||
&& ((metric_info_b.accumulation_counter - metric_info_a.accumulation_counter) > 0) ) {
|
||||
violation_status->per_gfx_clk_below_host_limit =
|
||||
(((metric_info_b.xcp_stats->gfx_below_host_limit_acc[partitition_id] -
|
||||
metric_info_a.xcp_stats->gfx_below_host_limit_acc[partitition_id]) * 100) /
|
||||
(metric_info_b.accumulation_counter - metric_info_a.accumulation_counter));
|
||||
|
||||
if (violation_status->per_gfx_clk_below_host_limit > 0) {
|
||||
violation_status->active_gfx_clk_below_host_limit = 1;
|
||||
violation_status->violation_timestamp = kFASTEST_POLL_TIME_MS;
|
||||
} else {
|
||||
violation_status->active_gfx_clk_below_host_limit = 0;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "ENTERED gfx_clk_below_host_residency_acc | per_gfx_clk_below_host_limit: " << std::dec
|
||||
<< violation_status->per_gfx_clk_below_host_limit
|
||||
<< "%; active_ppt_pwr = " << std::dec
|
||||
<< violation_status->active_gfx_clk_below_host_limit << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< "RETURNING AMDSMI_STATUS_SUCCESS | "
|
||||
@@ -859,6 +901,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
<< violation_status->per_vr_thrm
|
||||
<< "; violation_status->per_hbm_thrm (%): " << std::dec
|
||||
<< violation_status->per_hbm_thrm
|
||||
<< "; violation_status->per_gfx_clk_below_host_limit (%): " << std::dec
|
||||
<< violation_status->per_gfx_clk_below_host_limit
|
||||
<< "; violation_status->active_prochot_thrm (bool): " << std::dec
|
||||
<< static_cast<int>(violation_status->active_prochot_thrm)
|
||||
<< "; violation_status->active_ppt_pwr (bool): " << std::dec
|
||||
@@ -869,6 +913,8 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha
|
||||
<< static_cast<int>(violation_status->active_vr_thrm)
|
||||
<< "; violation_status->active_hbm_thrm (bool): " << std::dec
|
||||
<< static_cast<int>(violation_status->active_hbm_thrm)
|
||||
<< "; violation_status->active_gfx_clk_below_host_limit (bool): " << std::dec
|
||||
<< static_cast<int>(violation_status->active_gfx_clk_below_host_limit)
|
||||
<< "\n";
|
||||
LOG_INFO(ss);
|
||||
|
||||
|
||||
@@ -893,6 +893,8 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
violation_status['acc_vr_thrm']))
|
||||
print(" Current HBM Thrm Accumulated (Count): {}".format(
|
||||
violation_status['acc_hbm_thrm']))
|
||||
print(" Current GFX CLK Below Host Limit Accumulated (Count): {}".format(
|
||||
violation_status['acc_gfx_clk_below_host_limit']))
|
||||
|
||||
print(" Prochot Thrm Violation (%): {}".format(
|
||||
violation_status['per_prochot_thrm']))
|
||||
@@ -904,6 +906,8 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
violation_status['per_vr_thrm']))
|
||||
print(" HBM Thrm Violation (%): {}".format(
|
||||
violation_status['per_hbm_thrm']))
|
||||
print(" GFX CLK Below Host Limit Violation (%): {}".format(
|
||||
violation_status['per_gfx_clk_below_host_limit']))
|
||||
|
||||
print(" Prochot Thrm Violation (bool): {}".format(
|
||||
violation_status['active_prochot_thrm']))
|
||||
@@ -915,6 +919,8 @@ class TestAmdSmiPythonInterface(unittest.TestCase):
|
||||
violation_status['active_vr_thrm']))
|
||||
print(" HBM Thrm Violation (bool): {}".format(
|
||||
violation_status['active_hbm_thrm']))
|
||||
print(" GFX CLK Below Host Limit Violation (bool): {}".format(
|
||||
violation_status['active_gfx_clk_below_host_limit']))
|
||||
print()
|
||||
self.tearDown()
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user