Fixed gpu_metric and cache cli checks
Signed-off-by: Maisam Arif <maisarif@amd.com>
Change-Id: Ic71e2b50dfa8fc106a17079842a7564a8e24b69d
[ROCm/amdsmi commit: 59d885a9ca]
This commit is contained in:
committed by
Maisam Arif
vanhempi
fa2920f4dd
commit
d5f2a6770a
@@ -561,26 +561,14 @@ class AMDSMICommands():
|
||||
if args.cache:
|
||||
try:
|
||||
cache_info = amdsmi_interface.amdsmi_get_gpu_cache_info(args.gpu)
|
||||
logging.debug("Before dictionary modify | cache_info = " + str(cache_info))
|
||||
for key, cache_values in cache_info.items():
|
||||
cache_properties = "N/A"
|
||||
if 'cache_flags' in list(cache_info[key].keys()):
|
||||
if isinstance(cache_values['cache_flags'], list):
|
||||
cache_properties = list(cache_values['cache_flags'])
|
||||
cache_values.pop('cache_flags') # remove cache_flags from output
|
||||
cache_info[key] = { # add properties to top of key's dictionary
|
||||
'cache_properties': list(cache_properties),
|
||||
**cache_info[key] # append remaining key's dictionary
|
||||
}
|
||||
logging.debug("After dictionary modify | cache_info = " + str(cache_info))
|
||||
logging.debug(f"cache_info dictionary = {cache_info}")
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
for key, cache_values in cache_info.items():
|
||||
cache_values['cache_size'] = f"{cache_values['cache_size']} KB"
|
||||
# take cache_properties out of list -> display as string, removing brackets
|
||||
update_cache_properties = str(cache_values['cache_properties'])
|
||||
update_cache_properties = update_cache_properties.replace("[","").replace("]", "")
|
||||
cache_values['cache_properties'] = update_cache_properties
|
||||
logging.debug("After human_readable | cache_info = " + str(cache_info))
|
||||
cache_values['cache_properties'] = ", ".join(cache_values['cache_properties'])
|
||||
logging.debug(f"After human_readable | cache_info = {cache_info}")
|
||||
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
cache_info = "N/A"
|
||||
@@ -1142,9 +1130,6 @@ class AMDSMICommands():
|
||||
if args.usage:
|
||||
try:
|
||||
engine_usage = amdsmi_interface.amdsmi_get_gpu_activity(args.gpu)
|
||||
engine_usage['gfx_activity'] = engine_usage.pop('gfx_activity')
|
||||
engine_usage['umc_activity'] = engine_usage.pop('umc_activity')
|
||||
engine_usage['mm_activity'] = engine_usage.pop('mm_activity')
|
||||
|
||||
# TODO: move vcn_activity and jpeg_activity into amdsmi_get_gpu_activity
|
||||
gpu_metric_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
@@ -1152,20 +1137,17 @@ class AMDSMICommands():
|
||||
engine_usage['jpeg_activity'] = gpu_metric_info.pop('jpeg_activity')
|
||||
|
||||
for key, value in engine_usage.items():
|
||||
|
||||
if self.logger.is_human_readable_format():
|
||||
unit = '%'
|
||||
if isinstance(value, list):
|
||||
engine_usage[key] = [f"{v} {unit}" if str(v) != "N/A" else str(v) for v in engine_usage[key]]
|
||||
save_value = engine_usage[key]
|
||||
pretty_array = "["
|
||||
for i in range(len(save_value)):
|
||||
if (i+1 != len(save_value)):
|
||||
pretty_array += save_value[i] + ", "
|
||||
else:
|
||||
pretty_array += save_value[i] + "]"
|
||||
engine_usage[key] = pretty_array
|
||||
elif not isinstance(value, list) and engine_usage[key] != "N/A":
|
||||
for index, activity in enumerate(value):
|
||||
if activity != "N/A":
|
||||
engine_usage[key][index] = f"{activity} {unit}"
|
||||
|
||||
# Convert list to a string for human readable format
|
||||
engine_usage[key] = '[' + ", ".join(engine_usage[key]) + ']'
|
||||
|
||||
elif value != "N/A":
|
||||
engine_usage[key] = f"{value} {unit}"
|
||||
|
||||
values_dict['usage'] = engine_usage
|
||||
@@ -1196,7 +1178,8 @@ class AMDSMICommands():
|
||||
power_dict['current_power'] = power_info['current_socket_power']
|
||||
|
||||
if power_dict['current_power'] == "N/A":
|
||||
power_dict['average_power'] = power_info['average_socket_power']
|
||||
# For older GPUs, when current power doesn't populate, we use the average socket power instead
|
||||
power_dict['current_power'] = power_info['average_socket_power']
|
||||
|
||||
power_dict['current_gfx_voltage'] = power_info['gfx_voltage']
|
||||
power_dict['current_soc_voltage'] = power_info['soc_voltage']
|
||||
@@ -2654,7 +2637,7 @@ class AMDSMICommands():
|
||||
if not any([args.access, args.weight, args.hops, args.link_type, args.numa_bw]):
|
||||
args.access = args.weight = args.hops = args.link_type= args.numa_bw = True
|
||||
|
||||
# Clear the table header; TODO make this a function
|
||||
# Clear the table header
|
||||
self.logger.table_header = ''.rjust(12)
|
||||
|
||||
# Populate the possible gpus
|
||||
@@ -3351,7 +3334,7 @@ class AMDSMICommands():
|
||||
# Get gpu_id for logging
|
||||
gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu)
|
||||
|
||||
# Clear the table header; TODO make this a function
|
||||
# Clear the table header
|
||||
self.logger.table_header = ''
|
||||
|
||||
# Store timestamp for watch output
|
||||
@@ -3365,12 +3348,14 @@ class AMDSMICommands():
|
||||
try:
|
||||
gpu_metrics_info = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)
|
||||
|
||||
monitor_values['power_usage'] = gpu_metrics_info['current_socket_power']
|
||||
if monitor_values['power_usage'] == "N/A": # Fallback to average_socket_power for older gpu_metrics versions
|
||||
if gpu_metrics_info['current_socket_power'] != "N/A":
|
||||
monitor_values['power_usage'] = gpu_metrics_info['current_socket_power']
|
||||
else: # Fallback to average_socket_power for older gpu_metrics versions
|
||||
monitor_values['power_usage'] = gpu_metrics_info['average_socket_power']
|
||||
|
||||
if self.logger.is_human_readable_format() and monitor_values['power_usage'] != "N/A":
|
||||
monitor_values['power_usage'] = f"{monitor_values['power_usage']} W"
|
||||
unit = 'W'
|
||||
monitor_values['power_usage'] = f"{monitor_values['power_usage']} {unit}"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['power_usage'] = "N/A"
|
||||
logging.debug("Failed to get power usage on gpu %s | %s", gpu_id, e.get_error_info())
|
||||
@@ -3403,7 +3388,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
gfx_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_gfx_activity']
|
||||
monitor_values['gfx'] = gfx_util
|
||||
if self.logger.is_human_readable_format():
|
||||
if self.logger.is_human_readable_format() and gfx_util != "N/A":
|
||||
monitor_values['gfx'] = f"{monitor_values['gfx']} %"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['gfx'] = "N/A"
|
||||
@@ -3414,7 +3399,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
gfx_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_gfxclk']
|
||||
monitor_values['gfx_clock'] = gfx_clock
|
||||
if self.logger.is_human_readable_format():
|
||||
if self.logger.is_human_readable_format() and gfx_clock != "N/A":
|
||||
monitor_values['gfx_clock'] = f"{monitor_values['gfx_clock']} MHz"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['gfx_clock'] = "N/A"
|
||||
@@ -3425,7 +3410,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
mem_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['average_umc_activity']
|
||||
monitor_values['mem'] = mem_util
|
||||
if self.logger.is_human_readable_format():
|
||||
if self.logger.is_human_readable_format() and mem_util != "N/A":
|
||||
monitor_values['mem'] = f"{monitor_values['mem']} %"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['mem'] = "N/A"
|
||||
@@ -3436,7 +3421,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
mem_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_uclk']
|
||||
monitor_values['mem_clock'] = mem_clock
|
||||
if self.logger.is_human_readable_format():
|
||||
if self.logger.is_human_readable_format() and mem_clock != "N/A":
|
||||
monitor_values['mem_clock'] = f"{monitor_values['mem_clock']} MHz"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['mem_clock'] = "N/A"
|
||||
@@ -3449,13 +3434,15 @@ class AMDSMICommands():
|
||||
encoder_util = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['vcn_activity']
|
||||
encoding_activity_avg = []
|
||||
for value in encoder_util:
|
||||
if value < 150: # each encoder chiplet's value range should be a percent
|
||||
if isinstance(value, int):
|
||||
encoding_activity_avg.append(value)
|
||||
|
||||
# Averaging the possible encoding activity values
|
||||
if encoding_activity_avg:
|
||||
encoding_activity_avg = sum(encoding_activity_avg) / len(encoding_activity_avg)
|
||||
else:
|
||||
encoding_activity_avg = "N/A"
|
||||
|
||||
monitor_values['encoder'] = encoding_activity_avg
|
||||
if self.logger.is_human_readable_format() and monitor_values['encoder'] != "N/A":
|
||||
monitor_values['encoder'] = f"{monitor_values['encoder']} %"
|
||||
@@ -3468,7 +3455,7 @@ class AMDSMICommands():
|
||||
try:
|
||||
encoder_clock = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['current_vclk0']
|
||||
monitor_values['encoder_clock'] = encoder_clock
|
||||
if self.logger.is_human_readable_format():
|
||||
if self.logger.is_human_readable_format() and encoder_clock != "N/A":
|
||||
monitor_values['encoder_clock'] = f"{monitor_values['encoder_clock']} MHz"
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['encoder_clock'] = "N/A"
|
||||
@@ -3500,10 +3487,11 @@ class AMDSMICommands():
|
||||
if args.throttle_status:
|
||||
try:
|
||||
throttle_status = amdsmi_interface.amdsmi_get_gpu_metrics_info(args.gpu)['throttle_status']
|
||||
if throttle_status:
|
||||
throttle_status = "THROTTLED"
|
||||
else:
|
||||
throttle_status = "UNTHROTTLED"
|
||||
if throttle_status != "N/A":
|
||||
if throttle_status:
|
||||
throttle_status = "THROTTLED"
|
||||
else:
|
||||
throttle_status = "UNTHROTTLED"
|
||||
monitor_values['throttle_status'] = throttle_status
|
||||
except amdsmi_exception.AmdSmiLibraryException as e:
|
||||
monitor_values['throttle_status'] = "N/A"
|
||||
|
||||
@@ -660,9 +660,9 @@ typedef struct {
|
||||
typedef uint32_t amdsmi_process_handle_t;
|
||||
|
||||
typedef struct {
|
||||
char name[AMDSMI_NORMAL_STRING_LENGTH];
|
||||
char name[AMDSMI_NORMAL_STRING_LENGTH];
|
||||
amdsmi_process_handle_t pid;
|
||||
uint64_t mem; /** in bytes */
|
||||
uint64_t mem; /** in bytes */
|
||||
struct engine_usage_ {
|
||||
uint64_t gfx;
|
||||
uint64_t enc;
|
||||
|
||||
@@ -214,56 +214,6 @@ from .amdsmi_interface import amdsmi_set_gpu_memory_partition
|
||||
from .amdsmi_interface import amdsmi_reset_gpu_memory_partition
|
||||
|
||||
# # Individual GPU Metrics Functions
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hotspot
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_mem
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrsoc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_socket_power
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_gfx_activity
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_umc_activity
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_energy_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_system_clock_counter
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_firmware_timestamp
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_throttle_status
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_link_width
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_link_speed
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_link_width
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_link_speed
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_gfxclk_lock_status
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_gfx_activity_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_mem_activity_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_bandwidth_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_bandwidth_inst
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_l0_recov_count_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_replay_count_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_pcie_replay_rover_count_acc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_uclk
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_hbm
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_vcn_activity
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_read_data
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_xgmi_write_data
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_gfxclk
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_socclk
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_vclk0
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_dclk0
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_edge
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrgfx
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_temp_vrmem
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_mm_activity
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_vclk1
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_dclk1
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_indep_throttle_status
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_socket_power
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_curr_fan_speed
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_gfx_clock_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_soc_clock_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_uclock_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_vclock0_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_dclock0_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_vclock1_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_avg_dclock1_frequency
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_soc
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_gfx
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_volt_mem
|
||||
from .amdsmi_interface import amdsmi_get_gpu_metrics_header_info
|
||||
|
||||
# # Enums
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1114,45 +1114,8 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(
|
||||
amdsmi_gpu_metrics_t *pgpu_metrics) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
// nullptr api supported
|
||||
amdsmi_status_t ret =
|
||||
rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
|
||||
return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle,
|
||||
reinterpret_cast<rsmi_gpu_metrics_t*>(pgpu_metrics));
|
||||
if (ret != AMDSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
// WARNING: TEMPORARY - awaiting 1.5 update from amdgpu driver/firmware
|
||||
// intended to be removed later
|
||||
// START: REMOVE WHATS BELOW ME
|
||||
uint8_t content_ver = pgpu_metrics->common_header.content_revision;
|
||||
int8_t format_ver = pgpu_metrics->common_header.format_revision;
|
||||
const uint8_t expected_format_ver = 1;
|
||||
const uint8_t expected_content_ver = 4;
|
||||
if (ret == AMDSMI_STATUS_SUCCESS &&
|
||||
(format_ver == expected_format_ver &&
|
||||
content_ver <= expected_content_ver)) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | SET JPEG_ACTIVITY to MAX_UINT16, "
|
||||
<< "detected content version: " << std::dec << +content_ver
|
||||
<< "; format version: " << std::dec << +format_ver
|
||||
<< "; awaiting 1.5 metrics remove once released";
|
||||
LOG_ALWAYS(ss);
|
||||
std::fill_n(&pgpu_metrics->jpeg_activity[0],
|
||||
(sizeof(pgpu_metrics->jpeg_activity) /
|
||||
sizeof(pgpu_metrics->jpeg_activity[0])),
|
||||
std::numeric_limits<uint16_t>::max());
|
||||
pgpu_metrics->pcie_nak_sent_count_acc =
|
||||
static_cast<uint32_t>(std::numeric_limits<uint32_t>::max());
|
||||
pgpu_metrics->pcie_nak_rcvd_count_acc =
|
||||
static_cast<uint32_t>(std::numeric_limits<uint32_t>::max());
|
||||
}
|
||||
std::ostringstream ss;
|
||||
const char *status_string;
|
||||
amdsmi_status_code_to_string(ret, &status_string);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | END, returning status = " << status_string;
|
||||
LOG_TRACE(ss);
|
||||
// END: REMOVE WHATS ABOVE ME
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Viittaa uudessa ongelmassa
Block a user