From 2ddb2ef032625fcfc7b752675a6cb45af9b47b90 Mon Sep 17 00:00:00 2001 From: Juan Castillo Date: Thu, 7 Nov 2024 16:35:17 -0600 Subject: [PATCH] [SWDEV-496693]GPU Metrics 1.7 Features added: - [SWDEV-475244] Add new interface to get max memory bandwidth Updated API: amdsmi_get_gpu_vram_info Updated: struct amdsmi_vram_info_t to include vram_max_bandwidth CLI: amd-smi static --vram - [SWDEV-488349] Add new interface for XGMI link status New API: amdsmi_get_gpu_xgmi_link_status CLI: amd-smi xgmi --link-status Signed-off-by: Juan Castillo Change-Id: I1aa35b741136eb4f02f7ea9a95b865886273eb72 [ROCm/amdsmi commit: f8b834762783303fdd031623eb97188be1a0715e] --- projects/amdsmi/CHANGELOG.md | 75 +- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 96 ++- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 2 + projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 15 +- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 2 + .../amdsmi/example/amd_smi_drm_example.cc | 41 +- projects/amdsmi/include/amd_smi/amdsmi.h | 55 +- projects/amdsmi/py-interface/__init__.py | 1 + .../amdsmi/py-interface/amdsmi_interface.py | 48 +- .../amdsmi/py-interface/amdsmi_wrapper.py | 64 +- .../rocm_smi/include/rocm_smi/rocm_smi.h | 18 + .../include/rocm_smi/rocm_smi_gpu_metrics.h | 151 +++- .../rocm_smi/src/rocm_smi_gpu_metrics.cc | 685 +++++++++++++++++- projects/amdsmi/src/amd_smi/amd_smi.cc | 53 +- .../functional/gpu_metrics_read.cc | 20 + .../amd_smi_test/functional/id_info_read.cc | 18 +- 16 files changed, 1281 insertions(+), 63 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index d8c59176fe..0be36c9a08 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -7,6 +7,42 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ### Added +- **Added support for GPU metrics 1.7 to `amdsmi_get_gpu_metrics_info()`** +Updated `amdsmi_get_gpu_metrics_info()` and structure `amdsmi_gpu_metrics_t` to include new fields for XGMI Link 
Status, graphics clocks below host limit (per XCP), and VRAM max bandwidth: + - `uint64_t vram_max_bandwidth` - VRAM max bandwidth at max memory clock (GB/s) + - `uint16_t xgmi_link_status[MAX_NUM_XGMI_LINKS]` - XGMI link status, 1=Up 0=Down + - `uint64_t gfx_below_host_limit_acc[MAX_NUM_XCC]` - graphics clocks below host limit (per XCP) accumulators. Used for graphics clock below host limit violation status. + +- **Added new API `amdsmi_get_gpu_xgmi_link_status()` and CLI `amd-smi xgmi --link-status`** +New API is defined as: +```C +typedef enum { + AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down + AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up + AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled +} amdsmi_xgmi_link_status_type_t; + +typedef struct { + uint32_t total_links; //!< The total links in the status array + amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; + uint64_t reserved[7]; +} amdsmi_xgmi_link_status_t; + +amdsmi_status_t amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, amdsmi_xgmi_link_status_t *link_status) +``` +Example CLI output: +```shell +$ amd-smi xgmi --link-status + +XGMI LINK STATUS: + bdf link_status +GPU0 0000:08:00.0 U U U U D U D X +GPU1 0000:44:00.0 U U U U D U D X +... + +* U:Up D:Down X:Disabled +``` + - **Added fclk and socclk info to `amd-smi metric -c/--clock`**. fclk and socclk information such as min and max clock have been added to the metric command, in line with all the other clocks. 
@@ -77,12 +113,43 @@ GPU: 0 DCLK1: N/A ``` -## amd_smi_lib for ROCm 6.4.0 - -### Added - ### Changed +- **Updated API `amdsmi_get_gpu_vram_info()` structure and CLI `amd-smi static --vram`** +Updated structure `amdsmi_vram_info_t`: +```C +typedef struct { + amdsmi_vram_type_t vram_type; + amdsmi_vram_vendor_type_t vram_vendor; + uint64_t vram_size; + uint32_t vram_bit_width; + uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s) + uint64_t reserved[4]; +} amdsmi_vram_info_t; + +amdsmi_status_t amdsmi_get_gpu_vram_info(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info) +``` +Example CLI output: +```shell +$ amd-smi static --vram +GPU: 0 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 16368 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s +GPU: 1 + VRAM: + TYPE: GDDR6 + VENDOR: N/A + SIZE: 30704 MB + BIT_WIDTH: 256 + MAX_BANDWIDTH: 1555 GB/s +... + +``` + ### Removed - **Removed `GFX_BUSY_ACC` from `amd-smi metric --usage`**. diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 957b2c8031..4eb319c7a4 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -751,7 +751,8 @@ class AMDSMICommands(): vram_info_dict = {"type" : "N/A", "vendor" : "N/A", "size" : "N/A", - "bit_width" : "N/A"} + "bit_width" : "N/A", + "max_bandwidth" : "N/A"} try: vram_info = amdsmi_interface.amdsmi_get_gpu_vram_info(args.gpu) @@ -790,6 +791,15 @@ class AMDSMICommands(): # Populate bit width vram_info_dict['bit_width'] = vram_info['vram_bit_width'] + # Populate vram_max_bandwidth + vram_max_bw = vram_info['vram_max_bandwidth'] + vram_max_bw_unit = 'GB/s' + if self.logger.is_human_readable_format(): + vram_info_dict["max_bandwidth"] = f"{vram_max_bw} {vram_max_bw_unit if vram_max_bw != 'N/A' else ''}" + if self.logger.is_json_format(): + vram_info_dict["max_bandwidth"] = {"value" : vram_max_bw, + "unit" : vram_max_bw_unit} + except 
amdsmi_exception.AmdSmiLibraryException as e: logging.debug("Failed to get vram info for gpu %s | %s", gpu_id, e.get_error_info()) @@ -1242,7 +1252,8 @@ class AMDSMICommands(): clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, fan=None, voltage_curve=None, overdrive=None, perf_level=None, xgmi_err=None, energy=None, mem_usage=None, schedule=None, - guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None): + guard=None, guest_data=None, fb_usage=None, xgmi=None, throttle=None, + ): """Get Metric information for target gpu Args: @@ -1338,7 +1349,8 @@ class AMDSMICommands(): current_platform_args += ["fan", "voltage_curve", "overdrive", "perf_level", "xgmi_err", "energy", "throttle"] current_platform_values += [args.fan, args.voltage_curve, args.overdrive, - args.perf_level, args.xgmi_err, args.energy, args.throttle] + args.perf_level, args.xgmi_err, args.energy, args.throttle, + ] if self.helpers.is_hypervisor(): if schedule: @@ -2221,6 +2233,7 @@ class AMDSMICommands(): 'socket_thermal_accumulated': "N/A", 'vr_thermal_accumulated': "N/A", 'hbm_thermal_accumulated': "N/A", + 'gfx_below_host_limit_acc': "N/A", # violation status values - active/not active 'prochot_violation_status': "N/A", @@ -2311,7 +2324,7 @@ class AMDSMICommands(): def metric_cpu(self, args, multiple_devices=False, cpu=None, cpu_power_metrics=None, cpu_prochot=None, cpu_freq_metrics=None, cpu_c0_res=None, cpu_lclk_dpm_level=None, - cpu_pwr_svi_telemtry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None, + cpu_pwr_svi_telemetry_rails=None, cpu_io_bandwidth=None, cpu_xgmi_bandwidth=None, cpu_metrics_ver=None, cpu_metrics_table=None, cpu_socket_energy=None, cpu_ddr_bandwidth=None, cpu_temp=None, cpu_dimm_temp_range_rate=None, cpu_dimm_pow_consumption=None, cpu_dimm_thermal_sensor=None): @@ -2354,8 +2367,8 @@ class AMDSMICommands(): args.cpu_c0_res = cpu_c0_res if cpu_lclk_dpm_level: args.cpu_lclk_dpm_level = cpu_lclk_dpm_level - if cpu_pwr_svi_telemtry_rails: - 
args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemtry_rails + if cpu_pwr_svi_telemetry_rails: + args.cpu_pwr_svi_telemtry_rails = cpu_pwr_svi_telemetry_rails if cpu_io_bandwidth: args.cpu_io_bandwidth = cpu_io_bandwidth if cpu_xgmi_bandwidth: @@ -2488,7 +2501,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: static_dict["socket_dpm"]["dpml_level_range"] = "N/A" logging.debug("Failed to get socket dpm level range for cpu %s | %s", cpu_id, e.get_error_info()) - if args.cpu_pwr_svi_telemtry_rails: + if args.cpu_pwr_svi_telemetry_rails: static_dict["svi_telemetry_all_rails"] = {} try: power = amdsmi_interface.amdsmi_get_cpu_pwr_svi_telemetry_all_rails(args.cpu) @@ -2756,8 +2769,7 @@ class AMDSMICommands(): None: Print output via AMDSMILogger to destination """ # TODO Move watch logic into here and make it driver agnostic or enable it for CPU arguments - - # Mutually exculsive args + # Mutually exclusive args if gpu: args.gpu = gpu if cpu: @@ -2832,7 +2844,7 @@ class AMDSMICommands(): cpu_dimm_pow_consumption, cpu_dimm_thermal_sensor) if args.core: self.logger.output = {} - self.logger.clear_multiple_devices_ouput() + self.logger.clear_multiple_devices_output() self.metric_core(args, multiple_devices, core, core_boost_limit, core_curr_active_freq_core_limit, core_energy) if args.gpu: @@ -2843,7 +2855,8 @@ class AMDSMICommands(): clock, temperature, ecc, ecc_blocks, pcie, fan, voltage_curve, overdrive, perf_level, xgmi_err, energy, mem_usage, schedule, - guard, guest_data, fb_usage, xgmi, throttle) + guard, guest_data, fb_usage, xgmi, throttle, + ) elif self.helpers.is_amd_hsmp_initialized(): # Only CPU is initialized if args.cpu == None and args.core == None: # If no args are set, print out all CPU and Core metrics info @@ -2877,7 +2890,8 @@ class AMDSMICommands(): usage, watch, watch_time, iterations, power, clock, temperature, ecc, ecc_blocks, pcie, fan, voltage_curve, overdrive, perf_level, - xgmi_err, energy, mem_usage, schedule, 
throttle) + xgmi_err, energy, mem_usage, schedule, throttle, + ) def process(self, args, multiple_devices=False, watching_output=False, @@ -5350,13 +5364,14 @@ class AMDSMICommands(): print("Placeholder for rocm-smi legacy commands") - def xgmi(self, args, multiple_devices=False, gpu=None, metric=None): + def xgmi(self, args, multiple_devices=False, gpu=None, metric=None, xgmi_link_status=None): """ Get topology information for target gpus params: args - argparser args to pass to subcommand multiple_devices (bool) - True if checking for multiple devices gpu (device_handle) - device_handle for target device metric (bool) - Value override for args.metric + xgmi_link_status (bool) - Value override for args.xgmi_link_status return: Nothing @@ -5368,6 +5383,8 @@ class AMDSMICommands(): args.gpu = gpu if metric: args.metric = metric + if xgmi_link_status: + args.link_status = xgmi_link_status # Handle No GPU passed if args.gpu == None: @@ -5377,8 +5394,9 @@ class AMDSMICommands(): args.gpu = [args.gpu] # Handle all args being false - if not any([args.metric]): + if not any([args.metric, args.link_status]): args.metric = True + args.link_status = True # Clear the table header self.logger.table_header = ''.rjust(7) @@ -5396,9 +5414,9 @@ class AMDSMICommands(): if args.metric: # prepend link metrics header to the table header - link_metrics_header = " " + "bdf".ljust(13) + \ - "bit_rate".ljust(9) + "max_bandwidth".ljust(14) + \ - "link_type".ljust(10) + link_metrics_header = " " + "bdf".ljust(14) + \ + "bit_rate".ljust(10) + "max_bandwidth".ljust(15) + \ + "link_type".ljust(11) self.logger.table_header = link_metrics_header + self.logger.table_header.strip() # Populate dictionary according to format @@ -5544,7 +5562,7 @@ class AMDSMICommands(): # Print out the tabular output self.logger.multiple_device_output = tabular_output - self.logger.table_title = "LINK METRIC TABLE" + self.logger.table_title = "\nLINK METRIC TABLE" 
self.logger.print_output(multiple_device_enabled=True, tabular=True) self.logger.multiple_device_output = xgmi_values @@ -5558,6 +5576,48 @@ class AMDSMICommands(): if not self.logger.is_human_readable_format(): self.logger.print_output(multiple_device_enabled=True) + if args.link_status: + # Header modification + self.logger.table_header = ''.rjust(7) + current_header = " ".ljust(7) + \ + "bdf".ljust(14) + \ + "link_status".ljust(20) + self.logger.table_header = current_header + self.logger.table_header.strip() + # Process each GPU + tabular_output = [] + for xgmi_dict in xgmi_values: + src_gpu_id = xgmi_dict['gpu'] + src_gpu_bdf = xgmi_dict['bdf'] + src_gpu = amdsmi_interface.amdsmi_get_processor_handle_from_bdf(src_gpu_bdf) + + # Populate link statuses + status_row = [] + tabular_output_dict = {"gpu#": f"GPU{src_gpu_id}", + "gpu": src_gpu_id, + "bdf": src_gpu_bdf, + "link_status": "N/A"} + try: + link_status = amdsmi_interface.amdsmi_get_gpu_xgmi_link_status(src_gpu) + tabular_output_dict['link_status'] = link_status['status'] + if self.logger.is_human_readable_format(): + del tabular_output_dict['gpu'] + else: + del tabular_output_dict['gpu#'] + tabular_output.append(tabular_output_dict) + except amdsmi_exception.AmdSmiLibraryException as e: + xgmi_dict['link_metrics']['link_status']={"status": "failed"} + logging.debug("Failed to get XGMI link status for GPU %s | %s", src_gpu_id, e.get_error_info()) + + #populate link status data for output + if self.logger.is_human_readable_format(): + xgmi_dict['link_status'] = tabular_output + self.logger.multiple_device_output= tabular_output + self.logger.table_title = "\nXGMI LINK STATUS" + self.logger.print_output(multiple_device_enabled=True, tabular=True) + self.logger.clear_multiple_devices_ouput() + if self.logger.is_human_readable_format(): + print("\n* U:Up D:Down X:Disabled".ljust(13)) + def partition(self, args, multiple_devices=False, gpu=None, current=None, memory=None, accelerator=None): """ Display parition 
information for the target GPU diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index 7018d8f72f..5c53583df3 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -780,6 +780,8 @@ class AMDSMIHelpers(): def convert_bytes_to_readable(self, bytes_input, format_length=None): + if isinstance(bytes_input, str): + return "N/A" for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB"]: if abs(bytes_input) < 1024: if format_length is not None: diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index f967910f7e..2bc183050a 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -132,15 +132,18 @@ class AMDSMILogger(): elif key == "gpu#": table_values += string_value.ljust(7) elif key == "bdf": - table_values += string_value.ljust(13) + table_values += string_value.ljust(14) elif "bdf_" in key: table_values += string_value.ljust(13) elif key == "bit_rate": - table_values += string_value.ljust(9) - elif key == "max_bandwidth": - table_values += string_value.ljust(14) - elif key == "link_type": table_values += string_value.ljust(10) + elif key == "max_bandwidth": + table_values += string_value.ljust(15) + elif key == "link_type": + table_values += string_value.ljust(11) + elif key == "link_status": + for i in value: + table_values += str(i).ljust(3) elif key == "memory": table_values += string_value.ljust(8) elif key == "accelerator_type": @@ -166,7 +169,7 @@ class AMDSMILogger(): elif key == "resources_shared": table_values += string_value.ljust(18) elif key == "RW": - table_values += string_value.ljust(53) + table_values += string_value.ljust(57) elif key == "process_list": #Add an additional padding between the first instance of GPU and NAME table_values += ' ' diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py 
b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index a6fbbc0c15..9a77e0f140 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -1313,6 +1313,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Help text for Arguments only on Guest and BM platforms metrics_help = "Metric XGMI information" + xgmi_link_status_help = "XGMI Link Status information" # Create xgmi subparser xgmi_parser = subparsers.add_parser('xgmi', help=xgmi_help, description=xgmi_subcommand_help) @@ -1326,6 +1327,7 @@ class AMDSMIParser(argparse.ArgumentParser): # Optional Args xgmi_parser.add_argument('-m', '--metric', action='store_true', required=False, help=metrics_help) + xgmi_parser.add_argument('-l', '--link-status', action='store_true', required=False, help=xgmi_link_status_help) def _add_partition_parser(self, subparsers, func): diff --git a/projects/amdsmi/example/amd_smi_drm_example.cc b/projects/amdsmi/example/amd_smi_drm_example.cc index 56050d6bb2..bcb92615c7 100644 --- a/projects/amdsmi/example/amd_smi_drm_example.cc +++ b/projects/amdsmi/example/amd_smi_drm_example.cc @@ -293,9 +293,11 @@ int main() { CHK_AMDSMI_RET(ret) printf(" Output of amdsmi_get_gpu_vram_info:\n"); printf("\tVRAM Size: 0x%lx (%ld) \n", vram_info.vram_size, vram_info.vram_size); - printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width, vram_info.vram_bit_width); - } - else { + printf("\tBIT Width: 0x%x (%d) \n\n", vram_info.vram_bit_width, + vram_info.vram_bit_width); + printf("\tVRAM max bandwidth: 0x%lx (%lu) \n\n", vram_info.vram_max_bandwidth, + vram_info.vram_max_bandwidth); + } else { printf("\t**amdsmi_get_gpu_vram_info() not supported on this system.\n"); } @@ -865,6 +867,18 @@ int main() { ++idx; } + std::cout << std::dec << "\txgmi_link_status= ["; + idx = 0; + for (const auto& temp : smu.xgmi_link_status) { + std::cout << temp; + if ((idx + 1) != std::size(smu.xgmi_link_status)) { + std::cout << ", "; + } else { + std::cout << "]\n"; + } + 
++idx; + } + // Voltage (mV) std::cout << "\tvoltage_soc = " << std::dec << smu.voltage_soc << "\n"; std::cout << "\tvoltage_gfx = " << std::dec << smu.voltage_gfx << "\n"; @@ -880,6 +894,9 @@ int main() { std::cout << "\tpcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; std::cout << "\tpcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; + // VRAM max bandwidth at max memory clock + std::cout << "\tvram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; + // Counts std::cout << "\tpcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc << "\n"; @@ -983,6 +1000,24 @@ int main() { idx++; } + idx = 0; + idy = 0; + std::cout << "\txcp_stats.gfx_below_host_limit_acc: " << "\n"; + for (auto& row : smu.xcp_stats) { + std::cout << "\t XCP [" << idx << "] : ["; + for (auto& col : row.gfx_below_host_limit_acc) { + if ((idy + 1) != static_cast(std::size(row.gfx_below_host_limit_acc))) { + std::cout << col << ", "; + } else { + std::cout << col; + } + idy++; + } + std::cout << "]\n"; + idy = 0; + idx++; + } + std::cout << "\n\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; constexpr uint16_t kMAX_ITER_TEST = 10; diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 153af77180..2f46874ad1 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -710,7 +710,8 @@ typedef struct { amdsmi_vram_vendor_type_t vram_vendor; uint64_t vram_size; uint32_t vram_bit_width; - uint64_t reserved[5]; + uint64_t vram_max_bandwidth; //!< The VRAM max bandwidth at current memory clock (GB/s) + uint64_t reserved[4]; } amdsmi_vram_info_t; typedef struct { @@ -1325,13 +1326,22 @@ typedef struct { * @brief The following structures hold the gpu statistics for a device. 
*/ typedef struct { - /* Utilization Instantaneous (%) */ + /* + * v1.6 additions + */ + /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[AMDSMI_MAX_NUM_XCC]; uint16_t jpeg_busy[AMDSMI_MAX_NUM_JPEG]; uint16_t vcn_busy[AMDSMI_MAX_NUM_VCN]; /* Utilization Accumulated (%) */ uint64_t gfx_busy_acc[AMDSMI_MAX_NUM_XCC]; + + /* + * v1.7 additions + */ + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_acc[AMDSMI_MAX_NUM_XCC]; } amdsmi_gpu_xcp_metrics_t; typedef struct { @@ -1533,9 +1543,30 @@ typedef struct { /* PCIE other end recovery counter */ uint32_t pcie_lc_perf_other_end_recovery; + /* + * v1.7 additions + */ + /* VRAM max bandwidth at max memory clock (GB/s) */ + uint64_t vram_max_bandwidth; + + /* XGMI link status(up/down) */ + uint16_t xgmi_link_status[AMDSMI_MAX_NUM_XGMI_LINKS]; + /// \endcond } amdsmi_gpu_metrics_t; +typedef enum { + AMDSMI_XGMI_LINK_DOWN, //!< The XGMI Link is down + AMDSMI_XGMI_LINK_UP, //!< The XGMI Link is up + AMDSMI_XGMI_LINK_DISABLE, //!< The XGMI Link is disabled +} amdsmi_xgmi_link_status_type_t; + +typedef struct { + uint32_t total_links; //!< The total links in the status array + amdsmi_xgmi_link_status_type_t status[AMDSMI_MAX_NUM_XGMI_LINKS]; + uint64_t reserved[7]; +} amdsmi_xgmi_link_status_t; + #define MAX_AMDSMI_NAME_LENGTH 64 /** @@ -4828,6 +4859,25 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info); +/** + * @brief Get the XGMI link status + * + * @platform{gpu_bm_linux} @platform{host} + * + * @details Given a processor handle @p processor_handle, this function + * will return the link status for each XGMI link connect to this processor. + * If the processor link type is not XGMI, it should return AMDSMI_STATUS_NOT_SUPPORTED. 
+ * + * @param[in] processor_handle a processor handle + * + * @param[out] link_status The link status of the XGMI connect to this processor. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, + amdsmi_xgmi_link_status_t* link_status); + /** @} End asicinfo */ /*****************************************************************************/ @@ -5756,6 +5806,7 @@ amdsmi_status_t amdsmi_get_cpu_model(uint32_t *cpu_model); * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t amdsmi_get_esmi_err_msg(amdsmi_status_t status, const char **status_string); + #endif /** @} auxiquer */ diff --git a/projects/amdsmi/py-interface/__init__.py b/projects/amdsmi/py-interface/__init__.py index 0d784aa02c..b52765571e 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ b/projects/amdsmi/py-interface/__init__.py @@ -105,6 +105,7 @@ from .amdsmi_interface import amdsmi_get_clock_info from .amdsmi_interface import amdsmi_get_pcie_info from .amdsmi_interface import amdsmi_get_gpu_bad_page_info from .amdsmi_interface import amdsmi_get_violation_status +from .amdsmi_interface import amdsmi_get_gpu_xgmi_link_status # # Process Information from .amdsmi_interface import amdsmi_get_gpu_process_list diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 1a809a0fe6..728dfb1e8a 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -1818,15 +1818,50 @@ def amdsmi_get_gpu_vram_info( amdsmi_wrapper.amdsmi_get_gpu_vram_info( processor_handle, ctypes.byref(vram_info)) ) - return { "vram_type": vram_info.vram_type, "vram_vendor": vram_info.vram_vendor, "vram_size": vram_info.vram_size, - "vram_bit_width": vram_info.vram_bit_width + "vram_bit_width": 
_validate_if_max_uint(vram_info.vram_bit_width, MaxUIntegerTypes.UINT32_T), + "vram_max_bandwidth": _validate_if_max_uint(vram_info.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T), } +def amdsmi_get_gpu_xgmi_link_status( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + ) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + status_info = amdsmi_wrapper.amdsmi_xgmi_link_status_t() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_xgmi_link_status( + processor_handle, ctypes.byref(status_info)) + ) + + link_status = [] + count = 0 + for link in status_info.status: + if count == status_info.total_links: + break + if amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_DISABLE': # XGMI link is disabled + link_status.append("X") + elif amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_UP': # XGMI Link is up + link_status.append("U") + elif amdsmi_wrapper.amdsmi_xgmi_link_status_type_t__enumvalues[link] == 'AMDSMI_XGMI_LINK_DOWN': # XGMI Link is down + link_status.append("D") + else: + link_status.append("N/A") + count += 1 + + return_dict = { + "status" : link_status, + "total_links": status_info.total_links, + } + return return_dict + + def amdsmi_get_gpu_cache_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> List[Dict[str, Any]]: @@ -3863,7 +3898,10 @@ def amdsmi_get_gpu_metrics_info( "xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats), "xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats), "xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats), "pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T), + "vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, 
MaxUIntegerTypes.UINT64_T), + "xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T), } # Create 2d array with each XCD's stats @@ -3893,6 +3931,12 @@ def amdsmi_get_gpu_metrics_info( for val in item.gfx_busy_acc: print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) gpu_metrics_output[k][curr_xcp] = print_xcp_detail + if 'xcp_stats.gfx_below_host_limit_acc' in k: + for curr_xcp, item in enumerate(v): + print_xcp_detail = [] + for val in item.gfx_below_host_limit_acc: + print_xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T, isActivity=True)) + gpu_metrics_output[k][curr_xcp] = print_xcp_detail return gpu_metrics_output diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index e1513cdbb7..4c3437711b 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -1044,7 +1044,8 @@ struct_amdsmi_vram_info_t._fields_ = [ ('vram_size', ctypes.c_uint64), ('vram_bit_width', ctypes.c_uint32), ('PADDING_0', ctypes.c_ubyte * 4), - ('reserved', ctypes.c_uint64 * 5), + ('vram_max_bandwidth', ctypes.c_uint64), + ('reserved', ctypes.c_uint64 * 4), ] amdsmi_vram_info_t = struct_amdsmi_vram_info_t @@ -1119,6 +1120,16 @@ amdsmi_process_handle_t = ctypes.c_uint32 class struct_amdsmi_proc_info_t(Structure): pass +class struct_engine_usage_(Structure): + pass + +struct_engine_usage_._pack_ = 1 # source:False +struct_engine_usage_._fields_ = [ + ('gfx', ctypes.c_uint64), + ('enc', ctypes.c_uint64), + ('reserved', ctypes.c_uint32 * 12), +] + class struct_memory_usage_(Structure): pass @@ -1130,16 +1141,6 @@ struct_memory_usage_._fields_ = [ ('reserved', ctypes.c_uint32 * 10), ] -class struct_engine_usage_(Structure): - pass - -struct_engine_usage_._pack_ = 1 # source:False -struct_engine_usage_._fields_ = [ - ('gfx', ctypes.c_uint64), - ('enc', ctypes.c_uint64), - 
('reserved', ctypes.c_uint32 * 12), -] - struct_amdsmi_proc_info_t._pack_ = 1 # source:False struct_amdsmi_proc_info_t._fields_ = [ ('name', ctypes.c_char * 256), @@ -1739,6 +1740,7 @@ struct_amdsmi_gpu_xcp_metrics_t._fields_ = [ ('jpeg_busy', ctypes.c_uint16 * 32), ('vcn_busy', ctypes.c_uint16 * 4), ('gfx_busy_acc', ctypes.c_uint64 * 8), + ('gfx_below_host_limit_acc', ctypes.c_uint64 * 8), ] amdsmi_gpu_xcp_metrics_t = struct_amdsmi_gpu_xcp_metrics_t @@ -1820,9 +1822,34 @@ struct_amdsmi_gpu_metrics_t._fields_ = [ ('xcp_stats', struct_amdsmi_gpu_xcp_metrics_t * 8), ('pcie_lc_perf_other_end_recovery', ctypes.c_uint32), ('PADDING_5', ctypes.c_ubyte * 4), + ('vram_max_bandwidth', ctypes.c_uint64), + ('xgmi_link_status', ctypes.c_uint16 * 8), ] amdsmi_gpu_metrics_t = struct_amdsmi_gpu_metrics_t + +# values for enumeration 'amdsmi_xgmi_link_status_type_t' +amdsmi_xgmi_link_status_type_t__enumvalues = { + 0: 'AMDSMI_XGMI_LINK_DOWN', + 1: 'AMDSMI_XGMI_LINK_UP', + 2: 'AMDSMI_XGMI_LINK_DISABLE', +} +AMDSMI_XGMI_LINK_DOWN = 0 +AMDSMI_XGMI_LINK_UP = 1 +AMDSMI_XGMI_LINK_DISABLE = 2 +amdsmi_xgmi_link_status_type_t = ctypes.c_uint32 # enum +class struct_amdsmi_xgmi_link_status_t(Structure): + pass + +struct_amdsmi_xgmi_link_status_t._pack_ = 1 # source:False +struct_amdsmi_xgmi_link_status_t._fields_ = [ + ('total_links', ctypes.c_uint32), + ('status', amdsmi_xgmi_link_status_type_t * 8), + ('PADDING_0', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint64 * 7), +] + +amdsmi_xgmi_link_status_t = struct_amdsmi_xgmi_link_status_t class struct_amdsmi_name_value_t(Structure): pass @@ -2397,6 +2424,9 @@ amdsmi_get_pcie_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_ amdsmi_get_xgmi_info = _libraries['libamd_smi.so'].amdsmi_get_xgmi_info amdsmi_get_xgmi_info.restype = amdsmi_status_t amdsmi_get_xgmi_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_info_t)] +amdsmi_get_gpu_xgmi_link_status = 
_libraries['libamd_smi.so'].amdsmi_get_gpu_xgmi_link_status +amdsmi_get_gpu_xgmi_link_status.restype = amdsmi_status_t +amdsmi_get_gpu_xgmi_link_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_xgmi_link_status_t)] amdsmi_get_fw_info = _libraries['libamd_smi.so'].amdsmi_get_fw_info amdsmi_get_fw_info.restype = amdsmi_status_t amdsmi_get_fw_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_fw_info_t)] @@ -2763,8 +2793,9 @@ __all__ = \ 'AMDSMI_VRAM_VENDOR__PLACEHOLDER3', 'AMDSMI_VRAM_VENDOR__PLACEHOLDER4', 'AMDSMI_VRAM_VENDOR__PLACEHOLDER5', 'AMDSMI_VRAM_VENDOR__SAMSUNG', - 'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_STATUS_ERROR', - 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', + 'AMDSMI_VRAM_VENDOR__WINBOND', 'AMDSMI_XGMI_LINK_DISABLE', + 'AMDSMI_XGMI_LINK_DOWN', 'AMDSMI_XGMI_LINK_UP', + 'AMDSMI_XGMI_STATUS_ERROR', 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', 'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN', 'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t', 'amdsmi_accelerator_partition_profile_t', @@ -2851,7 +2882,8 @@ __all__ = \ 'amdsmi_get_gpu_total_ecc_count', 'amdsmi_get_gpu_vbios_info', 'amdsmi_get_gpu_vendor_name', 'amdsmi_get_gpu_volt_metric', 'amdsmi_get_gpu_vram_info', 'amdsmi_get_gpu_vram_usage', - 'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_hsmp_metrics_table', + 'amdsmi_get_gpu_vram_vendor', 'amdsmi_get_gpu_xgmi_link_status', + 'amdsmi_get_hsmp_metrics_table', 'amdsmi_get_hsmp_metrics_table_version', 'amdsmi_get_lib_version', 'amdsmi_get_link_metrics', 'amdsmi_get_link_topology_nearest', 'amdsmi_get_minmax_bandwidth_between_processors', @@ -2924,6 +2956,7 @@ __all__ = \ 'amdsmi_voltage_metric_t', 'amdsmi_voltage_type_t', 'amdsmi_vram_info_t', 'amdsmi_vram_type_t', 'amdsmi_vram_usage_t', 'amdsmi_vram_vendor_type_t', 'amdsmi_xgmi_info_t', + 'amdsmi_xgmi_link_status_t', 'amdsmi_xgmi_link_status_type_t', 'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t', 'struct__links', 'struct_amd_metrics_table_header_t', 
'struct_amdsmi_accelerator_partition_profile_t', @@ -2958,7 +2991,8 @@ __all__ = \ 'struct_amdsmi_vbios_info_t', 'struct_amdsmi_version_t', 'struct_amdsmi_violation_status_t', 'struct_amdsmi_vram_info_t', 'struct_amdsmi_vram_usage_t', 'struct_amdsmi_xgmi_info_t', - 'struct_cache_', 'struct_engine_usage_', 'struct_fw_info_list_', + 'struct_amdsmi_xgmi_link_status_t', 'struct_cache_', + 'struct_engine_usage_', 'struct_fw_info_list_', 'struct_memory_usage_', 'struct_nps_flags_', 'struct_pcie_metric_', 'struct_pcie_static_', 'struct_amdsmi_bdf_t', 'uint32_t', 'uint64_t', 'uint8_t', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index f4c58b5bcc..497a08e2f4 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1085,6 +1085,9 @@ typedef struct metrics_table_header_t metrics_table_header_t; * @brief The following structures hold the gpu statistics for a device. 
*/ struct amdgpu_xcp_metrics_t { + /* + * v1.6 additions + */ /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[RSMI_MAX_NUM_XCC]; uint16_t jpeg_busy[RSMI_MAX_NUM_JPEG_ENGS]; @@ -1092,6 +1095,12 @@ struct amdgpu_xcp_metrics_t { /* Utilization Accumulated (%) */ uint64_t gfx_busy_acc[RSMI_MAX_NUM_XCC]; + + /* + * v1.7 additions + */ + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_acc[RSMI_MAX_NUM_XCC]; }; typedef struct { @@ -1295,6 +1304,15 @@ typedef struct { /* PCIE other end recovery counter */ uint32_t pcie_lc_perf_other_end_recovery; + /* + * v1.7 additions + */ + /* VRAM max bandwidth at max memory clock */ + uint64_t vram_max_bandwidth; + + /* XGMI link status(up/down) */ + uint16_t xgmi_link_status[RSMI_MAX_NUM_XGMI_LINKS]; + /// \endcond } rsmi_gpu_metrics_t; diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index 4a00fbb317..b47e4c8b44 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -88,6 +88,19 @@ struct AMDGpuMetricsHeader_v1_t { uint8_t m_content_revision; }; +struct amdgpu_xcp_metrics_v1_1 { + /* Utilization Instantaneous (%) */ + uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; + uint16_t jpeg_busy[kRSMI_MAX_JPEG_ENGINES]; + uint16_t vcn_busy[kRSMI_MAX_NUM_VCNS]; + + /* Utilization Accumulated (%) */ + uint64_t gfx_busy_acc[kRSMI_MAX_NUM_XCC]; + + /* Total App Clock Counter Accumulated */ + uint64_t gfx_below_host_limit_acc[kRSMI_MAX_NUM_XCC]; +}; + struct amdgpu_xcp_metrics { /* Utilization Instantaneous (%) */ uint32_t gfx_busy_inst[kRSMI_MAX_NUM_XCC]; @@ -551,7 +564,107 @@ struct AMDGpuMetrics_v16_t { /* PCIE other end recovery counter */ uint32_t m_pcie_lc_perf_other_end_recovery; }; -using AMGpuMetricsLatest_t = AMDGpuMetrics_v16_t; + +struct AMDGpuMetrics_v17_t { + ~AMDGpuMetrics_v17_t() = default; + struct 
AMDGpuMetricsHeader_v1_t m_common_header; + + /* Temperature (Celsius) */ + uint16_t m_temperature_hotspot; + uint16_t m_temperature_mem; + uint16_t m_temperature_vrsoc; + + /* Power (Watts) */ + uint16_t m_current_socket_power; + + /* Utilization (%) */ + uint16_t m_average_gfx_activity; + uint16_t m_average_umc_activity; // memory controller + + /* VRAM max bandwidth at max memory clock (GB/s) */ + uint64_t m_vram_max_bandwidth; // new for 1.7 + + /* Energy (15.259uJ (2^-16) units) */ + uint64_t m_energy_accumulator; + + /* Driver attached timestamp (in ns) */ + uint64_t m_system_clock_counter; + + /* Accumulation cycle counter */ + uint32_t m_accumulation_counter; + + /* Accumulated throttler residencies */ + uint32_t m_prochot_residency_acc; + uint32_t m_ppt_residency_acc; + uint32_t m_socket_thm_residency_acc; + uint32_t m_vr_thm_residency_acc; + uint32_t m_hbm_thm_residency_acc; + + /* Clock Lock Status. Each bit corresponds to clock instance */ + uint32_t m_gfxclk_lock_status; + + /* Link width (number of lanes) and speed (in 0.1 GT/s) */ + uint16_t m_pcie_link_width; + uint16_t m_pcie_link_speed; + + /* XGMI bus width and bitrate (in Gbps) */ + uint16_t m_xgmi_link_width; + uint16_t m_xgmi_link_speed; + + /* Utilization Accumulated (%) */ + uint32_t m_gfx_activity_acc; + uint32_t m_mem_activity_acc; + + /*PCIE accumulated bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_acc; + + /*PCIE instantaneous bandwidth (GB/sec) */ + uint64_t m_pcie_bandwidth_inst; + + /* PCIE L0 to recovery state transition accumulated count */ + uint64_t m_pcie_l0_to_recov_count_acc; + + /* PCIE replay accumulated count */ + uint64_t m_pcie_replay_count_acc; + + /* PCIE replay rollover accumulated count */ + uint64_t m_pcie_replay_rover_count_acc; + + /* PCIE NAK sent accumulated count */ + uint32_t m_pcie_nak_sent_count_acc; + + /* PCIE NAK received accumulated count */ + uint32_t m_pcie_nak_rcvd_count_acc; + + /* XGMI accumulated data transfer size(KiloBytes) */ + uint64_t 
m_xgmi_read_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + uint64_t m_xgmi_write_data_acc[kRSMI_MAX_NUM_XGMI_LINKS]; + + /* XGMI link status(up/down) */ + uint16_t m_xgmi_link_status[kRSMI_MAX_NUM_XGMI_LINKS]; // new for 1.7 + + uint16_t m_padding; + + /* PMFW attached timestamp (10ns resolution) */ + uint64_t m_firmware_timestamp; + + /* Current clocks (Mhz) */ + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_GFX_CLKS]; + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_uclk; + + /* Number of current partition */ + uint16_t m_num_partition; + + /* XCP metrics stats */ + struct amdgpu_xcp_metrics_v1_1 m_xcp_stats[kRSMI_MAX_NUM_XCP]; + + /* PCIE other end recovery counter */ + uint32_t m_pcie_lc_perf_other_end_recovery; +}; +using AMGpuMetricsLatest_t = AMDGpuMetrics_v17_t; /** * This is GPU Metrics version that gets to public access. @@ -766,8 +879,11 @@ enum class AMDGpuMetricsUnitType_t : AMDGpuMetricTypeId_t kMetricJpegBusy, // v1.6 kMetricVcnBusy, // v1.6 kMetricGfxBusyAcc, // v1.6 - kMetricPcieLCPerfOtherEndRecov, // v1.6 + + kMetricVramMaxBandwidth, // v1.7 + kMetricXgmiLinkStatus, // v1.7 + kMetricGfxBelowHostLimitAccumulator, // v1.7 }; using AMDGpuMetricsUnitTypeTranslationTbl_t = std::map; @@ -805,6 +921,7 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV14 = (0x1 << 4), kGpuMetricV15 = (0x1 << 5), kGpuMetricV16 = (0x1 << 6), + kGpuMetricV17 = (0x1 << 7), }; using AMDGpuMetricVersionTranslationTbl_t = std::map; using GpuMetricTypePtr_t = std::shared_ptr; @@ -1023,6 +1140,36 @@ class GpuMetricsBase_v16_t final : public GpuMetricsBase_t { std::shared_ptr m_gpu_metric_ptr; }; +class GpuMetricsBase_v17_t final : public GpuMetricsBase_t { + public: + ~GpuMetricsBase_v17_t() = default; + + size_t sizeof_metric_table() override { + return sizeof(AMDGpuMetrics_v17_t); + } + + GpuMetricTypePtr_t get_metrics_table() override { + if 
(!m_gpu_metric_ptr) { + m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v17_t*){}); + } + assert(m_gpu_metric_ptr != nullptr); + return m_gpu_metric_ptr; + } + + void dump_internal_metrics_table() override; + + AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { + return AMDGpuMetricVersionFlags_t::kGpuMetricV17; + } + + rsmi_status_t populate_metrics_dynamic_tbl() override; + AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; + + private: + AMDGpuMetrics_v17_t m_gpu_metrics_tbl; + std::shared_ptr m_gpu_metric_ptr; +}; + template rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnitType_t metric_counter, T& metric_value); diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index b7443652bd..d05bd77aed 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -136,6 +136,7 @@ std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metri // version 1.4: 260 // version 1.5: 261 // version 1.6: 262 +// version 1.7: 263 // const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table { @@ -145,6 +146,7 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 4), AMDGpuMetricVersionFlags_t::kGpuMetricV14}, {join_metrics_version(1, 5), AMDGpuMetricVersionFlags_t::kGpuMetricV15}, {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, + {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, }; /** @@ -264,6 +266,12 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation // kGpuMetricLinkWidthSpeed {AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, "PcieLCPerfOtherEndRecov"}, /* v1.6 */ + + + {AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, "XgmiLinkStatus"}, /* v1.7 */ + 
{AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, "VramMaxBandwidth"}, /* v1.7 */ + {AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, + "GfxBelowHostLimitAccumulator"}, /* v1.7 */ }; @@ -352,6 +360,7 @@ AMDGpuMetricFactories_t amd_gpu_metrics_factory_table {AMDGpuMetricVersionFlags_t::kGpuMetricV14, std::make_shared(GpuMetricsBase_v14_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV15, std::make_shared(GpuMetricsBase_v15_t{})}, {AMDGpuMetricVersionFlags_t::kGpuMetricV16, std::make_shared(GpuMetricsBase_v16_t{})}, + {AMDGpuMetricVersionFlags_t::kGpuMetricV17, std::make_shared(GpuMetricsBase_v17_t{})}, }; GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version) @@ -470,6 +479,197 @@ AMDGpuDynamicMetricTblValues_t format_metric_row(const T& metric, const std::str return multi_values; } +void GpuMetricsBase_v17_t::dump_internal_metrics_table() +{ + std::ostringstream ss; + auto idx = uint64_t(0); + auto idy = uint64_t(0); + std::cout << __PRETTY_FUNCTION__ << " | ======= start ======= \n"; + ss << __PRETTY_FUNCTION__ + << " | ======= DEBUG ======= " + << " | Metric Version: " + << stringfy_metric_header_version(m_gpu_metrics_tbl.m_common_header) + << " | Size: " + << print_unsigned_int(m_gpu_metrics_tbl.m_common_header.m_structure_size) + << " |" + << "\n"; + ss << " temperature_hotspot: " << m_gpu_metrics_tbl.m_temperature_hotspot << "\n" + << " temperature_mem: " << m_gpu_metrics_tbl.m_temperature_mem << "\n" + << " temperature_vrsoc: " << m_gpu_metrics_tbl.m_temperature_vrsoc << "\n" + << " current_socket_power: " << m_gpu_metrics_tbl.m_current_socket_power << "\n" + << " average_gfx_activity: " << m_gpu_metrics_tbl.m_average_gfx_activity << "\n" + << " average_umc_activity: " << m_gpu_metrics_tbl.m_average_umc_activity << "\n"; + + ss << " vram_max_bandwidth: " << m_gpu_metrics_tbl.m_vram_max_bandwidth << "\n" // new for v1.7 + << " energy_accumulator: " << m_gpu_metrics_tbl.m_energy_accumulator << "\n" + << " 
system_clock_counter: " << m_gpu_metrics_tbl.m_system_clock_counter << "\n" + << " accumulation_counter: " << m_gpu_metrics_tbl.m_accumulation_counter << "\n" + << " prochot_residency_acc: " << m_gpu_metrics_tbl.m_prochot_residency_acc << "\n" + << " ppt_residency_acc: " << m_gpu_metrics_tbl.m_ppt_residency_acc << "\n" + << " socket_thm_residency_acc: " << m_gpu_metrics_tbl.m_socket_thm_residency_acc << "\n" + << " vr_thm_residency_acc: " << m_gpu_metrics_tbl.m_vr_thm_residency_acc << "\n" + << " hbm_thm_residency_acc: " << m_gpu_metrics_tbl.m_hbm_thm_residency_acc << "\n" + << " gfxclk_lock_status: " << m_gpu_metrics_tbl.m_gfxclk_lock_status << "\n" + << " pcie_link_width: " << m_gpu_metrics_tbl.m_pcie_link_width << "\n" + << " pcie_link_speed: " << m_gpu_metrics_tbl.m_pcie_link_speed << "\n" + << " xgmi_link_width: " << m_gpu_metrics_tbl.m_xgmi_link_width << "\n" + << " xgmi_link_speed: " << m_gpu_metrics_tbl.m_xgmi_link_speed << "\n" + << " gfx_activity_acc: " << m_gpu_metrics_tbl.m_gfx_activity_acc << "\n" + << " mem_activity_acc: " << m_gpu_metrics_tbl.m_mem_activity_acc << "\n" + << " pcie_bandwidth_acc: " << m_gpu_metrics_tbl.m_pcie_bandwidth_acc << "\n" + << " pcie_bandwidth_inst: " << m_gpu_metrics_tbl.m_pcie_bandwidth_inst << "\n" + << " pcie_l0_to_recov_count_acc: " << m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc << "\n" + << " pcie_replay_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_count_acc << "\n" + << " pcie_replay_rover_count_acc: " << m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc << "\n" + << " pcie_nak_sent_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc << "\n" + << " pcie_nak_rcvd_count_acc: " << m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc << "\n" + << " firmware_timestamp: " << m_gpu_metrics_tbl.m_firmware_timestamp << "\n" + << " current_uclk: " << m_gpu_metrics_tbl.m_current_uclk << "\n" + << " num_partition: " << m_gpu_metrics_tbl.m_num_partition << "\n" + << " pcie_lc_perf_other_end_recovery: " + << 
m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_link_status) { // new for v1.7 + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_read_data_acc) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " xgmi_write_data_acc: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_xgmi_write_data_acc) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_gfxclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_gfxclk) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_socclk: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_socclk) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_vclk0: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_vclk0) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + ss << " current_dclk0: " << "\n"; + idx = 0; + for (const auto& temp : m_gpu_metrics_tbl.m_current_dclk0) { + ss << "\t [" << idx << "]: " << temp << "\n"; + ++idx; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.gfx_busy_inst: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.gfx_busy_inst) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.gfx_busy_inst) - std::end(row.gfx_busy_inst) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.vcn_busy: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.vcn_busy) { + ss << "\t [" << idx << "] [" << idy << 
"]: " << col; + if (idy + 1 != (std::end(row.vcn_busy) - std::end(row.vcn_busy) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.jpeg_busy: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.jpeg_busy) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.jpeg_busy) - std::end(row.jpeg_busy) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + idx = 0; + idy = 0; + ss << " xcp_stats.gfx_busy_acc: " << "\n"; + for (auto& row : m_gpu_metrics_tbl.m_xcp_stats) { + if (idx == 0) { + ss << "\t [ "; + } + for (auto& col : row.gfx_busy_acc) { + ss << "\t [" << idx << "] [" << idy << "]: " << col; + if (idy + 1 != (std::end(row.gfx_busy_acc) - std::end(row.gfx_busy_acc) - 1)) { + ss << ", "; + } + if (idx + 1 != + (std::end(m_gpu_metrics_tbl.m_xcp_stats) - std::end(m_gpu_metrics_tbl.m_xcp_stats) - 1)) { + ss << "\n"; + } else { + ss << "]\n"; + } + idy++; + } + idx++; + } + + LOG_DEBUG(ss); +} + + void GpuMetricsBase_v16_t::dump_internal_metrics_table() { std::ostringstream ss; @@ -663,6 +863,263 @@ void GpuMetricsBase_v16_t::dump_internal_metrics_table() LOG_DEBUG(ss); } +rsmi_status_t GpuMetricsBase_v17_t::populate_metrics_dynamic_tbl() { + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + if (!m_metrics_dynamic_tbl.empty()) { + m_metrics_dynamic_tbl.clear(); + } + + // + // Note: Any metric treatment/changes (if any) should happen before they + // get written to internal/external tables. 
+ // + auto run_metric_adjustments_v17 = [&]() { + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + const auto gpu_metrics_version = + translate_flag_to_metric_version(get_gpu_metrics_version_used()); + ss << __PRETTY_FUNCTION__ + << " | ======= info ======= " + << " | Applying adjustments " + << " | Metric Version: " << stringfy_metric_header_version( + disjoin_metrics_version(gpu_metrics_version)) + << " |"; + LOG_TRACE(ss); + + // firmware_timestamp is at 10ns resolution + ss << __PRETTY_FUNCTION__ + << " | ======= Changes ======= " + << " | {m_firmware_timestamp} from: " << m_gpu_metrics_tbl.m_firmware_timestamp + << " to: " << (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + m_gpu_metrics_tbl.m_firmware_timestamp = (m_gpu_metrics_tbl.m_firmware_timestamp * 10); + LOG_DEBUG(ss); + }; + + // Adjustments/Changes specific to this version + run_metric_adjustments_v17(); + + // Temperature Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, + format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, + format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, + "temperature_mem"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, + "temperature_vrsoc"))); + + // Power/Energy Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, + "curr_socket_power"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, + "energy_acc"))); + + // Utilization Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, + "average_gfx_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, + "average_umc_activity"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, + "gfx_activity_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, + "mem_activity_acc"))); + + // Timestamp Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, + format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, + "firmware_timestamp"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, + "system_clock_counter"))); + + + // GfxLock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, + "gfxclk_lock_status"))); + + // Link/Width/Speed Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + 
format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, + "pcie_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, + "pcie_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, + "xgmi_link_width"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, + "xgmi_link_speed"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, + "pcie_bandwidth_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, + "pcie_bandwidth_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, + "pcie_replay_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + 
"pcie_replay_rollover_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, + "pcie_nak_sent_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, + "pcie_nak_rcvd_count_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, + "[xgmi_read_data_acc]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, + "[xgmi_write_data_acc]"))); + // new for v1.7 + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, + format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status, + "[xgmi_link_status]"))); + // CurrentClock Info + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, + "[current_gfxclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + format_metric_row(m_gpu_metrics_tbl.m_current_socclk, + "[current_socclk]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, + "[current_vclk0]"))); + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, + "[current_dclk0]"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, + format_metric_row(m_gpu_metrics_tbl.m_current_uclk, + "current_uclk"))); + + /* Accumulation cycle counter */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, + format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter, + "accumulation_counter"))); + + /* Accumulated throttler residencies */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc, + "prochot_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc, + "ppt_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc, + "socket_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc, + "vr_thm_residency_acc"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, + 
format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc, + "hbm_thm_residency_acc"))); + + /* Partition info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, + format_metric_row(m_gpu_metrics_tbl.m_num_partition, + "num_partition"))); + + /* xcp_stats info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, + "xcp_stats->gfx_busy_inst"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, + "xcp_stats->vcn_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, + "xcp_stats->jpeg_busy"))); + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, + "xcp_stats->gfx_busy_acc"))); + + /* PCIE other end recovery counter info */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, + format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, + "pcie_lc_perf_other_end_recovery"))); + + /* VRAM max bandwidth at max memory clock */ + m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, + format_metric_row(m_gpu_metrics_tbl.m_vram_max_bandwidth, + "vram_max_bandwidth"))); + + /* Total App Clock Counter Accumulated */ + 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] + .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitAccumulator, + format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_acc, + "gfx_below_host_limit_acc"))); + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ss); + + return status_code; +} + rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); @@ -700,7 +1157,6 @@ rsmi_status_t GpuMetricsBase_v16_t::populate_metrics_dynamic_tbl() { // Adjustments/Changes specific to this version run_metric_adjustments_v16(); - // Temperature Info m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, @@ -1594,6 +2050,12 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m rsmi_gpu_metrics.pcie_link_speed = init_max_uint_types(); rsmi_gpu_metrics.gfx_activity_acc = init_max_uint_types(); rsmi_gpu_metrics.mem_activity_acc = init_max_uint_types(); + rsmi_gpu_metrics.vram_max_bandwidth = init_max_uint_types(); + + std::fill(std::begin(rsmi_gpu_metrics.xgmi_link_status), + std::end(rsmi_gpu_metrics.xgmi_link_status), + init_max_uint_types()); + std::fill(std::begin(rsmi_gpu_metrics.temperature_hbm), std::end(rsmi_gpu_metrics.temperature_hbm), @@ -1671,6 +2133,8 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m init_max_uint_types()); std::fill(std::begin(row.gfx_busy_acc), std::end(row.gfx_busy_acc), init_max_uint_types()); + std::fill(std::begin(row.gfx_below_host_limit_acc), std::end(row.gfx_below_host_limit_acc), + init_max_uint_types()); } ss << __PRETTY_FUNCTION__ @@ -1683,6 +2147,225 @@ rsmi_status_t init_max_public_gpu_matrics(AMGpuMetricsPublicLatest_t& rsmi_gpu_m return 
status_code; } +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v17_t::copy_internal_to_external_metrics() +{ + std::ostringstream ss; + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + auto copy_data_from_internal_metrics_tbl = [&]() { + AMGpuMetricsPublicLatest_t metrics_public_init{}; + + // + // Note: Initializing data members with their max. If field is max, + // no data was assigned to it. + init_max_public_gpu_matrics(metrics_public_init); + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + + + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + + // Clock Lock Status. 
Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + + // PCIE NAK sent accumulated count + metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; + + // PCIE NAK received accumulated count + metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; + + // Accumulated throttler residencies + // bumped up public to uint64_t due to planned size increase for newer ASICs + metrics_public_init.accumulation_counter = m_gpu_metrics_tbl.m_accumulation_counter; + metrics_public_init.prochot_residency_acc = m_gpu_metrics_tbl.m_prochot_residency_acc; + metrics_public_init.ppt_residency_acc = 
m_gpu_metrics_tbl.m_ppt_residency_acc; + metrics_public_init.socket_thm_residency_acc = m_gpu_metrics_tbl.m_socket_thm_residency_acc; + metrics_public_init.vr_thm_residency_acc = m_gpu_metrics_tbl.m_vr_thm_residency_acc; + metrics_public_init.hbm_thm_residency_acc = m_gpu_metrics_tbl.m_hbm_thm_residency_acc; + + /* VRAM max bandwidth at max memory clock */ + metrics_public_init.vram_max_bandwidth = m_gpu_metrics_tbl.m_vram_max_bandwidth; + + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); + + // xgmi_link_status // new for 1.7 + const auto xgmi_link_status_num_elems = static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_link_status) - + std::begin(m_gpu_metrics_tbl.m_xgmi_link_status)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_link_status), + xgmi_link_status_num_elems, + metrics_public_init.xgmi_link_status); + + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + 
std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + + metrics_public_init.num_partition = m_gpu_metrics_tbl.m_num_partition; + + metrics_public_init.pcie_lc_perf_other_end_recovery = + m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery; + + auto priv_it = std::begin(m_gpu_metrics_tbl.m_xcp_stats); + for (auto pub_it = std::begin(metrics_public_init.xcp_stats); + pub_it != std::end(metrics_public_init.xcp_stats); + ++pub_it, ++priv_it) { + std::copy_n(std::begin(priv_it->gfx_busy_inst), RSMI_MAX_NUM_XCC, + pub_it->gfx_busy_inst); + std::copy_n(std::begin(priv_it->jpeg_busy), RSMI_MAX_NUM_JPEG_ENGS, + pub_it->jpeg_busy); + std::copy_n(std::begin(priv_it->vcn_busy), RSMI_MAX_NUM_VCNS, + pub_it->vcn_busy); + std::copy_n(std::begin(priv_it->gfx_busy_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_busy_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_acc); + } + + // + // Note: Backwards compatibility -> Handling extra/exception cases + // related to earlier versions (1.3/1.4/1.5) + metrics_public_init.current_gfxclk = metrics_public_init.current_gfxclks[0]; + + 
metrics_public_init.current_socclk = metrics_public_init.current_socclks[0]; + + metrics_public_init.current_vclk0 = metrics_public_init.current_vclk0s[0]; + + metrics_public_init.current_vclk1 = metrics_public_init.current_vclk0s[1]; + + metrics_public_init.current_dclk0 = metrics_public_init.current_dclk0s[0]; + + metrics_public_init.current_dclk1 = metrics_public_init.current_dclk0s[1]; + + // separate by XCP + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy[0] != UINT16_MAX) { + std::copy(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::end(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].vcn_busy), + std::begin(metrics_public_init.vcn_activity)); + } + if (this->m_partition_id < kRSMI_MAX_NUM_XCP + && m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy[0] != UINT16_MAX) { + std::copy(std::begin(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy), + std::end(m_gpu_metrics_tbl.m_xcp_stats[this->m_partition_id].jpeg_busy), + std::begin(metrics_public_init.jpeg_activity)); + } + + return metrics_public_init; + }(); + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Returning = " << getRSMIStatusString(status_code) + << " |"; + LOG_TRACE(ss); + + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); + +} + AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v16_t::copy_internal_to_external_metrics() { std::ostringstream ss; diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 6d48476128..e42cae5313 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -52,6 +52,7 @@ #include "amd_smi/impl/amd_smi_utils.h" #include "amd_smi/impl/amd_smi_processor.h" #include "rocm_smi/rocm_smi_logger.h" +#include "rocm_smi/rocm_smi.h" // a global instance of std::mutex to protect data passed during threads std::mutex 
myMutex; @@ -80,7 +81,7 @@ static amdsmi_status_t get_gpu_device_from_handle(amdsmi_processor_handle proces if (r != AMDSMI_STATUS_SUCCESS) return r; if (device->get_processor_type() == AMDSMI_PROCESSOR_TYPE_AMD_GPU) { - *gpudevice = static_cast(processor_handle); + *gpudevice = static_cast(device); return AMDSMI_STATUS_SUCCESS; } @@ -665,8 +666,11 @@ amdsmi_status_t amdsmi_get_violation_status(amdsmi_processor_handle processor_ha amdsmi_gpu_metrics_t metric_info_a = {}; amdsmi_status_t status = amdsmi_get_gpu_metrics_info( - processor_handle, &metric_info_a); + processor_handle, &metric_info_a); if (status != AMDSMI_STATUS_SUCCESS) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_metrics_info failed with status = " << smi_amdgpu_get_status_string(status, false); + LOG_ERROR(ss); return status; } @@ -1053,6 +1057,43 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t +amdsmi_get_gpu_xgmi_link_status(amdsmi_processor_handle processor_handle, + amdsmi_xgmi_link_status_t *link_status) { + AMDSMI_CHECK_INIT(); + + if (link_status == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_gpu_metrics_t metric_info = {}; + amdsmi_status_t status = amdsmi_get_gpu_metrics_info( + processor_handle, &metric_info); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } + + uint32_t dev_num = 0; + auto r = rsmi_num_monitor_devices(&dev_num); + link_status->total_links = AMDSMI_MAX_NUM_XGMI_LINKS; + if (dev_num <= link_status->total_links) { + link_status->total_links = dev_num; + } + // get the status values from the metric info + for (unsigned int i = 0; i < link_status->total_links; i++) { + if (metric_info.xgmi_link_status[i] == std::numeric_limits::max()) { + link_status->status[i] = AMDSMI_XGMI_LINK_DISABLE; + } else if (metric_info.xgmi_link_status[i] == 0) { + link_status->status[i] = AMDSMI_XGMI_LINK_DOWN; + } else if (metric_info.xgmi_link_status[i] == 
1) { + link_status->status[i] = AMDSMI_XGMI_LINK_UP; + } else { + return AMDSMI_STATUS_UNEXPECTED_DATA; + } + } + return AMDSMI_STATUS_SUCCESS; +} + amdsmi_status_t amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle, amdsmi_kfd_info_t *info) { AMDSMI_CHECK_INIT(); @@ -1135,6 +1176,7 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( info->vram_size = 0; info->vram_vendor = AMDSMI_VRAM_VENDOR__PLACEHOLDER0; info->vram_bit_width = std::numeric_limitsvram_bit_width)>::max(); + info->vram_max_bandwidth = std::numeric_limitsvram_max_bandwidth)>::max(); // Only can read vram type from libdrm if (gpu_device->check_if_drm_is_supported()) { @@ -1148,6 +1190,13 @@ amdsmi_status_t amdsmi_get_gpu_vram_info( } } + // set info->vram_max_bandwidth to gpu_metrics vram_max_bandwidth if it is not set + amdsmi_gpu_metrics_t metric_info = {}; + r = amdsmi_get_gpu_metrics_info(processor_handle, &metric_info); + if (r == AMDSMI_STATUS_SUCCESS) { + info->vram_max_bandwidth = metric_info.vram_max_bandwidth; + } + // if vram type is greater than the max enum set it to unknown if (info->vram_type > AMDSMI_VRAM_TYPE__MAX) info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN; diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc index db07d15138..93f1041032 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_metrics_read.cc @@ -239,6 +239,12 @@ void TestGpuMetricsRead::Run(void) { amd::smi::make_ostream_joiner(&std::cout, ", ")); std::cout << std::dec << "]\n"; + std::cout << std::dec << "xgmi_link_status= ["; + std::copy(std::begin(smu.xgmi_link_status), + std::end(smu.xgmi_link_status), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + // Voltage (mV) std::cout << "voltage_soc = " << std::dec << smu.voltage_soc << "\n"; std::cout << "voltage_gfx = " << std::dec << smu.voltage_gfx << 
"\n"; @@ -254,6 +260,9 @@ void TestGpuMetricsRead::Run(void) { std::cout << "pcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; std::cout << "pcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; + // VRAM max bandwidth at max memory clock (GB/sec) + std::cout << "vram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; + // Counts std::cout << "pcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc << "\n"; @@ -329,6 +338,17 @@ void TestGpuMetricsRead::Run(void) { xcp++; } + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_acc), + std::end(row.gfx_below_host_limit_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + std::cout << "\n\n"; std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; constexpr uint16_t kMAX_ITER_TEST = 10; diff --git a/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc index 91649cf97b..4ede24beed 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc @@ -129,17 +129,19 @@ void TestIdInfoRead::Run(void) { CHK_ERR_ASRT(err) IF_VERB(STANDARD) { std::cout << "\t**Device Vram type id: " - << vram_info.vram_type << std::endl; + << vram_info.vram_type << std::endl; std::cout << "\t**Device Vram vendor id: " - << vram_info.vram_vendor << std::endl; + << vram_info.vram_vendor << std::endl; std::cout << "\t**Device Vram size: 0x" - << std::hex << vram_info.vram_size - << " (" << std::dec << vram_info.vram_size << ")" - << std::endl; + << std::hex << vram_info.vram_size + << " (" << std::dec << vram_info.vram_size << ")" + << std::endl; std::cout << "\t**Device Bit Width: 0x" - << std::hex << 
vram_info.vram_bit_width - << " (" << std::dec << vram_info.vram_bit_width << ")" - << std::endl; + << std::hex << vram_info.vram_bit_width + << " (" << std::dec << vram_info.vram_bit_width << ")" + << std::endl; + std::cout << "\t**Device Vram Max Bandwidth: " + << vram_info.vram_max_bandwidth << " GB/s" << std::endl; } err = amdsmi_get_gpu_vendor_name(processor_handles_[i], buffer, kBufferLen);