From ce19b921b01756e50e0143351b7941cfe672da53 Mon Sep 17 00:00:00 2001 From: "Poag, Charis" Date: Mon, 20 Oct 2025 14:43:40 -0500 Subject: [PATCH] [SWDEV-535159] Add support for GPU partition metrics (#490) [SWDEV-535159] Add support for GPU partition metrics Changes include: - Internal logic to smart-switch between gpu_metrics/xcp_metrics files - [WIP] Initial plumbing for new partition metric API Change-Id: I4340fb1b48bac0117d80d5d486b9e871430d5cd8 Signed-off-by: Charis Poag Add amdsmi_get_gpu_partition_metrics_info() + minor cleanup Change-Id: I5d60604f18baddbd03852dc90e88aa0b8107d50e Signed-off-by: Charis Poag Fix partition metric logic + update logging/tests Change-Id: I9e89b19ead17694c54e224f8e13ff8ee3eb2e22a Signed-off-by: Charis Poag Adjust amd-smi metric/monitor/default to show (some) partition information Change-Id: I2e8d2745876a19bdaec3c039daa97345c9f701b5 Signed-off-by: Charis Poag Add C++ tests Change-Id: Ib9eb0b57a6d7a280992e05a4c6eba632826952ef Signed-off-by: Charis Poag Remove modification of energy counter, not needed Change-Id: I5c48eaaae248ee6dc79abba609d837ec35d78022 Signed-off-by: Charis Poag [CLI] amd-smi metric: cleaned up N/A'd multi-valued to show just N/A Changes: 1. amd-smi metric: cleaned up N/A'd multi-valued to show just N/A ex. JPEG_ACTIVITY: [N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A, N/A] Now just shows: N/A 2. [Python Unit Test] Changed testname TestAmdSmiPythonBDF(unittest.TestCase) -> AmdSmiPythonUnitTest Test name was confusing. 
Change-Id: Ieb3b036f30002fd22362508eb9fc5d443df395ae Signed-off-by: Charis Poag Log cleanup Change-Id: I1b1a95f1844d35bec7a7bd8cb996f87e4914c069 Signed-off-by: Charis Poag Add amd-smi partition-metrics CLI + general cleanup Change-Id: Ia91488e6cb3a4d62b4087afbddfe0b3bb9378fdc Signed-off-by: Charis Poag [1.3 metrics] Remove forwards compatibility for partition metrics Change-Id: Iab928983e6f6f1587bc9307f6f3fa2b2696ca6f7 Signed-off-by: Charis Poag Fixed violation output not showing % + general cleanup Change-Id: Icac1b0a55b18c7628b07109ae0c377d17e0825f1 Signed-off-by: Charis Poag Clean up amdsmi_get_gpu_partition_metrics_info & amd-smi partition-metric outputs Change-Id: I6427028b980874641e9ffb3b5d88ad493dbf9cf4 Signed-off-by: Charis Poag * Fix metrics not found + extra logging/formatting Change-Id: I841a27bb2c305e97ec7579a13ac915e5be497c3a Signed-off-by: Charis Poag * Update license to current default Change-Id: I0de9b8a2d5dbbeab4491097f0354ba17b0d30866 Signed-off-by: Charis Poag * Cleanup for review Change-Id: I96ed25c3f2b8968eea1af24c5e5860c2b4e74e6e Signed-off-by: Charis Poag * Modernize updated/new internal APIs. Change-Id: I3c48a250eeb703709b14cb5ffa68268d8321626c Signed-off-by: Charis Poag * Remove extra logging in dynamic metrics Change-Id: Idb97547bcbe143d6fa1cb5cb278ffe4da615ce14 Signed-off-by: Charis Poag * Remove amd-smi partition-metric command Change-Id: Ib83c17e5cd7e0da3798198943bddd46c296b411c Signed-off-by: Charis Poag * Move new CLI updates to another PR + minor fixes Change-Id: I3b1163eec12f9b5f7d95ee33de08e168cec1b1fe Signed-off-by: Charis Poag * Allow dynamic metrics to work for gpu/xcp metrics 1.9+/1.1+ Updated some logging as well. 
Change-Id: I2ed9f5a5ef8afb1520508820ca6153525f0644b4 Signed-off-by: Charis Poag * Allow dyn gpu/xcp metric v1.9+/v1.1+ Added tests for quick check Change-Id: I576d6f6582a55afb08e5ac57791ce95e2fa184a2 Signed-off-by: Charis Poag * Update tests for larger subset of version checks Change-Id: I3cdf4f8bb4fc6161f4c76566939f90545d0f362a Signed-off-by: Charis Poag * Fix XCP metrics in gpu/partition metric pre-v1.9/v1.1 (dynamic) Change-Id: I4dabc1ed6bef6b86c8e7f92bf9cb5992f3966fe2 Signed-off-by: Charis Poag --------- Signed-off-by: Charis Poag [ROCm/amdsmi commit: 01b4fe66142b659603ba64feef5a15ca53e24857] --- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 50 +- projects/amdsmi/amdsmi_cli/amdsmi_helpers.py | 71 +- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 1 - projects/amdsmi/include/amd_smi/amdsmi.h | 24 + projects/amdsmi/py-interface/__init__.py | 1 + .../amdsmi/py-interface/amdsmi_interface.py | 159 ++ .../amdsmi/py-interface/amdsmi_wrapper.py | 34 +- .../rocm_smi/include/rocm_smi/rocm_smi.h | 23 + .../include/rocm_smi/rocm_smi_device.h | 32 +- .../rocm_smi/rocm_smi_dyn_gpu_metrics.h | 48 +- .../include/rocm_smi/rocm_smi_gpu_metrics.h | 70 +- .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 26 +- .../rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc | 59 +- .../rocm_smi/src/rocm_smi_gpu_metrics.cc | 1711 ++++++++++------- projects/amdsmi/src/amd_smi/amd_smi.cc | 16 +- .../amdsmi/tests/amd_smi_test/CMakeLists.txt | 5 + .../functional/dynamic_metrics_test.cc | 203 ++ .../functional/gpu_partition_metrics_read.cc | 426 ++++ .../functional/gpu_partition_metrics_read.h | 51 + projects/amdsmi/tests/amd_smi_test/main.cc | 5 + .../amdsmi/tests/amd_smi_test/test_base.cc | 18 +- .../tests/python_unittest/unit_tests.py | 17 +- 22 files changed, 2235 insertions(+), 815 deletions(-) create mode 100644 projects/amdsmi/tests/amd_smi_test/functional/dynamic_metrics_test.cc create mode 100644 projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc create mode 100644 
projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.h diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index bf070148e1..1e16e05d55 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -1632,6 +1632,7 @@ class AMDSMICommands(): # Add timestamp and store values for specified arguments values_dict = {} + is_partition_metrics = False # True if we get the metrics from xcp_metrics file (amdsmi_get_gpu_partition_metrics_info) #get metric info only once per gpu, this will speed up data output try: # Get GPU Metrics table @@ -1640,19 +1641,10 @@ class AMDSMICommands(): logging.debug("#3 - Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) gpu_metric = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() - # Workaround for XCP (partition) metrics not providing num_partition in v1.0 - # Confirmed with driver team that we can default to 1 if num_partition is not defined. - # Pending partitions exist, ie. partition_id > 0. See logic below. 
- try: - partition_id = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)['current_partition_id'] - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get current partition id for gpu %s | %s", gpu_id, e.get_error_info()) - partition_id = "N/A" - - num_partition = gpu_metric['num_partition'] - if num_partition == "N/A": - num_partition = 1 # Workaround for XCP metrics not providing num_partition in v1.0 - logging.debug(f"num_partition is N/A and partition_id: {partition_id} (greater > 0).\nModified num_partition: {num_partition} to adjust for XCP metrics.") + # Workaround for XCP (partition) metrics not providing num_partition in v1.9+/v1.1+ + # Provides original formatting for earlier metric versions + partition_metric_info = self.helpers._get_metric_version_and_partition_info(gpu_metric, is_partition_metrics, gpu_id, args.gpu) + num_partition = partition_metric_info['num_partition'] if self.logger.is_json_format(): values_dict['gpu'] = int(gpu_id) @@ -2679,7 +2671,7 @@ class AMDSMICommands(): value[k][index] = self.helpers.unit_format(self.logger, activity, activity_unit) value[k] = '[' + ", ".join(value[k]) + ']' elif value != "N/A": - value = self.helpers.unit_format(self.logger, value, activity_unit) + throttle_status[key] = self.helpers.unit_format(self.logger, value, activity_unit) if self.logger.is_json_format(): if isinstance(value, (list, dict)): for k, v in value.items(): @@ -3090,7 +3082,6 @@ class AMDSMICommands(): if not self.logger.is_json_format(): self.logger.print_output(multiple_device_enabled=multiple_devices_csv_override) - def metric(self, args, multiple_devices=False, watching_output=False, gpu=None, usage=None, watch=None, watch_time=None, iterations=None, power=None, clock=None, temperature=None, ecc=None, ecc_blocks=None, pcie=None, @@ -5710,6 +5701,7 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: logging.debug("#5 - Unable to load GPU Metrics table for %s | %s", gpu_id, 
e.get_error_info()) + is_partition_metrics = False # True if we get the metrics from xcp_metrics file (amdsmi_get_gpu_partition_metrics_info) #get metric info only once per gpu, this will speed up data output try: # Get GPU Metrics table @@ -5721,25 +5713,15 @@ class AMDSMICommands(): gpu_metrics_info = amdsmi_interface._NA_amdsmi_get_gpu_metrics_info() logging.debug("Unable to load GPU Metrics table for %s | %s", gpu_id, e.get_error_info()) - # Workaround for XCP (partition) metrics not providing num_partition in v1.0 - # Confirmed with driver team that we can default to 1 if num_partition is not defined. - # Pending partitions exist, ie. partition_id > 0. See logic below. - try: - partition_id = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu)['current_partition_id'] - except amdsmi_exception.AmdSmiLibraryException as e: - logging.debug("Failed to get current partition id for gpu %s | %s", gpu_id, e.get_error_info()) - partition_id = "N/A" + # Workaround for XCP (partition) metrics not providing num_partition in v1.9+/v1.1+ + # Provides original formatting for earlier metric versions + partition_metric_info = self.helpers._get_metric_version_and_partition_info(gpu_metrics_info, is_partition_metrics, gpu_id, args.gpu) + partition_id = partition_metric_info['partition_id'] + num_partition = partition_metric_info['num_partition'] - num_partition = gpu_metrics_info['num_partition'] - if num_partition == "N/A": - num_partition = partition_id - - num_xcp = num_partition # used later for XCP metrics + # Update logger for XCP display (only if applicable) self.logger.table_header += 'XCP'.rjust(5, ' ') - self.logger.store_output(args.gpu, 'xcp', partition_id) # Starting with partition_id. - # Outputs which have xcp details - # will update this value via num_xcp. - # This value will help map to primary device. 
+ self.logger.store_output(args.gpu, 'xcp', partition_id) # Store partition_id initially; can be updated via num_xcp # Store the pcie_bw values due to possible increase in bandwidth due to repeated gpu_metrics calls if args.pcie: @@ -5979,7 +5961,7 @@ class AMDSMICommands(): "unit" : freq_unit} except (KeyError, amdsmi_exception.AmdSmiLibraryException) as e: monitor_values['dclock'] = "N/A" - logging.debug("Failed to get vclock on gpu %s | %s", gpu_id, e) + logging.debug("Failed to get dclock on gpu %s | %s", gpu_id, e) self.logger.table_header += 'DCLOCK'.rjust(10) @@ -6322,7 +6304,7 @@ class AMDSMICommands(): self.logger.store_multiple_device_output() current_xcp += 1 else: - self.logger.store_output(args.gpu, 'xcp', num_xcp) + self.logger.store_output(args.gpu, 'xcp', partition_id) self.logger.store_output(args.gpu, 'values', monitor_values) # Store typical output for all commands (XCP data will be handled separately, eg. violation status) diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py index a1874b78a0..f08cf0fd8c 100755 --- a/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_helpers.py @@ -1018,7 +1018,6 @@ class AMDSMIHelpers(): """This function will format output with unit based on the logger output format params: - args - argparser args to pass to subcommand logger (AMDSMILogger) - Logger to print out output value - the value to be formatted unit - the unit to be formatted with the value @@ -1041,6 +1040,9 @@ class AMDSMIHelpers(): return {"value": value, "unit": unit} else: return value + if logger.is_csv_format(): + # For CSV, return the raw value (number or "N/A"), not a string + return value if logger.is_human_readable_format(): if unit: return f"{value} {unit}".rstrip() @@ -1745,3 +1747,70 @@ class AMDSMIHelpers(): # Flatten nested lists and filter integers flat = [v for value in data for v in (value if isinstance(value, list) else [value]) if isinstance(v, 
int)] return round(sum(flat) / len(flat)) if flat else "N/A" + + def _get_metric_version_and_partition_info(self, gpu_metrics_info, is_partition_metrics, gpu_id, gpu_handle): + """ + Helper method to compute metric version, partition ID, and num_partition for dynamic metrics. + Handles logging updates internally for reusability. + + Args: + gpu_metrics_info (dict): GPU metrics info from amdsmi_get_gpu_metrics_info. + is_partition_metrics (bool): Whether this is for partition metrics. + gpu_id (int): GPU ID for logging. + gpu_handle: GPU device handle for KFD info retrieval. + + Returns: + dict: { + 'metric_version': float or "N/A", + 'partition_id': int or "N/A", + 'num_partition': int or "N/A", + 'num_xcp': int or "N/A" # Alias for num_partition + } + """ + # Compute metric version from header revisions + metric_version = "N/A" + format_rev = gpu_metrics_info.get('common_header.format_revision', "N/A") + content_rev = gpu_metrics_info.get('common_header.content_revision', "N/A") + if format_rev != "N/A" and content_rev != "N/A": + try: + metric_version = float(f"{format_rev}.{content_rev}") + except ValueError: + metric_version = "N/A" # Fallback if conversion fails + + # Retrieve partition ID from KFD info + partition_id = "N/A" + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(gpu_handle) + partition_id = kfd_info.get('current_partition_id', "N/A") + except amdsmi_exception.AmdSmiLibraryException as e: + logging.debug("Failed to get current partition ID for GPU %s | %s", gpu_id, e.get_error_info()) + + # Determine num_partition with fallback logic for dynamic metrics + num_partition = gpu_metrics_info.get('num_partition', "N/A") + if metric_version != "N/A" and num_partition == "N/A": + # Workaround: Default to 1 for newer metric versions if num_partition is missing + # (Confirmed with driver team; applies to GPU and partition metrics) + if not is_partition_metrics and metric_version >= 1.9: + num_partition = 1 + elif is_partition_metrics and 
metric_version >= 1.1: + num_partition = 1 + elif partition_id != "N/A" and partition_id > 0: + # Fallback to partition_id if partitions exist but num_partition is unavailable + num_partition = partition_id + # Else: Remains "N/A" if no conditions match + + # Alias num_xcp for XCP metrics usage + num_xcp = num_partition + + # Debug logging + logging.debug( + "GPU %s | Metric version: %s, num_partition: %s, partition_id: %s, num_xcp: %s", + gpu_id, metric_version, num_partition, partition_id, num_xcp + ) + + return { + 'metric_version': metric_version, + 'partition_id': partition_id, + 'num_partition': num_partition, + 'num_xcp': num_xcp + } diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index da446fcdf5..85a9533aab 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -918,7 +918,6 @@ class AMDSMIParser(argparse.ArgumentParser): self._add_device_arguments(bad_pages_parser, required=False) self._add_command_modifiers(bad_pages_parser) - def _add_metric_parser(self, subparsers: argparse._SubParsersAction, func): # Subparser help text metric_help = "Gets metric/performance information about the specified GPU" diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 7f636fd990..8a7214ff9f 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -4055,6 +4055,30 @@ amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle processor_handle, amd amdsmi_status_t amdsmi_get_gpu_metrics_info(amdsmi_processor_handle processor_handle, amdsmi_gpu_metrics_t *pgpu_metrics); +/** + * @brief This function retrieves the partition metrics information. 
+ * + * @ingroup tagClkPowerPerfQuery + * + * @platform{gpu_bm_linux} @platform{guest_1vf} + * + * @details Given a processor handle @p processor_handle and a pointer to a + * ::amdsmi_gpu_metrics_t structure @p pgpu_metrics, this function will populate + * @p pgpu_metrics. See ::amdsmi_gpu_metrics_t for more details. + * + * @param[in] processor_handle a processor handle + * + * @param[in,out] pgpu_metrics a pointer to an ::amdsmi_gpu_metrics_t structure + * If this parameter is nullptr, this function will return + * ::AMDSMI_STATUS_INVAL if the function is supported with the provided, + * arguments and ::AMDSMI_STATUS_NOT_SUPPORTED if it is not supported with the + * provided arguments. + * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t amdsmi_get_gpu_partition_metrics_info(amdsmi_processor_handle processor_handle, + amdsmi_gpu_metrics_t *pgpu_metrics); + /** * @brief Get the pm metrics table with provided device index. * diff --git a/projects/amdsmi/py-interface/__init__.py b/projects/amdsmi/py-interface/__init__.py index f9d2bf84c7..fbffcb11ff 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ b/projects/amdsmi/py-interface/__init__.py @@ -184,6 +184,7 @@ from .amdsmi_interface import amdsmi_get_gpu_mem_overdrive_level from .amdsmi_interface import amdsmi_get_clk_freq from .amdsmi_interface import amdsmi_get_gpu_od_volt_info from .amdsmi_interface import amdsmi_get_gpu_metrics_info +from .amdsmi_interface import amdsmi_get_gpu_partition_metrics_info from .amdsmi_interface import amdsmi_get_gpu_od_volt_curve_regions from .amdsmi_interface import amdsmi_is_gpu_power_management_enabled diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 996d69379c..08dd540da2 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -4932,6 +4932,165 @@ def amdsmi_get_gpu_metrics_info( 
gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc'][xcp_index] = xcp_detail return gpu_metrics_output +def amdsmi_get_gpu_partition_metrics_info( + processor_handle: processor_handle_t, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + gpu_metrics = amdsmi_wrapper.amdsmi_gpu_metrics_t() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_partition_metrics_info( + processor_handle, ctypes.byref(gpu_metrics) + ) + ) + + gpu_metrics_output = { + "common_header.structure_size": _validate_if_max_uint(gpu_metrics.common_header.structure_size, MaxUIntegerTypes.UINT16_T), + "common_header.format_revision": _validate_if_max_uint(gpu_metrics.common_header.format_revision, MaxUIntegerTypes.UINT8_T), + "common_header.content_revision": _validate_if_max_uint(gpu_metrics.common_header.content_revision, MaxUIntegerTypes.UINT8_T), + "temperature_edge": _validate_if_max_uint(gpu_metrics.temperature_edge, MaxUIntegerTypes.UINT16_T), + "temperature_hotspot": _validate_if_max_uint(gpu_metrics.temperature_hotspot, MaxUIntegerTypes.UINT16_T), + "temperature_mem": _validate_if_max_uint(gpu_metrics.temperature_mem, MaxUIntegerTypes.UINT16_T), + "temperature_vrgfx": _validate_if_max_uint(gpu_metrics.temperature_vrgfx, MaxUIntegerTypes.UINT16_T), + "temperature_vrsoc": _validate_if_max_uint(gpu_metrics.temperature_vrsoc, MaxUIntegerTypes.UINT16_T), + "temperature_vrmem": _validate_if_max_uint(gpu_metrics.temperature_vrmem, MaxUIntegerTypes.UINT16_T), + "average_gfx_activity": _validate_if_max_uint(gpu_metrics.average_gfx_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), + "average_umc_activity": _validate_if_max_uint(gpu_metrics.average_umc_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), + "average_mm_activity": _validate_if_max_uint(gpu_metrics.average_mm_activity, MaxUIntegerTypes.UINT16_T, isActivity=True), + 
"average_socket_power": _validate_if_max_uint(gpu_metrics.average_socket_power, MaxUIntegerTypes.UINT16_T), + "energy_accumulator": _validate_if_max_uint(gpu_metrics.energy_accumulator, MaxUIntegerTypes.UINT64_T), + "system_clock_counter": _validate_if_max_uint(gpu_metrics.system_clock_counter, MaxUIntegerTypes.UINT64_T), + "average_gfxclk_frequency": _validate_if_max_uint(gpu_metrics.average_gfxclk_frequency, MaxUIntegerTypes.UINT16_T), + "average_socclk_frequency": _validate_if_max_uint(gpu_metrics.average_socclk_frequency, MaxUIntegerTypes.UINT16_T), + "average_uclk_frequency": _validate_if_max_uint(gpu_metrics.average_uclk_frequency, MaxUIntegerTypes.UINT16_T), + "average_vclk0_frequency": _validate_if_max_uint(gpu_metrics.average_vclk0_frequency, MaxUIntegerTypes.UINT16_T), + "average_dclk0_frequency": _validate_if_max_uint(gpu_metrics.average_dclk0_frequency, MaxUIntegerTypes.UINT16_T), + "average_vclk1_frequency": _validate_if_max_uint(gpu_metrics.average_vclk1_frequency, MaxUIntegerTypes.UINT16_T), + "average_dclk1_frequency": _validate_if_max_uint(gpu_metrics.average_dclk1_frequency, MaxUIntegerTypes.UINT16_T), + "current_gfxclk": _validate_if_max_uint(gpu_metrics.current_gfxclk, MaxUIntegerTypes.UINT16_T), + "current_socclk": _validate_if_max_uint(gpu_metrics.current_socclk, MaxUIntegerTypes.UINT16_T), + "current_uclk": _validate_if_max_uint(gpu_metrics.current_uclk, MaxUIntegerTypes.UINT16_T), + "current_vclk0": _validate_if_max_uint(gpu_metrics.current_vclk0, MaxUIntegerTypes.UINT16_T), + "current_dclk0": _validate_if_max_uint(gpu_metrics.current_dclk0, MaxUIntegerTypes.UINT16_T), + "current_vclk1": _validate_if_max_uint(gpu_metrics.current_vclk1, MaxUIntegerTypes.UINT16_T), + "current_dclk1": _validate_if_max_uint(gpu_metrics.current_dclk1, MaxUIntegerTypes.UINT16_T), + "throttle_status": _validate_if_max_uint(gpu_metrics.throttle_status, MaxUIntegerTypes.UINT32_T, isBool=True), + "current_fan_speed": 
_validate_if_max_uint(gpu_metrics.current_fan_speed, MaxUIntegerTypes.UINT16_T), + "pcie_link_width": _validate_if_max_uint(gpu_metrics.pcie_link_width, MaxUIntegerTypes.UINT16_T), + "pcie_link_speed": _validate_if_max_uint(gpu_metrics.pcie_link_speed, MaxUIntegerTypes.UINT16_T), + "gfx_activity_acc": _validate_if_max_uint(gpu_metrics.gfx_activity_acc, MaxUIntegerTypes.UINT32_T), + "mem_activity_acc": _validate_if_max_uint(gpu_metrics.mem_activity_acc, MaxUIntegerTypes.UINT32_T), + "temperature_hbm": _validate_if_max_uint(list(gpu_metrics.temperature_hbm), MaxUIntegerTypes.UINT16_T), + "firmware_timestamp": _validate_if_max_uint(gpu_metrics.firmware_timestamp, MaxUIntegerTypes.UINT64_T), + "voltage_soc": _validate_if_max_uint(gpu_metrics.voltage_soc, MaxUIntegerTypes.UINT16_T), + "voltage_gfx": _validate_if_max_uint(gpu_metrics.voltage_gfx, MaxUIntegerTypes.UINT16_T), + "voltage_mem": _validate_if_max_uint(gpu_metrics.voltage_mem, MaxUIntegerTypes.UINT16_T), + "indep_throttle_status": _validate_if_max_uint(gpu_metrics.indep_throttle_status, MaxUIntegerTypes.UINT64_T, isBool=True), + "current_socket_power": _validate_if_max_uint(gpu_metrics.current_socket_power, MaxUIntegerTypes.UINT16_T), + "vcn_activity": _validate_if_max_uint(list(gpu_metrics.vcn_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), + "gfxclk_lock_status": _validate_if_max_uint(gpu_metrics.gfxclk_lock_status, MaxUIntegerTypes.UINT32_T), + "xgmi_link_width": _validate_if_max_uint(gpu_metrics.xgmi_link_width, MaxUIntegerTypes.UINT16_T), + "xgmi_link_speed": _validate_if_max_uint(gpu_metrics.xgmi_link_speed, MaxUIntegerTypes.UINT16_T), + "pcie_bandwidth_acc": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_acc, MaxUIntegerTypes.UINT64_T), + "pcie_bandwidth_inst": _validate_if_max_uint(gpu_metrics.pcie_bandwidth_inst, MaxUIntegerTypes.UINT64_T), + "pcie_l0_to_recov_count_acc": _validate_if_max_uint(gpu_metrics.pcie_l0_to_recov_count_acc, MaxUIntegerTypes.UINT64_T), + "pcie_replay_count_acc": 
_validate_if_max_uint(gpu_metrics.pcie_replay_count_acc, MaxUIntegerTypes.UINT64_T), + "pcie_replay_rover_count_acc": _validate_if_max_uint(gpu_metrics.pcie_replay_rover_count_acc, MaxUIntegerTypes.UINT64_T), + "xgmi_read_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_read_data_acc), MaxUIntegerTypes.UINT64_T), + "xgmi_write_data_acc": _validate_if_max_uint(list(gpu_metrics.xgmi_write_data_acc), MaxUIntegerTypes.UINT64_T), + "current_gfxclks": _validate_if_max_uint(list(gpu_metrics.current_gfxclks), MaxUIntegerTypes.UINT16_T), + "current_socclks": _validate_if_max_uint(list(gpu_metrics.current_socclks), MaxUIntegerTypes.UINT16_T), + "current_vclk0s": _validate_if_max_uint(list(gpu_metrics.current_vclk0s), MaxUIntegerTypes.UINT16_T), + "current_dclk0s": _validate_if_max_uint(list(gpu_metrics.current_dclk0s), MaxUIntegerTypes.UINT16_T), + "jpeg_activity": _validate_if_max_uint(list(gpu_metrics.jpeg_activity), MaxUIntegerTypes.UINT16_T, isActivity=True), + "pcie_nak_sent_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_sent_count_acc, MaxUIntegerTypes.UINT32_T), + "pcie_nak_rcvd_count_acc": _validate_if_max_uint(gpu_metrics.pcie_nak_rcvd_count_acc, MaxUIntegerTypes.UINT32_T), + "accumulation_counter": _validate_if_max_uint(gpu_metrics.accumulation_counter, MaxUIntegerTypes.UINT64_T), + "prochot_residency_acc": _validate_if_max_uint(gpu_metrics.prochot_residency_acc, MaxUIntegerTypes.UINT64_T), + "ppt_residency_acc": _validate_if_max_uint(gpu_metrics.ppt_residency_acc, MaxUIntegerTypes.UINT64_T), + "socket_thm_residency_acc": _validate_if_max_uint(gpu_metrics.socket_thm_residency_acc, MaxUIntegerTypes.UINT64_T), + "vr_thm_residency_acc": _validate_if_max_uint(gpu_metrics.vr_thm_residency_acc, MaxUIntegerTypes.UINT64_T), + "hbm_thm_residency_acc": _validate_if_max_uint(gpu_metrics.hbm_thm_residency_acc, MaxUIntegerTypes.UINT64_T), + "num_partition": _validate_if_max_uint(gpu_metrics.num_partition, MaxUIntegerTypes.UINT16_T), + "xcp_stats.gfx_busy_inst": 
list(gpu_metrics.xcp_stats), + "xcp_stats.jpeg_busy": list(gpu_metrics.xcp_stats), + "xcp_stats.vcn_busy": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_busy_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_below_host_limit_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_below_host_limit_ppt_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_below_host_limit_thm_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_low_utilization_acc": list(gpu_metrics.xcp_stats), + "xcp_stats.gfx_below_host_limit_total_acc": list(gpu_metrics.xcp_stats), + "pcie_lc_perf_other_end_recovery": _validate_if_max_uint(gpu_metrics.pcie_lc_perf_other_end_recovery, MaxUIntegerTypes.UINT32_T), + "vram_max_bandwidth": _validate_if_max_uint(gpu_metrics.vram_max_bandwidth, MaxUIntegerTypes.UINT64_T), + "xgmi_link_status": _validate_if_max_uint(list(gpu_metrics.xgmi_link_status), MaxUIntegerTypes.UINT16_T), + } + + # Create 2d array with each XCD's stats + if 'xcp_stats.gfx_busy_inst' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_inst']): + xcp_detail = [] + for val in xcp_metrics.gfx_busy_inst: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT32_T, isActivity=True)) + gpu_metrics_output['xcp_stats.gfx_busy_inst'][xcp_index] = xcp_detail + + if 'xcp_stats.jpeg_busy' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.jpeg_busy']): + xcp_detail = [] + for val in xcp_metrics.jpeg_busy: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) + gpu_metrics_output['xcp_stats.jpeg_busy'][xcp_index] = xcp_detail + + if 'xcp_stats.vcn_busy' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.vcn_busy']): + xcp_detail = [] + for val in xcp_metrics.vcn_busy: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT16_T, isActivity=True)) + gpu_metrics_output["xcp_stats.vcn_busy"][xcp_index] = 
xcp_detail + + if 'xcp_stats.gfx_busy_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_busy_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_busy_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output["xcp_stats.gfx_busy_acc"][xcp_index] = xcp_detail + + if 'xcp_stats.gfx_below_host_limit_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_below_host_limit_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output['xcp_stats.gfx_below_host_limit_acc'][xcp_index] = xcp_detail + # new for gpu metrics v1.8 + if 'xcp_stats.gfx_below_host_limit_ppt_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_below_host_limit_ppt_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output['xcp_stats.gfx_below_host_limit_ppt_acc'][xcp_index] = xcp_detail + if 'xcp_stats.gfx_below_host_limit_thm_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_below_host_limit_thm_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output['xcp_stats.gfx_below_host_limit_thm_acc'][xcp_index] = xcp_detail + if 'xcp_stats.gfx_low_utilization_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_low_utilization_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_low_utilization_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output['xcp_stats.gfx_low_utilization_acc'][xcp_index] = xcp_detail + if 
'xcp_stats.gfx_below_host_limit_total_acc' in gpu_metrics_output: + for xcp_index, xcp_metrics in enumerate(gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc']): + xcp_detail = [] + for val in xcp_metrics.gfx_below_host_limit_total_acc: + xcp_detail.append(_validate_if_max_uint(val, MaxUIntegerTypes.UINT64_T)) + gpu_metrics_output['xcp_stats.gfx_below_host_limit_total_acc'][xcp_index] = xcp_detail + return gpu_metrics_output + def amdsmi_get_gpu_od_volt_curve_regions( processor_handle: processor_handle_t, num_regions: int diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index f5f05dfeb3..7c4ae3817b 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -964,6 +964,21 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('max_pcie_interface_version', ctypes.c_uint32), + ('PADDING_1', ctypes.c_ubyte * 4), + ('reserved', ctypes.c_uint64 * 9), +] + class struct_pcie_metric_(Structure): pass @@ -984,21 +999,6 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 12), ] -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('max_pcie_interface_version', ctypes.c_uint32), - ('PADDING_1', ctypes.c_ubyte * 4), - ('reserved', ctypes.c_uint64 * 9), -] - 
struct_amdsmi_pcie_info_t._pack_ = 1 # source:False struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -2630,6 +2630,9 @@ amdsmi_get_gpu_metrics_header_info.argtypes = [amdsmi_processor_handle, ctypes.P amdsmi_get_gpu_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_metrics_info amdsmi_get_gpu_metrics_info.restype = amdsmi_status_t amdsmi_get_gpu_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_gpu_metrics_t)] +amdsmi_get_gpu_partition_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_partition_metrics_info +amdsmi_get_gpu_partition_metrics_info.restype = amdsmi_status_t +amdsmi_get_gpu_partition_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_gpu_metrics_t)] amdsmi_get_gpu_pm_metrics_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_pm_metrics_info amdsmi_get_gpu_pm_metrics_info.restype = amdsmi_status_t amdsmi_get_gpu_pm_metrics_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.POINTER(struct_amdsmi_name_value_t)), ctypes.POINTER(ctypes.c_uint32)] @@ -3418,6 +3421,7 @@ __all__ = \ 'amdsmi_get_gpu_metrics_info', 'amdsmi_get_gpu_od_volt_curve_regions', 'amdsmi_get_gpu_od_volt_info', 'amdsmi_get_gpu_overdrive_level', + 'amdsmi_get_gpu_partition_metrics_info', 'amdsmi_get_gpu_pci_bandwidth', 'amdsmi_get_gpu_pci_replay_counter', 'amdsmi_get_gpu_pci_throughput', 'amdsmi_get_gpu_perf_level', diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 862c97ee40..3e8a5c7b67 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -3264,6 +3264,29 @@ rsmi_status_t rsmi_dev_gpu_reset(uint32_t dv_ind); rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv); +/** + * @brief This function retrieves the gpu partition metrics information + * + * @details Given a device index @p dv_ind and a 
pointer to a + * ::rsmi_gpu_metrics_t structure @p pgpu_metrics, this function will populate + * @p pgpu_metrics. See ::rsmi_gpu_metrics_t for more details. + * + * @param[in] dv_ind a device index + * + * @param[inout] pgpu_metrics a pointer to an ::rsmi_gpu_metrics_t structure + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided, + * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the + * provided arguments. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + */ +rsmi_status_t rsmi_dev_gpu_partition_metrics_info_get(uint32_t dv_ind, + rsmi_gpu_metrics_t *pgpu_metrics); + /** * @brief This function retrieves the gpu metrics information * diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h index 73f6140836..1abc04dce2 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -156,6 +156,7 @@ enum DevInfoTypes { kDevMemPageBad, kDevNumaNode, kDevGpuMetrics, + kdevGpuPartitionMetrics, kDevPmMetrics, kDevRegMetrics, kDevBaseBoardTempMetrics, @@ -215,7 +216,7 @@ class Device { int readDevInfo(DevInfoTypes type, std::vector *retVec); int readDevInfo(DevInfoTypes type, std::size_t b_size, void *p_binary_data); - std::string get_sys_file_path_by_type(DevInfoTypes type) const; + std::string get_sys_file_path_by_type(DevInfoTypes type, bool getPathOnly = false) const; // Get the property from a file which may contain multiple properties. 
int readDevInfo(DevInfoTypes type, const std::string& property, std::string& value); @@ -254,19 +255,31 @@ class Device { template std::string readBootPartitionState(uint32_t dv_ind); rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); - void dev_set_gpu_metric(GpuMetricsBasePtr gpu_metrics_ptr) { m_gpu_metrics_ptr = std::move(gpu_metrics_ptr); }; - GpuMetricsBasePtr& dev_get_gpu_metric() { return m_gpu_metrics_ptr; }; const AMDGpuMetricsHeader_v1_t& dev_get_metrics_header() {return m_gpu_metrics_header; } - rsmi_status_t setup_gpu_metrics_reading(); - rsmi_status_t dev_read_gpu_metrics_header_data(); - rsmi_status_t dev_read_gpu_metrics_all_data(); - rsmi_status_t run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values); - rsmi_status_t dev_log_gpu_metrics(std::ostringstream& outstream_metrics); - AMGpuMetricsPublicLatestTupl_t dev_copy_internal_to_external_metrics(); + auto setup_gpu_metrics_reading(DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) + -> rsmi_status_t; + auto dev_read_gpu_metrics_header_data(DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) + -> rsmi_status_t; + auto dev_read_gpu_metrics_all_data(DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) + -> rsmi_status_t; + auto run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, + AMDGpuDynamicMetricTblValues_t &values, + DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) + -> rsmi_status_t; + auto dev_log_gpu_metrics(std::ostringstream &outstream_metrics, + DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) -> rsmi_status_t; + auto dev_copy_internal_to_external_metrics(DevInfoTypes type = DevInfoTypes::kDevGpuMetrics) + -> AMGpuMetricsPublicLatestTupl_t; static const std::map devInfoTypesStrings; void set_smi_device_id(uint32_t device_id) { m_device_id = device_id; } void set_smi_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } + auto 
set_smi_dev_info_type(DevInfoTypes type) -> void { m_dev_info_type = type; } + auto get_smi_device_id(void) const -> uint32_t { return m_device_id; } + auto get_smi_partition_id(void) const -> uint32_t { return m_partition_id; } + auto is_smi_expecting_partition_metrics(void) const -> bool { + return m_dev_info_type == DevInfoTypes::kdevGpuPartitionMetrics; + } static const char* get_type_string(DevInfoTypes type); rsmi_status_t get_smi_device_identifiers(uint32_t device_id, rsmi_device_identifiers_t *device_identifiers); @@ -310,6 +323,7 @@ class Device { uint64_t m_gpu_metrics_updated_timestamp; uint32_t m_device_id; uint32_t m_partition_id; + DevInfoTypes m_dev_info_type{DevInfoTypes::kDevGpuMetrics}; // New dynamic GPU metrics support bool m_is_dynamic_gpu_metrics_supported = false; diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h index ec923a0cc2..b2e6d8df3e 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_dyn_gpu_metrics.h @@ -1,49 +1,24 @@ /* - * MIT License - * * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * - * Developed by: - * - * AMD ML Software Engineering - * - * Advanced Micro Devices, Inc. - * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. 
- * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of Advanced Micro Devices, Inc, - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
*/ - - #ifndef ROCM_SMI_ROCM_SMI_DYN_GPU_METRICS_H_ #define ROCM_SMI_ROCM_SMI_DYN_GPU_METRICS_H_ @@ -299,6 +274,7 @@ enum class AMDGpuMetricUnitType_t QUANTITY, STATUS_FLAG }; + using AMDGpuMetricUnitTypeTranslationTable_t = std::unordered_map; static const auto AMDGpuMetricUnitTypeToString = AMDGpuMetricUnitTypeTranslationTable_t { diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h index bea50610a4..1af827c4c5 100644 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_gpu_metrics.h @@ -26,6 +26,7 @@ #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_dyn_gpu_metrics.h" +#include "rocm_smi/rocm_smi_logger.h" #include #include @@ -689,6 +690,33 @@ struct AMDGpuMetrics_v17_t { uint32_t m_pcie_lc_perf_other_end_recovery; }; +struct AMDGpuMetrics_v18_Partition_v1_0_t { + ~AMDGpuMetrics_v18_Partition_v1_0_t() = default; + struct AMDGpuMetricsHeader_v1_t m_common_header; + + /* Current clocks (Mhz) */ + uint16_t m_current_gfxclk[kRSMI_MAX_NUM_XCC]; + uint16_t m_current_socclk[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_vclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_dclk0[kRSMI_MAX_NUM_CLKS]; + uint16_t m_current_uclk; + uint16_t m_padding; + + /* Utilization Instantaneous (%) */ + uint32_t m_gfx_busy_inst[kRSMI_MAX_NUM_XCC]; + uint16_t m_jpeg_busy[kRSMI_MAX_NUM_JPEG_ENG_V1]; + uint16_t m_vcn_busy[kRSMI_MAX_NUM_VCNS]; + + /* Utilization Accumulated (%) */ + uint64_t m_gfx_busy_acc[kRSMI_MAX_NUM_XCC]; + + /* Total App Clock Counter Accumulated */ + uint64_t m_gfx_below_host_limit_ppt_acc[kRSMI_MAX_NUM_XCC]; + uint64_t m_gfx_below_host_limit_thm_acc[kRSMI_MAX_NUM_XCC]; + uint64_t m_gfx_low_utilization_acc[kRSMI_MAX_NUM_XCC]; + uint64_t m_gfx_below_host_limit_total_acc[kRSMI_MAX_NUM_XCC]; +}; + struct AMDGpuMetrics_v18_t { ~AMDGpuMetrics_v18_t() = default; struct 
AMDGpuMetricsHeader_v1_t m_common_header; @@ -1053,8 +1081,10 @@ enum class AMDGpuMetricVersionFlags_t : AMDGpuMetricVersionFlagId_t kGpuMetricV15 = (0x1 << 5), kGpuMetricV16 = (0x1 << 6), kGpuMetricV17 = (0x1 << 7), - kGpuMetricV18 = (0x1 << 8), // Added new version flag: Last static GPU Metrics - kGpuMetricV19 = (0x1 << 9), // Dyn.GPU Metrics + kGpuMetricV18 = (0x1 << 8), + kGpuXcpMetricV10 = (0x1 << 0), // Added in v1.8 for partition metrics v1.0 + kGpuMetricDynV19Plus = (0x1 << 9), // Dyn. GPU Metrics v1.9+ + kGpuXcpMetricDynV11Plus = (0x1 << 1), // Added in v1.9 for Dyn. partition metrics v1.1+ }; using AMDGpuMetricVersionTranslationTbl_t = std::map; using GpuMetricTypePtr_t = std::shared_ptr; @@ -1069,6 +1099,7 @@ class GpuMetricsBase_t { virtual AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() = 0; virtual void set_device_id(uint32_t device_id) { m_device_id = device_id; } virtual void set_partition_id(uint32_t partition_id) { m_partition_id = partition_id; } + virtual void set_is_partition_metrics(bool is_partition_req) { m_is_partition_metrics = is_partition_req; } static std::mutex s_base_tbl_mu; virtual AMDGpuDynamicMetricsTbl_t get_metrics_dynamic_tbl() { std::lock_guard lk(s_base_tbl_mu); @@ -1080,6 +1111,7 @@ class GpuMetricsBase_t { uint64_t m_metrics_timestamp; uint32_t m_device_id; uint32_t m_partition_id; + bool m_is_partition_metrics {false}; }; using GpuMetricsBasePtr = std::shared_ptr; using AMDGpuMetricFactories_t = const std::map; @@ -1293,11 +1325,31 @@ class GpuMetricsBase_v18_t final : public GpuMetricsBase_t { } GpuMetricTypePtr_t get_metrics_table() override { - if (!m_gpu_metric_ptr) { - m_gpu_metric_ptr.reset(&m_gpu_metrics_tbl, [](AMDGpuMetrics_v18_t*){}); + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ + << " ==== START ==== " + << " Initializing metrics table request: " + << " | Partition ID: " << m_partition_id + << " | Device ID: " << m_device_id + << " | Is Partition Metrics: " << std::boolalpha << 
m_is_partition_metrics + << " | m_gpu_metric_ptr: " << (!m_gpu_metric_ptr ? "nullptr" : "valid") + << " | m_gpu_metric_partition_ptr: " + << (!m_gpu_metric_partition_ptr ? "nullptr" : "valid"); + LOG_DEBUG(ss); + // If m_is_partition_metrics is false, we use the main GPU metrics table. + // Otherwise, we use the partition metrics table. + // This is to avoid having two pointers to the same table. + if (m_is_partition_metrics && !m_gpu_metric_partition_ptr) { + return std::shared_ptr( + &m_gpu_metrics_partition_tbl, [](AMDGpuMetrics_v18_Partition_v1_0_t*){/* no-op */}); + } else if (!m_is_partition_metrics && !m_gpu_metric_ptr) { + return std::shared_ptr( + &m_gpu_metrics_tbl, [](AMDGpuMetrics_v18_t*){/* no-op */}); } - assert(m_gpu_metric_ptr != nullptr); - return m_gpu_metric_ptr; + return std::shared_ptr( + nullptr, [](AMDGpuMetrics_v18_t*){/* no-op */}); // Return nullptr if we couldn't + // validate which metric table + // user is requesting } AMDGpuMetricVersionFlags_t get_gpu_metrics_version_used() override { @@ -1310,10 +1362,12 @@ class GpuMetricsBase_v18_t final : public GpuMetricsBase_t { private: AMDGpuMetrics_v18_t m_gpu_metrics_tbl; std::shared_ptr m_gpu_metric_ptr; + AMDGpuMetrics_v18_Partition_v1_0_t m_gpu_metrics_partition_tbl; + std::shared_ptr m_gpu_metric_partition_ptr; }; class GpuMetricsBaseDynamic_t final : public GpuMetricsBase_t { - public: + public: ~GpuMetricsBaseDynamic_t() = default; // Unused @@ -1341,7 +1395,7 @@ class GpuMetricsBaseDynamic_t final : public GpuMetricsBase_t { AMGpuMetricsPublicLatestTupl_t copy_internal_to_external_metrics() override; - private: + private: AMDGpuDynamicMetrics_t m_dyn; details::AMDGpuDynamicMetricsHeader_v1_t m_header{}; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index 09a5fb964c..90a818d6c7 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -114,6 +114,7 @@ static 
const char *kDevXGMIErrorFName = "xgmi_error"; static const char *kDevSerialNumberFName = "serial_number"; static const char *kDevNumaNodeFName = "numa_node"; static const char *kDevGpuMetricsFName = "gpu_metrics"; +static const char *kDevGpuPartitionMetricsFName = "xcp/xcp_metrics"; static const char *kDevPmMetricsFName = "pm_metrics"; // PM log static const char *kDevRegMetricsFName = "reg_state"; // register table static const char *kDevBaseBoardTempMetricsFName = "board/baseboard_temp"; @@ -321,6 +322,7 @@ static const std::map kDevAttribNameMap = { {kDevMemPageBad, kDevMemPageBadFName}, {kDevNumaNode, kDevNumaNodeFName}, {kDevGpuMetrics, kDevGpuMetricsFName}, + {kdevGpuPartitionMetrics, kDevGpuPartitionMetricsFName}, {kDevPmMetrics, kDevPmMetricsFName}, {kDevSocPstate, kDevSocPstateFName}, {kDevXgmiPlpd, kDevXgmiPlpdFName}, @@ -498,6 +500,7 @@ Device::devInfoTypesStrings = { {kDevMemPageBad, "kDevMemPageBad"}, {kDevNumaNode, "kDevNumaNode"}, {kDevGpuMetrics, "kDevGpuMetrics"}, + {kdevGpuPartitionMetrics, "kdevGpuPartitionMetrics"}, {kDevPmMetrics, "kDevPmMetrics"}, {kDevRegMetrics, "kDevRegMetrics"}, {kDevBaseBoardTempMetrics, "kDevBaseBoardTempMetrics"}, @@ -747,10 +750,29 @@ int Device::openDebugFileStream(DevInfoTypes type, T *fs, const char *str) { return 0; } -std::string Device::get_sys_file_path_by_type(DevInfoTypes type) const { +/** + * @brief Get the sysfs file path for a given device attribute type. + * + * This function constructs the full path to a sysfs file corresponding to the specified + * device attribute type for this device instance. The path is constructed using the device's + * base path, appending "/device/" and the attribute name from kDevAttribNameMap. + * + * If getPathOnly is true, the constructed path is returned without checking for file existence. + * If getPathOnly is false, the function checks if the file exists; if not, an empty string is returned. 
+ * + * @param type The device attribute type (DevInfoTypes) for which to get the sysfs file path. + * @param getPathOnly If true, return the constructed path without checking for file existence. + * If false, return an empty string if the file does not exist. + * @return std::string The full sysfs file path, or an empty string if the file does not exist + * and getPathOnly is false. + */ +std::string Device::get_sys_file_path_by_type(DevInfoTypes type, bool getPathOnly) const { auto sysfs_path = path_; sysfs_path += "/device/"; sysfs_path += kDevAttribNameMap.at(type); + if (getPathOnly) { + return sysfs_path; + } if (access(sysfs_path.c_str(), F_OK) != 0) { sysfs_path.clear(); @@ -1133,7 +1155,6 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, // is the issue, so should remain. const std::string key = path_ + "/device/" + kDevAttribNameMap.at(type) + "#" + std::to_string(b_size); - GpuMetricsCache* cache_ptr = nullptr; { std::lock_guard map_lk(g_gpu_metrics_cache_map_mu); @@ -1447,6 +1468,7 @@ int Device::readDevInfo(DevInfoTypes type, std::size_t b_size, switch (type) { case kDevGpuMetrics: + case kdevGpuPartitionMetrics: return readDevInfoBinary(type, b_size, p_binary_data); break; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc index 87344a39e0..c7d4a3f79e 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_dyn_gpu_metrics.cc @@ -1,46 +1,23 @@ /* - * MIT License - * * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. * - * Developed by: - * - * AMD ML Software Engineering - * - * Advanced Micro Devices, Inc. 
- * - * www.amd.com - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * - Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimers. - * - Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimers in - * the documentation and/or other materials provided with the distribution. - * - Neither the names of Advanced Micro Devices, Inc, - * nor the names of its contributors may be used to endorse or promote - * products derived from this Software without specific prior written - * permission. - * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR - * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, - * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. */ #include "rocm_smi/rocm_smi.h" @@ -156,7 +133,7 @@ static inline std::optional read_metric_value(Curs auto AMDGpuDynamicMetrics_t::parse_from_buffer(const std::byte* data, std::size_t size) noexcept -> rsmi_status_t { - + std::ostringstream ss; rsmi_status_t status = RSMI_STATUS_SUCCESS; if (!data || (size < (sizeof(AMDGpuDynamicMetricsHeader_v1_t) + sizeof(uint32_t)))) { return RSMI_STATUS_INSUFFICIENT_SIZE; @@ -178,6 +155,17 @@ auto AMDGpuDynamicMetrics_t::parse_from_buffer(const std::byte* data, if (attr_count == 0 || attr_count > size){ return RSMI_STATUS_UNEXPECTED_SIZE; } + std::string m_header_version_str = std::to_string(static_cast(hdr.m_format_revision)) + + "." 
+ + std::to_string(static_cast(hdr.m_content_revision)); + ss << __PRETTY_FUNCTION__ + << " | Info: Dynamic GPU Metrics" + << " | Attr Count: " << attr_count + << " | Header Version: " << m_header_version_str + << " | Header Size: " << hdr.get_size() + << " | Total Size: " << size + << " |"; + LOG_TRACE(ss); details::AMDGpuMetricSchemaType_t metrics_data; metrics_data.reserve(attr_count); @@ -212,7 +200,6 @@ auto AMDGpuDynamicMetrics_t::parse_from_buffer(const std::byte* data, AMDGpuMetricAttributeInstance_t inst{}; status = schema_lookup_instance(attr_id, attr_type, inst); if (status != RSMI_STATUS_SUCCESS){ - std::ostringstream ss; ss << __PRETTY_FUNCTION__ << " | Warn: schema lookup miss" << " | Attr ID: " << static_cast>(attr_id) diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc index 86c9baa73c..f89dac2b71 100644 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_gpu_metrics.cc @@ -95,9 +95,12 @@ auto timestamp_to_time_point(uint64_t timestamp_in_secs) return timestamp_time; } - -std::string stringfy_metrics_header(const AMDGpuMetricsHeader_v1_t& metrics_header) -{ +// header_details: +// - bool: true if header contains partition metrics, false otherwise +// - std::string: file path of the metrics header file +std::string stringfy_metrics_header(const AMDGpuMetricsHeader_v1_t& metrics_header, + bool is_partition_metrics, + const std::string& file_path) { std::stringstream metrics_header_info; metrics_header_info << "{Header Info: " @@ -111,6 +114,8 @@ std::string stringfy_metrics_header(const AMDGpuMetricsHeader_v1_t& metrics_head << " Revision: " << print_unsigned_hex_and_int(metrics_header.m_content_revision) << " Size: " << print_unsigned_hex_and_int(metrics_header.m_structure_size) << "]" + << " | Is Partition Metrics: " << std::boolalpha << is_partition_metrics + << " | Metric File: " << file_path << "\n"; return 
metrics_header_info.str(); @@ -137,9 +142,9 @@ std::string stringfy_metric_header_version(const AMDGpuMetricsHeader_v1_t& metri // version 1.5: 261 // version 1.6: 262 // version 1.7: 263 -// -const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table -{ +// version 1.8: 264 +// version 1.9: 265 +const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_table { {join_metrics_version(1, 1), AMDGpuMetricVersionFlags_t::kGpuMetricV11}, {join_metrics_version(1, 2), AMDGpuMetricVersionFlags_t::kGpuMetricV12}, {join_metrics_version(1, 3), AMDGpuMetricVersionFlags_t::kGpuMetricV13}, @@ -148,7 +153,14 @@ const AMDGpuMetricVersionTranslationTbl_t amdgpu_metric_version_translation_tabl {join_metrics_version(1, 6), AMDGpuMetricVersionFlags_t::kGpuMetricV16}, {join_metrics_version(1, 7), AMDGpuMetricVersionFlags_t::kGpuMetricV17}, {join_metrics_version(1, 8), AMDGpuMetricVersionFlags_t::kGpuMetricV18}, - {join_metrics_version(1, 9), AMDGpuMetricVersionFlags_t::kGpuMetricV19}, // Dynamic GPU Metrics + {join_metrics_version(1, 9), AMDGpuMetricVersionFlags_t::kGpuMetricDynV19Plus}, // Dynamic GPU Metrics +}; + +// version 1.0: 256 +// version 1.1: 257 +const AMDGpuMetricVersionTranslationTbl_t amdgpu_partition_metric_version_translation_table { + {join_metrics_version(1, 0), AMDGpuMetricVersionFlags_t::kGpuXcpMetricV10}, + {join_metrics_version(1, 1), AMDGpuMetricVersionFlags_t::kGpuXcpMetricDynV11Plus}, // Dynamic XCP Metrics }; /** @@ -282,33 +294,41 @@ const AMDGpuMetricsUnitTypeTranslationTbl_t amdgpu_metrics_unit_type_translation }; -AMDGpuMetricVersionFlags_t translate_header_to_flag_version(const AMDGpuMetricsHeader_v1_t& metrics_header) -{ +AMDGpuMetricVersionFlags_t translate_header_to_flag_version( + const AMDGpuMetricsHeader_v1_t& metrics_header, + bool is_partition_metrics, + const std::string& file_path) { std::ostringstream ss; auto version_id(AMDGpuMetricVersionFlags_t::kGpuMetricNone); ss << __PRETTY_FUNCTION__ << " | 
======= start ======="; LOG_TRACE(ss); const auto flag_version = join_metrics_version(metrics_header); - if (amdgpu_metric_version_translation_table.find(flag_version) != amdgpu_metric_version_translation_table.end()) { - version_id = amdgpu_metric_version_translation_table.at(flag_version); - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Translation Tbl: " << flag_version - << " | Metric Version: " << stringfy_metrics_header(metrics_header) - << " | Returning = " - << static_cast(version_id) - << " |"; - LOG_TRACE(ss); - return version_id; + if (!is_partition_metrics) { + if (auto it = amdgpu_metric_version_translation_table.find(flag_version); + it != amdgpu_metric_version_translation_table.end()) { + return it->second; + } + if (metrics_header.m_format_revision == 1 && + metrics_header.m_content_revision >= 9) { + return AMDGpuMetricVersionFlags_t::kGpuMetricDynV19Plus; + } + } else { + if (auto it = amdgpu_partition_metric_version_translation_table.find(flag_version); + it != amdgpu_partition_metric_version_translation_table.end()) { + return it->second; + } + if (metrics_header.m_format_revision == 1 && + metrics_header.m_content_revision >= 2) { + return AMDGpuMetricVersionFlags_t::kGpuXcpMetricDynV11Plus; + } } ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " << " | Translation Tbl: " << flag_version - << " | Metric Version: " << stringfy_metrics_header(metrics_header) + << " | Metric Version: " << stringfy_metrics_header(metrics_header, is_partition_metrics, file_path) << " | Returning = " << static_cast(version_id) << " |"; @@ -349,27 +369,43 @@ uint16_t translate_flag_to_metric_version(AMDGpuMetricVersionFlags_t version_fla return version_id; } - -rsmi_status_t is_gpu_metrics_version_supported(const AMDGpuMetricsHeader_v1_t& metrics_header) -{ +// Parameters: +// - metrics_header: the metrics header whose version is checked +// - is_partition_metrics: true if header contains partition metrics, false otherwise
+rsmi_status_t is_gpu_metrics_version_supported( + const AMDGpuMetricsHeader_v1_t& metrics_header, + bool is_partition_metrics) { + rsmi_status_t status_code(RSMI_STATUS_NOT_SUPPORTED); const auto flag_version = join_metrics_version(metrics_header); - return (amdgpu_metric_version_translation_table.find(flag_version) != - amdgpu_metric_version_translation_table.end()) - ? rsmi_status_t::RSMI_STATUS_SUCCESS : rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + if (flag_version == static_cast( + AMDGpuMetricVersionFlags_t::kGpuMetricNone)) { + return RSMI_STATUS_NOT_SUPPORTED; + } + return RSMI_STATUS_SUCCESS; } -GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t v) { - switch (v) { - case AMDGpuMetricVersionFlags_t::kGpuMetricV11: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV12: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV13: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV14: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV15: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV16: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV17: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV18: return std::make_shared(); - case AMDGpuMetricVersionFlags_t::kGpuMetricV19: return std::make_shared(); - default: return nullptr; +GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t v, + bool is_partition_metrics, + const std::string& file_path) { + if (!is_partition_metrics) { + switch (v) { + case AMDGpuMetricVersionFlags_t::kGpuMetricV11: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV12: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV13: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV14: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV15: return std::make_shared(); + case 
AMDGpuMetricVersionFlags_t::kGpuMetricV16: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV17: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricV18: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuMetricDynV19Plus: return std::make_shared(); + default: return nullptr; + } + } else { + switch (v) { + case AMDGpuMetricVersionFlags_t::kGpuXcpMetricV10: return std::make_shared(); + case AMDGpuMetricVersionFlags_t::kGpuXcpMetricDynV11Plus: return std::make_shared(); + default: return nullptr; + } } } @@ -384,8 +420,7 @@ constexpr T init_max_uint_types() (std::is_same_v) || (std::is_same_v)) { return std::numeric_limits::max(); - } - else { + } else { static_assert(is_dependent_false_v, "Error: Type not supported..."); } } @@ -534,7 +569,6 @@ rsmi_status_t GpuMetricsBaseDynamic_t::populate_metrics_dynamic_tbl() { for (const auto& r : m_dyn.get_metric_rows()) { switch (r.m_instance.m_attribute_id) { - // Power energy and temperature case details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT: emit(AMDGpuMetricsClassId_t::kGpuMetricTemperature, AMDGpuMetricsUnitType_t::kMetricTempHotspot, @@ -653,6 +687,16 @@ rsmi_status_t GpuMetricsBaseDynamic_t::populate_metrics_dynamic_tbl() { "xgmi_link_status", r); break; + case details::AMDGpuMetricAttributeId_t::MEM_MAX_BANDWIDTH: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, + "vram_max_bandwidth", r); + break; + + case details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY: + emit(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, + "pcie_lc_perf_other_end_recovery", r); + break; + // Current Clock case details::AMDGpuMetricAttributeId_t::CURRENT_GFXCLK: emit(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, @@ -714,6 +758,22 @@ rsmi_status_t 
GpuMetricsBaseDynamic_t::populate_metrics_dynamic_tbl() { emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, "xcp_stats->gfx_busy_acc", r); break; + case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_PPT_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, + "xcp_stats->gfx_below_host_limit_ppt_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_THM_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, + "xcp_stats->gfx_below_host_limit_thm_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::GFX_LOW_UTILIZATION_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, + "xcp_stats->gfx_low_utilization_acc", r); + break; + case details::AMDGpuMetricAttributeId_t::GFX_BELOW_HOST_LIMIT_TOTAL_ACC: + emit(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, + "xcp_stats->gfx_below_host_limit_total_acc", r); + break; default: ss << __PRETTY_FUNCTION__ @@ -745,6 +805,15 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { LOG_TRACE(ss); auto m_metrics_dynamic_tbl = AMDGpuDynamicMetricsTbl_t{}; + + // Lambda function to populate the metrics table + auto populate_metrics_table = [&](AMDGpuMetricsClassId_t class_id, + AMDGpuMetricsUnitType_t unit_type, const auto& metric, + const std::string& metric_name) { + m_metrics_dynamic_tbl[class_id].insert( + std::make_pair(unit_type, format_metric_row(metric, metric_name))); + }; + // // Note: Any metric treatment/changes (if any) should happen before they // get written to internal/external tables. @@ -760,239 +829,262 @@ rsmi_status_t GpuMetricsBase_v18_t::populate_metrics_dynamic_tbl() { LOG_TRACE(ss); // firmware_timestamp is at 10ns resolution; leave as-is. 
- ss << __PRETTY_FUNCTION__ << " | firmware_timestamp (10ns) = " - << m_gpu_metrics_tbl.m_firmware_timestamp; + ss << __PRETTY_FUNCTION__ + << " | firmware_timestamp (10ns) = " << m_gpu_metrics_tbl.m_firmware_timestamp; LOG_DEBUG(ss); }; - run_metric_adjustments_v18(); + ss << __PRETTY_FUNCTION__ << " | ======= info ======= " + << " | START-> " << (m_is_partition_metrics ? "Partitioned" : "GPU") + << " metrics for partition: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << m_is_partition_metrics; + LOG_DEBUG(ss); - // Temperature Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempHotspot, - format_metric_row(m_gpu_metrics_tbl.m_temperature_hotspot, - "temperature_hotspot"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempMem, - format_metric_row(m_gpu_metrics_tbl.m_temperature_mem, - "temperature_mem"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTemperature] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTempVrSoc, - format_metric_row(m_gpu_metrics_tbl.m_temperature_vrsoc, - "temperature_vrsoc"))); + if (!m_is_partition_metrics) { + run_metric_adjustments_v18(); - // Power/Energy Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, - format_metric_row(m_gpu_metrics_tbl.m_current_socket_power, - "curr_socket_power"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_energy_accumulator, - "energy_acc"))); + // Temperature Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricTemperature, + AMDGpuMetricsUnitType_t::kMetricTempHotspot, m_gpu_metrics_tbl.m_temperature_hotspot, + "temperature_hotspot"); + 
populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricTemperature, + AMDGpuMetricsUnitType_t::kMetricTempMem, + m_gpu_metrics_tbl.m_temperature_mem, "temperature_mem"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricTemperature, + AMDGpuMetricsUnitType_t::kMetricTempVrSoc, + m_gpu_metrics_tbl.m_temperature_vrsoc, "temperature_vrsoc"); - // Utilization Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, - format_metric_row(m_gpu_metrics_tbl.m_average_gfx_activity, - "average_gfx_activity"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, - format_metric_row(m_gpu_metrics_tbl.m_average_umc_activity, - "average_umc_activity"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_gfx_activity_acc, - "gfx_activity_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricUtilization] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_mem_activity_acc, - "mem_activity_acc"))); + // Power/Energy Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, + AMDGpuMetricsUnitType_t::kMetricCurrSocketPower, + m_gpu_metrics_tbl.m_current_socket_power, "curr_socket_power"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricPowerEnergy, + AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator, + m_gpu_metrics_tbl.m_energy_accumulator, "energy_acc"); - // GfxLock Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, - format_metric_row(m_gpu_metrics_tbl.m_gfxclk_lock_status, - "gfxclk_lock_status"))); + // Utilization Info + 
populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricUtilization, + AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity, + m_gpu_metrics_tbl.m_average_gfx_activity, "average_gfx_activity"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricUtilization, + AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity, + m_gpu_metrics_tbl.m_average_umc_activity, "average_umc_activity"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricUtilization, + AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator, + m_gpu_metrics_tbl.m_gfx_activity_acc, "gfx_activity_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricUtilization, + AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator, + m_gpu_metrics_tbl.m_mem_activity_acc, "mem_activity_acc"); - // Timestamp Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSFirmware, - format_metric_row(m_gpu_metrics_tbl.m_firmware_timestamp, - "firmware_timestamp"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricTimestamp] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricTSClockCounter, - format_metric_row(m_gpu_metrics_tbl.m_system_clock_counter, - "system_clock_counter"))); + // GfxLock Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricGfxClkLockStatus, + AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus, + m_gpu_metrics_tbl.m_gfxclk_lock_status, "gfxclk_lock_status"); - // Link/Width/Speed Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, - format_metric_row(m_gpu_metrics_tbl.m_pcie_link_width, - "pcie_link_width"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, - format_metric_row(m_gpu_metrics_tbl.m_pcie_link_speed, - "pcie_link_speed"))); - 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, - format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_width, - "xgmi_link_width"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, - format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_speed, - "xgmi_link_speed"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_acc, - "pcie_bandwidth_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, - format_metric_row(m_gpu_metrics_tbl.m_pcie_bandwidth_inst, - "pcie_bandwidth_inst"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, - "pcie_l0_recov_count_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_count_acc, - "pcie_replay_count_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, - "pcie_replay_rollover_count_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, - "pcie_nak_sent_count_acc"))); - 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, - "pcie_nak_rcvd_count_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_xgmi_read_data_acc, - "[xgmi_read_data_acc]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_xgmi_write_data_acc, - "[xgmi_write_data_acc]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, - format_metric_row(m_gpu_metrics_tbl.m_xgmi_link_status, - "[xgmi_link_status]"))); + // Timestamp Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricTimestamp, + AMDGpuMetricsUnitType_t::kMetricTSFirmware, + m_gpu_metrics_tbl.m_firmware_timestamp, "firmware_timestamp"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricTimestamp, + AMDGpuMetricsUnitType_t::kMetricTSClockCounter, + m_gpu_metrics_tbl.m_system_clock_counter, "system_clock_counter"); - // CurrentClock Info - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, - format_metric_row(m_gpu_metrics_tbl.m_current_gfxclk, - "[current_gfxclk]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrSocClock, - format_metric_row(m_gpu_metrics_tbl.m_current_socclk, - "[current_socclk]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrVClock0, - 
format_metric_row(m_gpu_metrics_tbl.m_current_vclk0, - "[current_vclk0]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrDClock0, - format_metric_row(m_gpu_metrics_tbl.m_current_dclk0, - "[current_dclk0]"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricCurrentClock] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricCurrUClock, - format_metric_row(m_gpu_metrics_tbl.m_current_uclk, - "current_uclk"))); + // Link/Width/Speed Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth, + m_gpu_metrics_tbl.m_pcie_link_width, "pcie_link_width"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed, + m_gpu_metrics_tbl.m_pcie_link_speed, "pcie_link_speed"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth, + m_gpu_metrics_tbl.m_xgmi_link_width, "xgmi_link_width"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed, + m_gpu_metrics_tbl.m_xgmi_link_speed, "xgmi_link_speed"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator, + m_gpu_metrics_tbl.m_pcie_bandwidth_acc, "pcie_bandwidth_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst, + m_gpu_metrics_tbl.m_pcie_bandwidth_inst, "pcie_bandwidth_inst"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator, + m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc, + "pcie_l0_recov_count_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + 
AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator, + m_gpu_metrics_tbl.m_pcie_replay_count_acc, "pcie_replay_count_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator, + m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc, + "pcie_replay_rollover_count_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieNakSentCountAccumulator, + m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc, "pcie_nak_sent_count_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieNakReceivedCountAccumulator, + m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc, "pcie_nak_rcvd_count_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator, + m_gpu_metrics_tbl.m_xgmi_read_data_acc, "[xgmi_read_data_acc]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator, + m_gpu_metrics_tbl.m_xgmi_write_data_acc, "[xgmi_write_data_acc]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricXgmiLinkStatus, + m_gpu_metrics_tbl.m_xgmi_link_status, "[xgmi_link_status]"); - /* Accumulation cycle counter */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, - format_metric_row(m_gpu_metrics_tbl.m_accumulation_counter, - "accumulation_counter"))); + // Current Clock Info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + m_gpu_metrics_tbl.m_current_gfxclk, "[current_gfxclk]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + 
m_gpu_metrics_tbl.m_current_socclk, "[current_socclk]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + m_gpu_metrics_tbl.m_current_vclk0, "[current_vclk0]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + m_gpu_metrics_tbl.m_current_dclk0, "[current_dclk0]"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrUClock, + m_gpu_metrics_tbl.m_current_uclk, "current_uclk"); - /* Accumulated throttler residencies */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_prochot_residency_acc, - "prochot_residency_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_ppt_residency_acc, - "ppt_residency_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_socket_thm_residency_acc, - "socket_thm_residency_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_vr_thm_residency_acc, - "vr_thm_residency_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, - format_metric_row(m_gpu_metrics_tbl.m_hbm_thm_residency_acc, - "hbm_thm_residency_acc"))); + // Throttle residency counter + 
populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricAccumulationCounter, + m_gpu_metrics_tbl.m_accumulation_counter, "accumulation_counter"); - /* Partition info */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricPartition] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, - format_metric_row(m_gpu_metrics_tbl.m_num_partition, - "num_partition"))); + // Accumulated throttler residencies + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricProchotResidencyAccumulator, + m_gpu_metrics_tbl.m_prochot_residency_acc, "prochot_residency_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricPPTResidencyAccumulator, + m_gpu_metrics_tbl.m_ppt_residency_acc, "ppt_residency_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricSocketThmResidencyAccumulator, + m_gpu_metrics_tbl.m_socket_thm_residency_acc, "socket_thm_residency_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricVRThmResidencyAccumulator, + m_gpu_metrics_tbl.m_vr_thm_residency_acc, "vr_thm_residency_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricThrottleResidency, + AMDGpuMetricsUnitType_t::kMetricHBMThmResidencyAccumulator, + m_gpu_metrics_tbl.m_hbm_thm_residency_acc, "hbm_thm_residency_acc"); - /* xcp_stats info */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, - "xcp_stats->gfx_busy_inst"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVcnBusy, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, - 
"xcp_stats->vcn_busy"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricJpegBusy, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, - "xcp_stats->jpeg_busy"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, - "xcp_stats->gfx_busy_acc"))); + // Partition info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricPartition, + AMDGpuMetricsUnitType_t::kGpuMetricNumPartition, + m_gpu_metrics_tbl.m_num_partition, "num_partition"); - /* gpu metrics v1.8 xcp_stats info */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_total_acc, - "xcp_stats->gfx_below_host_limit_total_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_ppt_acc, - "xcp_stats->gfx_below_host_limit_ppt_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_thm_acc, - "xcp_stats->gfx_below_host_limit_thm_acc"))); - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricXcpStats] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, - format_metric_row(m_gpu_metrics_tbl.m_xcp_stats->gfx_low_utilization_acc, - "xcp_stats->gfx_low_utilization_acc"))); + // xcp_stats info + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + 
m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_inst, "xcp_stats->gfx_busy_inst"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricVcnBusy, + m_gpu_metrics_tbl.m_xcp_stats->vcn_busy, "xcp_stats->vcn_busy"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricJpegBusy, + m_gpu_metrics_tbl.m_xcp_stats->jpeg_busy, "xcp_stats->jpeg_busy"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + m_gpu_metrics_tbl.m_xcp_stats->gfx_busy_acc, "xcp_stats->gfx_busy_acc"); - /* PCIE other end recovery counter info */ - m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, - format_metric_row(m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, - "pcie_lc_perf_other_end_recovery"))); + // GPU metrics v1.8 xcp_stats info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, + m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_total_acc, + "xcp_stats->gfx_below_host_limit_total_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, + m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_ppt_acc, + "xcp_stats->gfx_below_host_limit_ppt_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, + m_gpu_metrics_tbl.m_xcp_stats->gfx_below_host_limit_thm_acc, + "xcp_stats->gfx_below_host_limit_thm_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, + m_gpu_metrics_tbl.m_xcp_stats->gfx_low_utilization_acc, + "xcp_stats->gfx_low_utilization_acc"); - /* VRAM max bandwidth (in GB/sec) at max memory clock */ - 
m_metrics_dynamic_tbl[AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed] - .insert(std::make_pair(AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, - format_metric_row(m_gpu_metrics_tbl.m_mem_max_bandwidth, - "vram_max_bandwidth"))); + // PCIE other end recovery counter info + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricPcieLCPerfOtherEndRecov, + m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery, + "pcie_lc_perf_other_end_recovery"); + + // VRAM max bandwidth + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricLinkWidthSpeed, + AMDGpuMetricsUnitType_t::kMetricVramMaxBandwidth, + m_gpu_metrics_tbl.m_mem_max_bandwidth, "vram_max_bandwidth"); + } else { // Partition metrics + // Current clocks + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrGfxClock, + m_gpu_metrics_partition_tbl.m_current_gfxclk, + "[partition 1.0] current_gfxclk"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, + AMDGpuMetricsUnitType_t::kMetricCurrSocClock, + m_gpu_metrics_partition_tbl.m_current_socclk, + "[partition 1.0] current_socclk"); + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrVClock0, + m_gpu_metrics_partition_tbl.m_current_vclk0, "[partition 1.0] current_vclk0"); + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrDClock0, + m_gpu_metrics_partition_tbl.m_current_dclk0, "[partition 1.0] current_dclk0"); + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricCurrentClock, AMDGpuMetricsUnitType_t::kMetricCurrUClock, + m_gpu_metrics_partition_tbl.m_current_uclk, "[partition 1.0] current_uclk"); + + // XCP stats - Utilization + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyInst, + m_gpu_metrics_partition_tbl.m_gfx_busy_inst, "[partition 1.0] 
gfx_busy_inst"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricVcnBusy, + m_gpu_metrics_partition_tbl.m_vcn_busy, "[partition 1.0] vcn_busy"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricJpegBusy, + m_gpu_metrics_partition_tbl.m_jpeg_busy, "[partition 1.0] jpeg_busy"); + populate_metrics_table( + AMDGpuMetricsClassId_t::kGpuMetricXcpStats, AMDGpuMetricsUnitType_t::kMetricGfxBusyAcc, + m_gpu_metrics_partition_tbl.m_gfx_busy_acc, "[partition 1.0] gfx_busy_acc"); + + // Total App Clock Counter Accumulated + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitTotalAcc, + m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_total_acc, + "[partition 1.0] gfx_below_host_limit_total_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitPptAcc, + m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_ppt_acc, + "[partition 1.0] gfx_below_host_limit_ppt_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxBelowHostLimitThmAcc, + m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_thm_acc, + "[partition 1.0] gfx_below_host_limit_thm_acc"); + populate_metrics_table(AMDGpuMetricsClassId_t::kGpuMetricXcpStats, + AMDGpuMetricsUnitType_t::kMetricGfxLowUtilitizationAcc, + m_gpu_metrics_partition_tbl.m_gfx_low_utilization_acc, + "[partition 1.0] gfx_low_utilization_acc"); + } ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Success " - << " | Returning = " << getRSMIStatusString(status_code) << " |"; + << " | B4 copy" + << " | m_metrics_dynamic_tbl size: " << m_metrics_dynamic_tbl.size() + << " | m_base_metrics_dynamic_tbl size: " + << GpuMetricsBase_t::m_base_metrics_dynamic_tbl.size() + << " | m_partition_id: " << m_partition_id << " | is_partition_metrics: " << 
std::boolalpha + << m_is_partition_metrics << " | Returning = " << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); - { std::lock_guard lk(s_base_tbl_mu); // Copy to base class @@ -2106,35 +2198,33 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBaseDynamic_t::copy_internal_to_externa AMGpuMetricsPublicLatest_t out{}; init_max_public_gpu_matrics(out); - out.common_header.structure_size = m_header.m_structure_size; + out.common_header.structure_size = m_header.m_structure_size; out.common_header.format_revision = m_header.m_format_revision; - out.common_header.content_revision= m_header.m_content_revision; + out.common_header.content_revision = m_header.m_content_revision; - auto assign_by_type = [&](auto& dst, - const details::AMDGpuMetricAttributeData_t& r) { + auto assign_by_type = [&](auto& dst, const details::AMDGpuMetricAttributeData_t& r) { using D = std::decay_t; - std::visit([&](const auto& x) { - using S = std::decay_t; - if constexpr (std::is_integral_v) { - dst = static_cast(x); - } - }, r.m_value); + std::visit( + [&](const auto& x) { + using S = std::decay_t; + if constexpr (std::is_integral_v) { + dst = static_cast(x); + } + }, + r.m_value); }; - auto assign_vector = [&]( auto& dst, - const details::AMDGpuMetricAttributeData_t& r, - std::size_t cap) { - - using Dst = std::remove_reference_t; - using T = std::remove_cv_t>; + auto assign_vector = [&](auto& dst, const details::AMDGpuMetricAttributeData_t& r, + std::size_t cap) { + using Dst = std::remove_reference_t; + using T = std::remove_cv_t>; auto v = std::get_if>(&r.m_value); const std::size_t n = std::min(v->size(), cap); std::copy_n(v->data(), n, dst); }; for (const auto& r : m_dyn.get_metric_rows()) { - switch (r.m_instance.m_attribute_id) { // Temps case details::AMDGpuMetricAttributeId_t::TEMPERATURE_HOTSPOT: @@ -2234,13 +2324,13 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBaseDynamic_t::copy_internal_to_externa case details::AMDGpuMetricAttributeId_t::CURRENT_DCLK0: { 
assign_vector(out.current_dclk0s, r, RSMI_MAX_NUM_CLKS); break; } - + case details::AMDGpuMetricAttributeId_t::CURRENT_UCLK: assign_by_type(out.current_uclk, r); break; case details::AMDGpuMetricAttributeId_t::PCIE_LC_PERF_OTHER_END_RECOVERY: assign_by_type(out.pcie_lc_perf_other_end_recovery, r); break; - + // XCP stats // Only fill in entry 0 case details::AMDGpuMetricAttributeId_t::GFX_BUSY_INST: { @@ -2268,37 +2358,32 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBaseDynamic_t::copy_internal_to_externa assign_vector(out.xcp_stats[0].gfx_below_host_limit_total_acc, r, RSMI_MAX_NUM_XCC); break; } - default: break; - } - + default: break; + } } out.current_gfxclk = out.current_gfxclks[0]; out.current_socclk = out.current_socclks[0]; - out.current_vclk0 = out.current_vclk0s[0]; - out.current_vclk1 = out.current_vclk0s[1]; - out.current_dclk0 = out.current_dclk0s[0]; - out.current_dclk1 = out.current_dclk0s[1]; + out.current_vclk0 = out.current_vclk0s[0]; + out.current_vclk1 = out.current_vclk0s[1]; + out.current_dclk0 = out.current_dclk0s[0]; + out.current_dclk1 = out.current_dclk0s[1]; - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Success " - << " | Returning = " << getRSMIStatusString(status_code) - << " |"; + << " | Returning = " << getRSMIStatusString(status_code) << " |"; LOG_TRACE(ss); return std::make_tuple(status_code, out); } -AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics() -{ +AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_metrics() { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - auto copy_data_from_internal_metrics_tbl = [&]() - { + auto copy_data_from_internal_metrics_tbl = [&]() { AMGpuMetricsPublicLatest_t metrics_public_init{}; // @@ -2306,167 +2391,253 @@ AMGpuMetricsPublicLatestTupl_t 
GpuMetricsBase_v18_t::copy_internal_to_external_m // no data was assigned to it. init_max_public_gpu_matrics(metrics_public_init); - // Header - metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; - metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; - metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; + // Logic below: + // Default path (::kDevGpuMetrics / !m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/gpu_metrics + // Partition Path (::kDevGpuPartitionMetrics / m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/xcp/xcp_metrics + // 1. If ::kDevGpuMetrics, copy all data from primary gpu_metrics file. + // 2. If ::kDevGpuPartitionMetrics, copy data from xcp_metrics file. + // 3. Provide any backwards compatibility changes + if (!m_is_partition_metrics) { + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_tbl.m_common_header.m_content_revision; - // Temperature - metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; - metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; - metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; + // Temperature + metrics_public_init.temperature_hotspot = m_gpu_metrics_tbl.m_temperature_hotspot; + metrics_public_init.temperature_mem = m_gpu_metrics_tbl.m_temperature_mem; + metrics_public_init.temperature_vrsoc = m_gpu_metrics_tbl.m_temperature_vrsoc; - // Power - metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; + // Power + metrics_public_init.current_socket_power = m_gpu_metrics_tbl.m_current_socket_power; - // 
Utilization - metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; - metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; + // Utilization + metrics_public_init.average_gfx_activity = m_gpu_metrics_tbl.m_average_gfx_activity; + metrics_public_init.average_umc_activity = m_gpu_metrics_tbl.m_average_umc_activity; - // Power/Energy - metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; + // Power/Energy + metrics_public_init.energy_accumulator = m_gpu_metrics_tbl.m_energy_accumulator; - // Driver attached timestamp (in ns) - metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; + // Driver attached timestamp (in ns) + metrics_public_init.system_clock_counter = m_gpu_metrics_tbl.m_system_clock_counter; - // Clock Lock Status. Each bit corresponds to clock instance - metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; + // Clock Lock Status. Each bit corresponds to clock instance + metrics_public_init.gfxclk_lock_status = m_gpu_metrics_tbl.m_gfxclk_lock_status; - // Link width (number of lanes) and speed - metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; - metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; + // Link width (number of lanes) and speed + metrics_public_init.pcie_link_width = m_gpu_metrics_tbl.m_pcie_link_width; + metrics_public_init.pcie_link_speed = m_gpu_metrics_tbl.m_pcie_link_speed; - // XGMI bus width and bitrate - metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; - metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; + // XGMI bus width and bitrate + metrics_public_init.xgmi_link_width = m_gpu_metrics_tbl.m_xgmi_link_width; + metrics_public_init.xgmi_link_speed = m_gpu_metrics_tbl.m_xgmi_link_speed; - // Utilization Accumulated - metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; 
- metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; + // Utilization Accumulated + metrics_public_init.gfx_activity_acc = m_gpu_metrics_tbl.m_gfx_activity_acc; + metrics_public_init.mem_activity_acc = m_gpu_metrics_tbl.m_mem_activity_acc; - // PCIE accumulated bandwidth - metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; + // PCIE accumulated bandwidth + metrics_public_init.pcie_bandwidth_acc = m_gpu_metrics_tbl.m_pcie_bandwidth_acc; - // PCIE instantaneous bandwidth - metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; + // PCIE instantaneous bandwidth + metrics_public_init.pcie_bandwidth_inst = m_gpu_metrics_tbl.m_pcie_bandwidth_inst; - // PCIE L0 to recovery state transition accumulated count - metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; + // PCIE L0 to recovery state transition accumulated count + metrics_public_init.pcie_l0_to_recov_count_acc = m_gpu_metrics_tbl.m_pcie_l0_to_recov_count_acc; - // PCIE replay accumulated count - metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; + // PCIE replay accumulated count + metrics_public_init.pcie_replay_count_acc = m_gpu_metrics_tbl.m_pcie_replay_count_acc; - // PCIE replay rollover accumulated count - metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; + // PCIE replay rollover accumulated count + metrics_public_init.pcie_replay_rover_count_acc = m_gpu_metrics_tbl.m_pcie_replay_rover_count_acc; - // PCIE NAK sent accumulated count - metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; + // PCIE NAK sent accumulated count + metrics_public_init.pcie_nak_sent_count_acc = m_gpu_metrics_tbl.m_pcie_nak_sent_count_acc; - // PCIE NAK received accumulated count - metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; + // PCIE NAK 
received accumulated count + metrics_public_init.pcie_nak_rcvd_count_acc = m_gpu_metrics_tbl.m_pcie_nak_rcvd_count_acc; - // Accumulated throttler residencies - // bumped up public to uint64_t due to planned size increase for newer ASICs - metrics_public_init.accumulation_counter = m_gpu_metrics_tbl.m_accumulation_counter; - metrics_public_init.prochot_residency_acc = m_gpu_metrics_tbl.m_prochot_residency_acc; - metrics_public_init.ppt_residency_acc = m_gpu_metrics_tbl.m_ppt_residency_acc; - metrics_public_init.socket_thm_residency_acc = m_gpu_metrics_tbl.m_socket_thm_residency_acc; - metrics_public_init.vr_thm_residency_acc = m_gpu_metrics_tbl.m_vr_thm_residency_acc; - metrics_public_init.hbm_thm_residency_acc = m_gpu_metrics_tbl.m_hbm_thm_residency_acc; + // Accumulated throttler residencies + // bumped up public to uint64_t due to planned size increase for newer ASICs + metrics_public_init.accumulation_counter = m_gpu_metrics_tbl.m_accumulation_counter; + metrics_public_init.prochot_residency_acc = m_gpu_metrics_tbl.m_prochot_residency_acc; + metrics_public_init.ppt_residency_acc = m_gpu_metrics_tbl.m_ppt_residency_acc; + metrics_public_init.socket_thm_residency_acc = m_gpu_metrics_tbl.m_socket_thm_residency_acc; + metrics_public_init.vr_thm_residency_acc = m_gpu_metrics_tbl.m_vr_thm_residency_acc; + metrics_public_init.hbm_thm_residency_acc = m_gpu_metrics_tbl.m_hbm_thm_residency_acc; - /* VRAM max bandwidth at max memory clock */ - metrics_public_init.vram_max_bandwidth = m_gpu_metrics_tbl.m_mem_max_bandwidth; + /* VRAM max bandwidth at max memory clock */ + metrics_public_init.vram_max_bandwidth = m_gpu_metrics_tbl.m_mem_max_bandwidth; - // XGMI accumulated data transfer size - // xgmi_read_data - const auto xgmi_read_data_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - - std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), - xgmi_read_data_num_elems, - 
metrics_public_init.xgmi_read_data_acc); - // xgmi_write_data - const auto xgmi_write_data_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - - std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), - xgmi_write_data_num_elems, - metrics_public_init.xgmi_write_data_acc); + // XGMI accumulated data transfer size + // xgmi_read_data + const auto xgmi_read_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_read_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_read_data_acc), + xgmi_read_data_num_elems, + metrics_public_init.xgmi_read_data_acc); + // xgmi_write_data + const auto xgmi_write_data_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_write_data_acc) - + std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_write_data_acc), + xgmi_write_data_num_elems, + metrics_public_init.xgmi_write_data_acc); - // xgmi_link_status // new for 1.7 - const auto xgmi_link_status_num_elems = static_cast( - std::end(m_gpu_metrics_tbl.m_xgmi_link_status) - - std::begin(m_gpu_metrics_tbl.m_xgmi_link_status)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_link_status), - xgmi_link_status_num_elems, - metrics_public_init.xgmi_link_status); + // xgmi_link_status // new for 1.7 + const auto xgmi_link_status_num_elems = static_cast( + std::end(m_gpu_metrics_tbl.m_xgmi_link_status) - + std::begin(m_gpu_metrics_tbl.m_xgmi_link_status)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_xgmi_link_status), + xgmi_link_status_num_elems, + metrics_public_init.xgmi_link_status); - // PMFW attached timestamp (10ns resolution) - metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; + // PMFW attached timestamp (10ns resolution) + metrics_public_init.firmware_timestamp = m_gpu_metrics_tbl.m_firmware_timestamp; - // Current clocks - 
// current_gfxclk - const auto curr_gfxclk_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_current_gfxclk) - - std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), - curr_gfxclk_num_elems, - metrics_public_init.current_gfxclks); + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); - // current_socclk - const auto curr_socclk_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_current_socclk) - - std::begin(m_gpu_metrics_tbl.m_current_socclk)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), - curr_socclk_num_elems, - metrics_public_init.current_socclks); + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); - // current_vclk0 - const auto curr_vclk0_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_current_vclk0) - - std::begin(m_gpu_metrics_tbl.m_current_vclk0)); - std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), - curr_vclk0_num_elems, - metrics_public_init.current_vclk0s); + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); - // current_dclk0 - const auto curr_dclk0_num_elems = - static_cast( - std::end(m_gpu_metrics_tbl.m_current_dclk0) - - std::begin(m_gpu_metrics_tbl.m_current_dclk0)); - 
std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), - curr_dclk0_num_elems, - metrics_public_init.current_dclk0s); + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); - metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; + metrics_public_init.current_uclk = m_gpu_metrics_tbl.m_current_uclk; - metrics_public_init.num_partition = m_gpu_metrics_tbl.m_num_partition; + metrics_public_init.num_partition = m_gpu_metrics_tbl.m_num_partition; - metrics_public_init.pcie_lc_perf_other_end_recovery = - m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery; + metrics_public_init.pcie_lc_perf_other_end_recovery = + m_gpu_metrics_tbl.m_pcie_lc_perf_other_end_recovery; - // xcp stats - auto priv_it = std::begin(m_gpu_metrics_tbl.m_xcp_stats); - for (auto pub_it = std::begin(metrics_public_init.xcp_stats); - pub_it != std::end(metrics_public_init.xcp_stats); ++pub_it, ++priv_it) { - std::copy_n(std::begin(priv_it->gfx_busy_inst), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_inst); - std::copy_n(std::begin(priv_it->jpeg_busy), RSMI_MAX_NUM_JPEG_ENG_V1, pub_it->jpeg_busy); - std::copy_n(std::begin(priv_it->vcn_busy), RSMI_MAX_NUM_VCNS, pub_it->vcn_busy); - std::copy_n(std::begin(priv_it->gfx_busy_acc), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_acc); - std::copy_n(std::begin(priv_it->gfx_below_host_limit_ppt_acc), RSMI_MAX_NUM_XCC, - pub_it->gfx_below_host_limit_ppt_acc); - std::copy_n(std::begin(priv_it->gfx_below_host_limit_thm_acc), RSMI_MAX_NUM_XCC, - pub_it->gfx_below_host_limit_thm_acc); - std::copy_n(std::begin(priv_it->gfx_low_utilization_acc), RSMI_MAX_NUM_XCC, - pub_it->gfx_low_utilization_acc); - std::copy_n(std::begin(priv_it->gfx_below_host_limit_total_acc), RSMI_MAX_NUM_XCC, - pub_it->gfx_below_host_limit_total_acc); + // xcp 
stats + auto priv_it = std::begin(m_gpu_metrics_tbl.m_xcp_stats); + for (auto pub_it = std::begin(metrics_public_init.xcp_stats); + pub_it != std::end(metrics_public_init.xcp_stats); ++pub_it, ++priv_it) { + std::copy_n(std::begin(priv_it->gfx_busy_inst), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_inst); + std::copy_n(std::begin(priv_it->jpeg_busy), RSMI_MAX_NUM_JPEG_ENG_V1, pub_it->jpeg_busy); + std::copy_n(std::begin(priv_it->vcn_busy), RSMI_MAX_NUM_VCNS, pub_it->vcn_busy); + std::copy_n(std::begin(priv_it->gfx_busy_acc), RSMI_MAX_NUM_XCC, pub_it->gfx_busy_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_ppt_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_ppt_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_thm_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_thm_acc); + std::copy_n(std::begin(priv_it->gfx_low_utilization_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_low_utilization_acc); + std::copy_n(std::begin(priv_it->gfx_below_host_limit_total_acc), RSMI_MAX_NUM_XCC, + pub_it->gfx_below_host_limit_total_acc); + } + } else { + // Partition Data: /sys/class/drm/renderDXXX/device/xcp/xcp_metrics + // Copy common data from xcp metrics table + + // Header + metrics_public_init.common_header.structure_size = m_gpu_metrics_partition_tbl.m_common_header.m_structure_size; + metrics_public_init.common_header.format_revision = m_gpu_metrics_partition_tbl.m_common_header.m_format_revision; + metrics_public_init.common_header.content_revision = m_gpu_metrics_partition_tbl.m_common_header.m_content_revision; + + // Current clocks + // current_gfxclk + const auto curr_gfxclk_num_elems = + static_cast( + std::end(m_gpu_metrics_partition_tbl.m_current_gfxclk) - + std::begin(m_gpu_metrics_partition_tbl.m_current_gfxclk)); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_current_gfxclk), + curr_gfxclk_num_elems, + metrics_public_init.current_gfxclks); + + // current_socclk + const auto curr_socclk_num_elems = + static_cast( + 
std::end(m_gpu_metrics_partition_tbl.m_current_socclk) - + std::begin(m_gpu_metrics_partition_tbl.m_current_socclk)); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_current_socclk), + curr_socclk_num_elems, + metrics_public_init.current_socclks); + + // current_vclk0 + const auto curr_vclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_partition_tbl.m_current_vclk0) - + std::begin(m_gpu_metrics_partition_tbl.m_current_vclk0)); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_current_vclk0), + curr_vclk0_num_elems, + metrics_public_init.current_vclk0s); + + // current_dclk0 + const auto curr_dclk0_num_elems = + static_cast( + std::end(m_gpu_metrics_partition_tbl.m_current_dclk0) - + std::begin(m_gpu_metrics_partition_tbl.m_current_dclk0)); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_current_dclk0), + curr_dclk0_num_elems, + metrics_public_init.current_dclk0s); + metrics_public_init.current_uclk = m_gpu_metrics_partition_tbl.m_current_uclk; + + // Copy data from partition table to public table + // Only copy data for xcp #0 + // Partition metrics should default to XCP #0 position, since we can gather only one partition + // metrics at a time. 
+ const uint32_t xcp_num = 0; + uint32_t row = 0; + for (auto it = std::begin(metrics_public_init.xcp_stats); + it != std::end(metrics_public_init.xcp_stats); ++it, ++row) { + if (row == xcp_num) { + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_busy_inst), + RSMI_MAX_NUM_XCC, it->gfx_busy_inst); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_jpeg_busy), + RSMI_MAX_NUM_JPEG_ENG_V1, it->jpeg_busy); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_vcn_busy), + RSMI_MAX_NUM_VCNS, it->vcn_busy); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_busy_acc), + RSMI_MAX_NUM_XCC, it->gfx_busy_acc); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_ppt_acc), + RSMI_MAX_NUM_XCC, it->gfx_below_host_limit_ppt_acc); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_thm_acc), + RSMI_MAX_NUM_XCC, it->gfx_below_host_limit_thm_acc); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_low_utilization_acc), + RSMI_MAX_NUM_XCC, it->gfx_low_utilization_acc); + std::copy_n(std::begin(m_gpu_metrics_partition_tbl.m_gfx_below_host_limit_total_acc), + RSMI_MAX_NUM_XCC, it->gfx_below_host_limit_total_acc); + } else { + break; // No need to copy for other rows + } + } } // @@ -2494,8 +2665,8 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v18_t::copy_internal_to_external_m << " |"; LOG_TRACE(ss); - return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); -}; + return std::make_tuple(status_code, copy_data_from_internal_metrics_tbl); +} AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v17_t::copy_internal_to_external_metrics() { @@ -3616,7 +3787,7 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m metrics_public_init.indep_throttle_status = m_gpu_metrics_tbl.m_indep_throttle_status; // - // Note: Backwards compatibility -> Handling extra/exception cases + // Note: Forwards compatibility -> Handling extra/exception cases // related to earlier versions (1.2) 
// metrics_public_init.current_socket_power = metrics_public_init.average_socket_power; // average_mm_activity needs to not be UIN16_MAX and @@ -3625,12 +3796,6 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v13_t::copy_internal_to_external_m && metrics_public_init.vcn_activity[0] == UINT16_MAX) { metrics_public_init.vcn_activity[0] = metrics_public_init.average_mm_activity; } - // average_mm_activity needs to not be UIN16_MAX and - // metrics_public_init.xcp_stats->vcn_busy[0] should also be UINT16_MAX - if (metrics_public_init.average_mm_activity != UINT16_MAX - && metrics_public_init.xcp_stats->vcn_busy[0] == UINT16_MAX) { - metrics_public_init.xcp_stats->vcn_busy[0] = metrics_public_init.average_mm_activity; - } return metrics_public_init; }(); @@ -4310,101 +4475,145 @@ AMGpuMetricsPublicLatestTupl_t GpuMetricsBase_v11_t::copy_internal_to_external_m } -rsmi_status_t Device::dev_read_gpu_metrics_header_data() -{ +auto Device::dev_read_gpu_metrics_header_data(DevInfoTypes type) -> rsmi_status_t { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - - // Check if/when metrics table needs to be refreshed. 
- auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, - sizeof(AMDGpuMetricsHeader_v1_t), - &m_gpu_metrics_header); - + int op_result; + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); + op_result = readDevInfo(type, sizeof(AMDGpuMetricsHeader_v1_t), + &m_gpu_metrics_header); if ((status_code = ErrnoToRsmiStatus(op_result)) != rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: readDevInfo(kDevGpuMetrics)" - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: " + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; LOG_ERROR(ss); return status_code; } - if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header)) == + ss << __PRETTY_FUNCTION__ << " | Before is_gpu_metrics_version_supported() " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, 
is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_TRACE(ss); + if ((status_code = is_gpu_metrics_version_supported(m_gpu_metrics_header, is_smi_expecting_partition_metrics())) == rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: gpu metric file version is not supported: " - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: gpu metric file version is not supported: " + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; LOG_ERROR(ss); return status_code; } m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << 
Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return status_code; } -rsmi_status_t Device::dev_read_gpu_metrics_all_data() -{ +auto Device::dev_read_gpu_metrics_all_data(DevInfoTypes type) -> rsmi_status_t { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); + int op_result; + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); + // Default path (::kDevGpuMetrics / !m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/gpu_metrics + // Partition Path (::kDevGpuMetrics / m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/xcp/xcp_metrics // At this point we should have a valid gpu_metrics pointer, and // we already read the header; setup_gpu_metrics_reading() - if ((!m_gpu_metrics_ptr) || - ((!m_gpu_metrics_header.m_structure_size) || - (!m_gpu_metrics_header.m_format_revision) || - (!m_gpu_metrics_header.m_content_revision))) { - status_code = rsmi_status_t::RSMI_STATUS_SETTING_UNAVAILABLE; + if (!m_gpu_metrics_ptr || (status_code = is_gpu_metrics_version_supported( + m_gpu_metrics_header, is_smi_expecting_partition_metrics())) == RSMI_STATUS_NOT_SUPPORTED + ) { + status_code = RSMI_STATUS_SETTING_UNAVAILABLE; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object. 
setup_gpu_metrics_reading()" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: Couldn't get a valid metric object. setup_gpu_metrics_reading()" + << " | m_gpu_metrics_ptr: " + << (m_gpu_metrics_ptr ? "valid" : "nullptr") + << " | m_gpu_metrics_header.m_structure_size: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " | m_gpu_metrics_header.m_format_revision: " + << print_unsigned_int(m_gpu_metrics_header.m_format_revision) + << " | m_gpu_metrics_header.m_content_revision: " + << print_unsigned_int(m_gpu_metrics_header.m_content_revision) + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } - if (m_is_dynamic_gpu_metrics_supported){ - - std::string file_name = "/sys/class/drm/card" - + std::to_string(index()) - + "/device/gpu_metrics"; + ss << __PRETTY_FUNCTION__ + << " | ======= P1 Start ======= " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | Is Dynamic GPU Metrics Supported: " << std::boolalpha << m_is_dynamic_gpu_metrics_supported + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_DEBUG(ss); + if (m_is_dynamic_gpu_metrics_supported) { // Parse blob to schema rows AMDGpuDynamicMetrics_t AMDGpuDynamicMetrics_t parsed; - rsmi_status_t st = 
parsed.parse_from_file(file_name, m_gpu_metrics_header.m_structure_size); + rsmi_status_t st = parsed.parse_from_file(gpu_metrics_path, m_gpu_metrics_header.m_structure_size); if (st != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ @@ -4416,33 +4625,38 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data() LOG_ERROR(ss); return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; } - + // Store AMDGpuDynamicMetrics_t auto* dyn = static_cast(m_gpu_metrics_ptr.get()); status_code = dyn->set_parsed_dynamic(std::move(parsed)); - if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS){ + if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { return status_code; } } else { - auto op_result = readDevInfo(DevInfoTypes::kDevGpuMetrics, - m_gpu_metrics_header.m_structure_size, - m_gpu_metrics_ptr->get_metrics_table().get()); - if ((status_code = ErrnoToRsmiStatus(op_result)) != - rsmi_status_t::RSMI_STATUS_SUCCESS) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Cause: readDevInfo(kDevGpuMetrics)" - << " | Returning = " - << getRSMIStatusString(status_code) - << " Could not read Metrics Header: " - << print_unsigned_int(m_gpu_metrics_header.m_structure_size) - << " |"; - LOG_ERROR(ss); - return status_code; + op_result = readDevInfo(type, + m_gpu_metrics_header.m_structure_size, + m_gpu_metrics_ptr->get_metrics_table().get()); + + if ((status_code = ErrnoToRsmiStatus(op_result)) != + rsmi_status_t::RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, 
is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: readDevInfo(kDevGpuMetrics)" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " Could not read Metrics Header: " + << print_unsigned_int(m_gpu_metrics_header.m_structure_size) + << " |"; + LOG_ERROR(ss); + return status_code; } } @@ -4450,110 +4664,153 @@ rsmi_status_t Device::dev_read_gpu_metrics_all_data() status_code = m_gpu_metrics_ptr->populate_metrics_dynamic_tbl(); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); } m_gpu_metrics_updated_timestamp = actual_timestamp_in_secs(); ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header) - << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id 
+ << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(m_gpu_metrics_header, is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return status_code; } -rsmi_status_t Device::setup_gpu_metrics_reading() -{ +auto Device::setup_gpu_metrics_reading(DevInfoTypes type) -> rsmi_status_t { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - status_code = dev_read_gpu_metrics_header_data(); + status_code = dev_read_gpu_metrics_header_data(type); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { return status_code; } - const auto gpu_metrics_flag_version = translate_header_to_flag_version(dev_get_metrics_header()); + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); + // Default path (::kDevGpuMetrics / !m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/gpu_metrics + // Partition Path (::kDevGpuMetrics / m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/xcp/xcp_metrics + + std::string metric_version_str = + stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path); + + const auto gpu_metrics_flag_version = translate_header_to_flag_version(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path); if (gpu_metrics_flag_version == AMDGpuMetricVersionFlags_t::kGpuMetricNone) { status_code = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | [Translates to: " << 
join_metrics_version(dev_get_metrics_header()) - << " ] " - << " | Cause: Metric version found is not supported!" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | [Translates to: " << join_metrics_version(dev_get_metrics_header()) + << " ] " + << " | Cause: Metric version found is not supported!" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } - m_is_dynamic_gpu_metrics_supported = (static_cast>(gpu_metrics_flag_version) >= - static_cast>(AMDGpuMetricVersionFlags_t::kGpuMetricV19)); + m_is_dynamic_gpu_metrics_supported = + (static_cast>(gpu_metrics_flag_version) >= + static_cast>( + AMDGpuMetricVersionFlags_t::kGpuMetricDynV19Plus) + && !is_smi_expecting_partition_metrics()) || + (static_cast>(gpu_metrics_flag_version) >= + static_cast>( + AMDGpuMetricVersionFlags_t::kGpuXcpMetricDynV11Plus) + && is_smi_expecting_partition_metrics()); - // m_gpu_metrics_ptr.reset(); - m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version); + ss << __PRETTY_FUNCTION__ + << " | ======= P1 Start ======= " + << " | Status: Before amdgpu_metrics_factory() " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Partition ID: " << m_partition_id + << " | Is Partition Metrics: " << std::boolalpha << is_smi_expecting_partition_metrics() + << " | Is Dynamic Metrics Supported: " << std::boolalpha << m_is_dynamic_gpu_metrics_supported + << " | Metric Flag Version: ||" << metric_version_str << "||" + << " | File Path: " << gpu_metrics_path + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | 
Update Timestamp: " << m_gpu_metrics_updated_timestamp + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_DEBUG(ss); + m_gpu_metrics_ptr = amdgpu_metrics_factory(gpu_metrics_flag_version, is_smi_expecting_partition_metrics(), gpu_metrics_path); if (!m_gpu_metrics_ptr) { status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: amdgpu_metrics_factory() couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } - m_gpu_metrics_ptr->set_device_id(m_device_id); - m_gpu_metrics_ptr->set_partition_id(m_partition_id); + m_gpu_metrics_ptr->set_device_id(get_smi_device_id()); + m_gpu_metrics_ptr->set_partition_id(get_smi_partition_id()); + m_gpu_metrics_ptr->set_is_partition_metrics(is_smi_expecting_partition_metrics()); // m_gpu_metrics_ptr has the pointer to the proper object type/version. - status_code = dev_read_gpu_metrics_all_data(); + status_code = dev_read_gpu_metrics_all_data(type); if (status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: dev_read_gpu_metrics_all_data() couldn't read gpu metric data!" 
- << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: dev_read_gpu_metrics_all_data() couldn't read gpu metric data!" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Fabric: [" << &m_gpu_metrics_ptr - << " ]" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | [A] Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return status_code; } @@ -4592,30 +4849,34 @@ auto get_casted_value(const AMDGpuDynamicMetricsValue_t& metrics_value) } -rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics) { +auto Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics, + DevInfoTypes type) -> rsmi_status_t { std::ostringstream ss; std::ostringstream tmp_outstream_metrics; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); + // If we still don't have a valid gpu_metrics pointer; // meaning, we didn't run any queries, and just want to // print all the gpu metrics content, we 
need to setup // the environment first. - status_code = setup_gpu_metrics_reading(); + status_code = setup_gpu_metrics_reading(type); if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { // At this point we should have a valid gpu_metrics pointer. status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } @@ -4696,95 +4957,105 @@ rsmi_status_t Device::dev_log_gpu_metrics(std::ostringstream& outstream_metrics) return; }; - // header_output(); table_content_output(); outstream_metrics << tmp_outstream_metrics.rdbuf(); LOG_DEBUG(tmp_outstream_metrics); ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Fabric: [" << &m_gpu_metrics_ptr - << " ]" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | [B] Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << 
getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return status_code; } -AMGpuMetricsPublicLatestTupl_t Device::dev_copy_internal_to_external_metrics() -{ +auto Device::dev_copy_internal_to_external_metrics(DevInfoTypes type) + -> AMGpuMetricsPublicLatestTupl_t { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); if (!m_gpu_metrics_ptr) { // At this point we should have a valid gpu_metrics pointer. status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return std::make_tuple(status_code, AMGpuMetricsPublicLatest_t()); } ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Fabric: [" << &m_gpu_metrics_ptr - << " ]" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), 
gpu_metrics_path) + << " | [C] Fabric: [" << &m_gpu_metrics_ptr + << " ]" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return m_gpu_metrics_ptr->copy_internal_to_external_metrics(); } - -rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, AMDGpuDynamicMetricTblValues_t& values) -{ +auto Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t metric_counter, + AMDGpuDynamicMetricTblValues_t& values, + DevInfoTypes type) -> rsmi_status_t { std::ostringstream ss; auto status_code(rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED); ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); - status_code = setup_gpu_metrics_reading(); + std::string gpu_metrics_path = get_sys_file_path_by_type(type, true); + // Default path (::kDevGpuMetrics / !m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/gpu_metrics + // Partition Path (::kDevGpuMetrics / m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/xcp/xcp_metrics + status_code = setup_gpu_metrics_reading(type); if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || (!m_gpu_metrics_ptr)) { status_code = rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Cause: Couldn't get a valid metric object" - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Cause: Couldn't get a valid metric object" + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } // Lookup the dynamic 
table ss << __PRETTY_FUNCTION__ - << " | ======= info ======= " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Metric Unit: " << static_cast(metric_counter) - << " |"; + << " | ======= info ======= " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Metric Unit: " << static_cast(metric_counter) + << " |"; LOG_INFO(ss); const auto gpu_metrics_tbl = m_gpu_metrics_ptr->get_metrics_dynamic_tbl(); for (const auto& [metric_class, metric_data] : gpu_metrics_tbl) { @@ -4793,14 +5064,15 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met values = metric_values; status_code = rsmi_status_t::RSMI_STATUS_SUCCESS; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Success " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Metric Unit: " << static_cast(metric_counter) - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Success " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Metric Unit: " << static_cast(metric_counter) + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_TRACE(ss); return status_code; } @@ -4808,13 +5080,14 @@ rsmi_status_t Device::run_internal_gpu_metrics_query(AMDGpuMetricsUnitType_t met } ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << index() - << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header()) - << " | Returning = " - << getRSMIStatusString(status_code) 
- << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << index() + << " | Type: " << Device::get_type_string(type) + << " | Metric Version: " << stringfy_metrics_header(dev_get_metrics_header(), is_smi_expecting_partition_metrics(), gpu_metrics_path) + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } @@ -4870,17 +5143,17 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit status_code = dev->run_internal_gpu_metrics_query(metric_counter, tmp_values); if ((status_code != rsmi_status_t::RSMI_STATUS_SUCCESS) || tmp_values.empty()) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Metric Version: " << stringfy_metrics_header(dev->dev_get_metrics_header()) - << " | Cause: Couldn't find metric/counter requested" - << " | Metric Type: " << static_cast(metric_counter) - << " " << amdgpu_metrics_unit_type_translation_table.at(metric_counter) - << " | Values: " << tmp_values.size() - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Metric Version: " << stringfy_metrics_header(dev->dev_get_metrics_header(), false, "N/A") + << " | Cause: Couldn't find metric/counter requested" + << " | Metric Type: " << static_cast(metric_counter) + << " " << amdgpu_metrics_unit_type_translation_table.at(metric_counter) + << " | Values: " << tmp_values.size() + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } @@ -4909,7 +5182,7 @@ rsmi_status_t rsmi_dev_gpu_metrics_info_query(uint32_t dv_ind, AMDGpuMetricsUnit << " | Device #: " << dv_ind << " | Metric Type: " << static_cast(metric_counter) << " | Returning = " - << getRSMIStatusString(status_code) + << getRSMIStatusString(status_code, false) << " |"; LOG_TRACE(ss); return status_code; @@ 
-4990,22 +5263,25 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); + DevInfoTypes type = DevInfoTypes::kDevGpuMetrics; assert(smu != nullptr); if (smu == nullptr) { status_code = rsmi_status_t::RSMI_STATUS_INVALID_ARGS; ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Returning = " - << getRSMIStatusString(status_code) - << " |"; + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; LOG_ERROR(ss); return status_code; } dev->set_smi_device_id(dv_ind); + dev->set_smi_dev_info_type(type); uint32_t partition_id = 0; auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id); if (ret == RSMI_STATUS_SUCCESS) { @@ -5015,15 +5291,15 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { } // check if file exists, report not supported if it does not exist - std::string file_name = "/sys/class/drm/card" - + std::to_string(dev->index()) - + "/device/gpu_metrics"; + std::string file_name = dev->get_sys_file_path_by_type(type, true); if (access(file_name.c_str(), F_OK | R_OK) != 0) { status_code = RSMI_STATUS_NOT_SUPPORTED; ss << __PRETTY_FUNCTION__ << " | ======= end ======= " - << " | Fail " + << " | Cause: File does not exist or is not readable" << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name << " | Returning = " << getRSMIStatusString(status_code, false) << " |"; @@ -5035,12 +5311,107 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics(); if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " 
| Device #: " << dv_ind - << " | Returning = " - << getRSMIStatusString(error_code) - << " |"; + << " | ======= end ======= " + << " | Cause: Could not copy internal to external metrics" + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name + << " | Returning = " + << getRSMIStatusString(error_code, false) + << " |"; + LOG_ERROR(ss); + return error_code; + } + + *smu = external_metrics; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_INFO(ss); + + return status_code; + CATCH +} + +rsmi_status_t +rsmi_dev_gpu_partition_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { + TRY + DEVICE_MUTEX + CHK_SUPPORT_NAME_ONLY(smu) + + auto status_code(rsmi_status_t::RSMI_STATUS_SUCCESS); + thread_local std::ostringstream ostrstream; + thread_local std::ostringstream ss; + + ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(ss); + DevInfoTypes type = DevInfoTypes::kdevGpuPartitionMetrics; + + assert(smu != nullptr); + if (smu == nullptr) { + status_code = rsmi_status_t::RSMI_STATUS_INVALID_ARGS; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_ERROR(ss); + return status_code; + } + + dev->set_smi_device_id(dv_ind); + dev->set_smi_dev_info_type(type); + uint32_t partition_id = 0; + auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id); + if (ret == RSMI_STATUS_SUCCESS) { + dev->set_smi_partition_id(partition_id); + } else { + dev->set_smi_partition_id(0); + } + + // check if file exists, report not supported if it does not exist + std::string file_name = 
dev->get_sys_file_path_by_type(type, true); + // Default path (::kDevGpuMetrics / !m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/gpu_metrics + // Partition Path (::kDevGpuMetrics / m_is_partition_metrics): + // /sys/class/drm/renderDXXX/device/xcp/xcp_metrics + if (access(file_name.c_str(), F_OK | R_OK) != 0) { + status_code = RSMI_STATUS_NOT_SUPPORTED; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Cause: File does not exist or is not readable" + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name + << " | Returning = " + << getRSMIStatusString(status_code, false) + << " |"; + LOG_ERROR(ss); + return status_code; + } + + dev->dev_log_gpu_metrics(ostrstream, type); + const auto [error_code, external_metrics] + = dev->dev_copy_internal_to_external_metrics(type); + if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Cause: Could not copy internal to external metrics" + << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name + << " | Returning = " + << getRSMIStatusString(error_code, false) + << " |"; LOG_ERROR(ss); return error_code; } @@ -5050,10 +5421,12 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) { << " | ======= end ======= " << " | Success " << " | Device #: " << dv_ind + << " | Type: " << Device::get_type_string(type) + << " | File: " << file_name << " | Returning = " - << getRSMIStatusString(status_code) + << getRSMIStatusString(status_code, false) << " |"; - LOG_TRACE(ss); + LOG_INFO(ss); return status_code; CATCH diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 83a6310f5a..a1e2ba305a 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -3352,13 +3352,27 @@ amdsmi_get_gpu_metrics_header_info(amdsmi_processor_handle 
processor_handle, reinterpret_cast(header_value)); } +amdsmi_status_t amdsmi_get_gpu_partition_metrics_info( + amdsmi_processor_handle processor_handle, + amdsmi_gpu_metrics_t *pgpu_metrics) { + AMDSMI_CHECK_INIT(); + if (pgpu_metrics != nullptr) { + *pgpu_metrics = amdsmi_gpu_metrics_t{}; // Use a default initializer for the struct + } else { + return AMDSMI_STATUS_INVAL; // Return error if pgpu_metrics is null + } + return rsmi_wrapper(rsmi_dev_gpu_partition_metrics_info_get, processor_handle, 0, + reinterpret_cast(pgpu_metrics)); +} + amdsmi_status_t amdsmi_get_gpu_metrics_info( amdsmi_processor_handle processor_handle, amdsmi_gpu_metrics_t *pgpu_metrics) { AMDSMI_CHECK_INIT(); - // nullptr api supported if (pgpu_metrics != nullptr) { *pgpu_metrics = amdsmi_gpu_metrics_t{}; // Use a default initializer for the struct + } else { + return AMDSMI_STATUS_INVAL; // Return error if pgpu_metrics is null } return rsmi_wrapper(rsmi_dev_gpu_metrics_info_get, processor_handle, 0, reinterpret_cast(pgpu_metrics)); diff --git a/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt b/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt index 33bb0018b4..a8c9d224d4 100644 --- a/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt +++ b/projects/amdsmi/tests/amd_smi_test/CMakeLists.txt @@ -52,6 +52,11 @@ include_directories(${TEST} ${CMAKE_CURRENT_SOURCE_DIR}/.. ${ROCM_INC_DIR}/..) 
add_executable(${TEST} ${tstSources} ${functionalSources}) target_link_libraries(${TEST} ${AMD_SMI} GTest::gtest_main c stdc++ pthread) +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" + AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "9") + target_link_libraries(${TEST} stdc++fs) +endif() + # Install tests install( TARGETS ${TEST} diff --git a/projects/amdsmi/tests/amd_smi_test/functional/dynamic_metrics_test.cc b/projects/amdsmi/tests/amd_smi_test/functional/dynamic_metrics_test.cc new file mode 100644 index 0000000000..f1806ed113 --- /dev/null +++ b/projects/amdsmi/tests/amd_smi_test/functional/dynamic_metrics_test.cc @@ -0,0 +1,203 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include +#include + +#include +#include +#include +#include + +#include "rocm_smi/rocm_smi_gpu_metrics.h" + +namespace amd::smi { + +// Forward declarations of internal helpers we exercise in this unit-test. +AMDGpuMetricVersionFlags_t translate_header_to_flag_version( + const AMDGpuMetricsHeader_v1_t& metrics_header, bool is_partition_metrics, + const std::string& file_path); + +GpuMetricsBasePtr amdgpu_metrics_factory(AMDGpuMetricVersionFlags_t gpu_metric_version, + bool is_partition_metrics, const std::string& file_path); + +} // namespace amd::smi + +namespace { +// Version helper checker +auto GetExpectedMetricVersionFlag(uint16_t major, uint16_t minor, bool is_partition_metrics) + -> amd::smi::AMDGpuMetricVersionFlags_t { + using Flag = amd::smi::AMDGpuMetricVersionFlags_t; + if (is_partition_metrics) { + if (major == 1) { + if (minor == 0) { + return Flag::kGpuXcpMetricV10; + } else if (minor >= 1) { + return Flag::kGpuXcpMetricDynV11Plus; + } else { + return Flag::kGpuMetricNone; + } + } + } else { // GPU metrics + if (major == 1) { + switch (minor) { + case 0: return Flag::kGpuMetricNone; + case 1: return Flag::kGpuMetricV11; + case 2: return Flag::kGpuMetricV12; + case 3: return Flag::kGpuMetricV13; + case 4: return Flag::kGpuMetricV14; + case 5: return Flag::kGpuMetricV15; + case 6: return Flag::kGpuMetricV16; + case 7: return Flag::kGpuMetricV17; + case 8: return Flag::kGpuMetricV18; + default: return Flag::kGpuMetricDynV19Plus; + } + } + } + return Flag::kGpuMetricNone; +} + +// pass a header we want to test against +auto BuildFakeMetricsBlob(amd::smi::AMDGpuMetricsHeader_v1_t new_header) -> std::vector { + if (new_header.m_structure_size < sizeof(new_header)) { + throw std::runtime_error("Header size too small"); + } + amd::smi::AMDGpuMetricsHeader_v1_t header{}; + header.m_structure_size = static_cast(sizeof(header)); + header.m_format_revision = new_header.m_format_revision; + header.m_content_revision = new_header.m_content_revision; + + 
const uint8_t* begin = reinterpret_cast(&header); + return std::vector(begin, begin + sizeof(header)); +} + +auto WriteBlobToTempFile(const std::vector& blob, + const std::string& filename = "amdsmi_fake_metrics.bin") + -> std::filesystem::path { + auto temp_dir = std::filesystem::temp_directory_path(); + auto file_path = temp_dir / filename; + + std::ofstream stream(file_path, std::ios::binary | std::ios::trunc); + stream.write(reinterpret_cast(blob.data()), + static_cast(blob.size())); + stream.close(); + + return file_path; +} + +} // namespace + +TEST(AmdSmiDynamicMetricTest, GPUMetricDynamicVersionSupported) { + const bool is_partition_metrics = false; + for (auto ver : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) { + std::string test_detail = "[GPUMetric"; + if (ver >= 9) { + test_detail += "Dynamic] "; + } else { + test_detail += "] "; + } + std::cout << test_detail << "Checking version 1." << ver << std::endl; + SCOPED_TRACE(testing::Message() << "Subtest for minor version: 1." << ver); + const auto blob = BuildFakeMetricsBlob(amd::smi::AMDGpuMetricsHeader_v1_t{ + .m_structure_size = sizeof(amd::smi::AMDGpuMetricsHeader_v1_t), + .m_format_revision = 1, + .m_content_revision = static_cast(ver), // Known minor versions + }); + const auto fake_path = + WriteBlobToTempFile(blob, "amdsmi_fake_gpu_metrics_v1" + std::to_string(ver) + ".bin"); + + ASSERT_FALSE(blob.empty()); + ASSERT_TRUE(std::filesystem::exists(fake_path)); + + const auto* header = reinterpret_cast(blob.data()); + + const auto flag = amd::smi::translate_header_to_flag_version(*header, is_partition_metrics, + fake_path.string()); + + EXPECT_EQ(flag, GetExpectedMetricVersionFlag(1, ver, is_partition_metrics)) + << "Version 1." 
<< ver << " should be treated as supported"; + + auto gpu_metrics_ptr = + amd::smi::amdgpu_metrics_factory(flag, is_partition_metrics, fake_path.string()); + + if (ver != 0) { + EXPECT_NE(gpu_metrics_ptr, nullptr) + << "Factory must create metrics object for supported version"; + } else { + EXPECT_EQ(gpu_metrics_ptr, nullptr) + << "Factory must not create metrics object for unsupported versions"; + } + if (gpu_metrics_ptr) { + std::cout << test_detail << "Created valid object for version 1." << ver << std::endl; + } else { + std::cout << test_detail << "Unsupported Metric Version" + << " | Failed to create valid object for version 1." << ver << std::endl; + } + + std::filesystem::remove(fake_path); + } +} + +TEST(AmdSmiDynamicMetricTest, XCPMetricDynamicVersionSupported) { + const bool is_partition_metrics = true; + for (auto ver : {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}) { + std::string test_detail = "[XCPMetric"; + if (ver >= 1) { + test_detail += "Dynamic] "; + } else { + test_detail += "] "; + } + std::cout << test_detail << "Checking version 1." << ver << std::endl; + SCOPED_TRACE(testing::Message() << "Subtest for minor version: 1." << ver); + const auto blob = BuildFakeMetricsBlob(amd::smi::AMDGpuMetricsHeader_v1_t{ + .m_structure_size = sizeof(amd::smi::AMDGpuMetricsHeader_v1_t), + .m_format_revision = 1, + .m_content_revision = static_cast(ver), // Known minor versions + }); + const auto fake_path = + WriteBlobToTempFile(blob, "amdsmi_fake_xcp_metrics_v1" + std::to_string(ver) + ".bin"); + + ASSERT_FALSE(blob.empty()); + ASSERT_TRUE(std::filesystem::exists(fake_path)); + + const auto* header = reinterpret_cast(blob.data()); + + const auto flag = amd::smi::translate_header_to_flag_version(*header, is_partition_metrics, + fake_path.string()); + + EXPECT_EQ(flag, GetExpectedMetricVersionFlag(1, ver, is_partition_metrics)) + << "Version 1." 
<< ver << " should be treated as supported"; + + auto xcp_metrics_ptr = + amd::smi::amdgpu_metrics_factory(flag, is_partition_metrics, fake_path.string()); + + EXPECT_NE(xcp_metrics_ptr, nullptr) + << "Factory must create metrics object for supported version"; + if (xcp_metrics_ptr) { + std::cout << test_detail << "Created valid object for version 1." << ver << std::endl; + } else { + std::cout << test_detail << "Failed to create valid object for version 1." << ver + << std::endl; + } + + std::filesystem::remove(fake_path); + } +} diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc new file mode 100644 index 0000000000..8bd5e77227 --- /dev/null +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.cc @@ -0,0 +1,426 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include + +#include +#include +#include +#include + +#include +#include "amd_smi/amdsmi.h" +#include "gpu_partition_metrics_read.h" +#include "../test_common.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "amd_smi/impl/amd_smi_utils.h" + + +TestGpuPartitionMetricsRead::TestGpuPartitionMetricsRead() : TestBase() { + set_title("AMDSMI GPU Partition (XCP) Metrics Read Test"); + set_description("The GPU Partition (XCP) Metrics tests verifies that " + "the gpu metrics info can be read properly."); +} + +TestGpuPartitionMetricsRead::~TestGpuPartitionMetricsRead(void) { +} + +void TestGpuPartitionMetricsRead::SetUp(void) { + TestBase::SetUp(); + return; +} + +void TestGpuPartitionMetricsRead::DisplayTestInfo(void) { + TestBase::DisplayTestInfo(); +} + +void TestGpuPartitionMetricsRead::DisplayResults(void) const { + TestBase::DisplayResults(); + return; +} + +void TestGpuPartitionMetricsRead::Close() { + // This will close handles opened within amdsmitst utility calls and call + // amdsmi_shut_down(), so it should be done after other hsa cleanup + TestBase::Close(); +} + + + +void TestGpuPartitionMetricsRead::Run(void) { + amdsmi_status_t err; + + TestBase::Run(); + if (setup_failed_) { + std::cout << "** SetUp Failed for this test. 
Skipping.**" << std::endl; + return; + } + + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { + PrintDeviceHeader(processor_handles_[i]); + std::cout << "Device #" << std::to_string(i) << "\n"; + + IF_VERB(STANDARD) { + std::cout << "\n\n"; + std::cout << "\t**GPU PARTITION METRICS: Using static struct (Backwards Compatibility):\n"; + } + amdsmi_gpu_metrics_t smu = {}; + err = amdsmi_get_gpu_partition_metrics_info(processor_handles_[i], &smu); + const char *status_string; + amdsmi_status_code_to_string(err, &status_string); + std::cout << "\t\t** amdsmi_get_gpu_partition_metrics_info(): " << status_string + << "\n"; + if (err != AMDSMI_STATUS_SUCCESS) { + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**" << + "Not supported on this machine" << std::endl; + continue; + } + } + CHK_ERR_ASRT(err); // Anything else should be a failure + // (ie, we are not handling the metrics right/etc..) + } else { + IF_VERB(STANDARD) { + std::cout << "METRIC TABLE HEADER:\n"; + std::cout << "structure_size=" << std::dec + << static_cast(smu.common_header.structure_size) << "\n"; + std::cout << "format_revision=" << std::dec + << static_cast(smu.common_header.format_revision) << "\n"; + std::cout << "content_revision=" << std::dec + << static_cast(smu.common_header.content_revision) << "\n"; + + std::cout << "\n"; + std::cout << "TIME STAMPS (ns):\n"; + std::cout << std::dec << "system_clock_counter=" << smu.system_clock_counter << "\n"; + std::cout << "firmware_timestamp (10ns resolution)=" << std::dec << smu.firmware_timestamp + << "\n"; + + std::cout << "\n"; + std::cout << "TEMPERATURES (C):\n"; + std::cout << std::dec << "temperature_edge= " << smu.temperature_edge << "\n"; + std::cout << std::dec << "temperature_hotspot= " << smu.temperature_hotspot << "\n"; + std::cout << std::dec << "temperature_mem= " << smu.temperature_mem << "\n"; + std::cout << std::dec << "temperature_vrgfx= " << smu.temperature_vrgfx << "\n"; + std::cout << std::dec 
<< "temperature_vrsoc= " << smu.temperature_vrsoc << "\n"; + std::cout << std::dec << "temperature_vrmem= " << smu.temperature_vrmem << "\n"; + std::cout << "temperature_hbm = ["; + std::copy(std::begin(smu.temperature_hbm), + std::end(smu.temperature_hbm), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << "\n"; + std::cout << "UTILIZATION (%):\n"; + std::cout << std::dec << "average_gfx_activity=" << smu.average_gfx_activity << "\n"; + std::cout << std::dec << "average_umc_activity=" << smu.average_umc_activity << "\n"; + std::cout << std::dec << "average_mm_activity=" << smu.average_mm_activity << "\n"; + std::cout << std::dec << "vcn_activity= ["; + std::copy(std::begin(smu.vcn_activity), + std::end(smu.vcn_activity), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << "\n"; + std::cout << std::dec << "jpeg_activity= ["; + std::copy(std::begin(smu.jpeg_activity), + std::end(smu.jpeg_activity), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << "\n"; + std::cout << "POWER (W)/ENERGY (15.259uJ per 1ns):\n"; + std::cout << std::dec << "average_socket_power=" << smu.average_socket_power << "\n"; + std::cout << std::dec << "current_socket_power=" << smu.current_socket_power << "\n"; + std::cout << std::dec << "energy_accumulator=" << smu.energy_accumulator << "\n"; + + std::cout << "\n"; + std::cout << "AVG CLOCKS (MHz):\n"; + std::cout << std::dec << "average_gfxclk_frequency=" << smu.average_gfxclk_frequency + << "\n"; + std::cout << std::dec << "average_gfxclk_frequency=" << smu.average_gfxclk_frequency + << "\n"; + std::cout << std::dec << "average_uclk_frequency=" << smu.average_uclk_frequency << "\n"; + std::cout << std::dec << "average_vclk0_frequency=" << smu.average_vclk0_frequency + << "\n"; + std::cout << std::dec << "average_dclk0_frequency=" << smu.average_dclk0_frequency + << "\n"; + std::cout << 
std::dec << "average_vclk1_frequency=" << smu.average_vclk1_frequency + << "\n"; + std::cout << std::dec << "average_dclk1_frequency=" << smu.average_dclk1_frequency + << "\n"; + + std::cout << "\n"; + std::cout << "CURRENT CLOCKS (MHz):\n"; + std::cout << std::dec << "current_gfxclk=" << smu.current_gfxclk << "\n"; + std::cout << std::dec << "current_gfxclks= ["; + std::copy(std::begin(smu.current_gfxclks), + std::end(smu.current_gfxclks), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "current_socclk=" << smu.current_socclk << "\n"; + std::cout << std::dec << "current_socclks= ["; + std::copy(std::begin(smu.current_socclks), + std::end(smu.current_socclks), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "current_uclk=" << smu.current_uclk << "\n"; + std::cout << std::dec << "current_vclk0=" << smu.current_vclk0 << "\n"; + std::cout << std::dec << "current_vclk0s= ["; + std::copy(std::begin(smu.current_vclk0s), + std::end(smu.current_vclk0s), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "current_dclk0=" << smu.current_dclk0 << "\n"; + std::cout << std::dec << "current_dclk0s= ["; + std::copy(std::begin(smu.current_dclk0s), + std::end(smu.current_dclk0s), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "current_vclk1=" << smu.current_vclk1 << "\n"; + std::cout << std::dec << "current_dclk1=" << smu.current_dclk1 << "\n"; + + std::cout << "\n"; + std::cout << "TROTTLE STATUS:\n"; + std::cout << std::dec << "throttle_status=" << smu.throttle_status << "\n"; + + std::cout << "\n"; + std::cout << "FAN SPEED:\n"; + std::cout << std::dec << "current_fan_speed=" << smu.current_fan_speed << "\n"; + + std::cout << "\n"; + std::cout << "LINK WIDTH (number of lanes) /SPEED (0.1 GT/s):\n"; + std::cout << 
"pcie_link_width=" << smu.pcie_link_width << "\n"; + std::cout << "pcie_link_speed=" << smu.pcie_link_speed << "\n"; + std::cout << "xgmi_link_width=" << smu.xgmi_link_width << "\n"; + std::cout << "xgmi_link_speed=" << smu.xgmi_link_speed << "\n"; + + std::cout << "\n"; + std::cout << "Utilization Accumulated(%):\n"; + std::cout << "gfx_activity_acc=" << std::dec << smu.gfx_activity_acc << "\n"; + std::cout << "mem_activity_acc=" << std::dec << smu.mem_activity_acc << "\n"; + + std::cout << "\n"; + std::cout << "XGMI ACCUMULATED DATA TRANSFER SIZE (KB):\n"; + std::cout << std::dec << "xgmi_read_data_acc= ["; + std::copy(std::begin(smu.xgmi_read_data_acc), + std::end(smu.xgmi_read_data_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "xgmi_write_data_acc= ["; + std::copy(std::begin(smu.xgmi_write_data_acc), + std::end(smu.xgmi_write_data_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + std::cout << std::dec << "xgmi_link_status= ["; + std::copy(std::begin(smu.xgmi_link_status), + std::end(smu.xgmi_link_status), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << std::dec << "]\n"; + + // Voltage (mV) + std::cout << "voltage_soc = " << std::dec << smu.voltage_soc << "\n"; + std::cout << "voltage_gfx = " << std::dec << smu.voltage_gfx << "\n"; + std::cout << "voltage_mem = " << std::dec << smu.voltage_mem << "\n"; + + std::cout << "indep_throttle_status = " << std::dec << smu.indep_throttle_status << "\n"; + + // Clock Lock Status. 
Each bit corresponds to clock instance + std::cout << "gfxclk_lock_status (in hex) = " << std::hex + << smu.gfxclk_lock_status << std::dec <<"\n"; + + // Bandwidth (GB/sec) + std::cout << "pcie_bandwidth_acc=" << std::dec << smu.pcie_bandwidth_acc << "\n"; + std::cout << "pcie_bandwidth_inst=" << std::dec << smu.pcie_bandwidth_inst << "\n"; + + // VRAM max bandwidth at max memory clock (GB/sec) + std::cout << "vram_max_bandwidth=" << std::dec << smu.vram_max_bandwidth << "\n"; + + // Counts + std::cout << "pcie_l0_to_recov_count_acc= " << std::dec << smu.pcie_l0_to_recov_count_acc + << "\n"; + std::cout << "pcie_replay_count_acc= " << std::dec << smu.pcie_replay_count_acc << "\n"; + std::cout << "pcie_replay_rover_count_acc= " << std::dec + << smu.pcie_replay_rover_count_acc << "\n"; + std::cout << "pcie_nak_sent_count_acc= " << std::dec << smu.pcie_nak_sent_count_acc + << "\n"; + std::cout << "pcie_nak_rcvd_count_acc= " << std::dec << smu.pcie_nak_rcvd_count_acc + << "\n"; + + // Accumulation cycle counter + // Accumulated throttler residencies + std::cout << "\n"; + std::cout << "RESIDENCY ACCUMULATION / COUNTER:\n"; + std::cout << "accumulation_counter = " << std::dec << smu.accumulation_counter << "\n"; + std::cout << "prochot_residency_acc = " << std::dec << smu.prochot_residency_acc << "\n"; + std::cout << "ppt_residency_acc = " << std::dec << smu.ppt_residency_acc << "\n"; + std::cout << "socket_thm_residency_acc = " << std::dec << smu.socket_thm_residency_acc + << "\n"; + std::cout << "vr_thm_residency_acc = " << std::dec << smu.vr_thm_residency_acc + << "\n"; + std::cout << "hbm_thm_residency_acc = " << std::dec << smu.hbm_thm_residency_acc << "\n"; + + // Number of current partitions + std::cout << "num_partition = " << std::dec << smu.num_partition << "\n"; + + // PCIE other end recovery counter + std::cout << "pcie_lc_perf_other_end_recovery = " + << std::dec << smu.pcie_lc_perf_other_end_recovery << "\n"; + + std::cout << std::dec << 
"xcp_stats.gfx_busy_inst = \n"; + auto xcp = 0; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_busy_inst), + std::end(row.gfx_busy_inst), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.jpeg_busy = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.jpeg_busy), + std::end(row.jpeg_busy), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.vcn_busy = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.vcn_busy), + std::end(row.vcn_busy), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_busy_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_busy_acc), + std::end(row.gfx_busy_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_acc), + std::end(row.gfx_below_host_limit_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + // new for gpu metrics v1.8 + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_ppt_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_ppt_acc), + std::end(row.gfx_below_host_limit_ppt_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << 
"xcp_stats.gfx_below_host_limit_thm_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_thm_acc), + std::end(row.gfx_below_host_limit_thm_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_low_utilization_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_low_utilization_acc), + std::end(row.gfx_low_utilization_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + xcp = 0; + std::cout << std::dec << "xcp_stats.gfx_below_host_limit_total_acc = \n"; + for (auto& row : smu.xcp_stats) { + std::cout << "XCP[" << xcp << "] = " << "[ "; + std::copy(std::begin(row.gfx_below_host_limit_total_acc), + std::end(row.gfx_below_host_limit_total_acc), + amd::smi::make_ostream_joiner(&std::cout, ", ")); + std::cout << " ]\n"; + xcp++; + } + + std::cout << "\n\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + amdsmi_gpu_metrics_t gpu_xcp_metrics_check = {}; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_metrics_info(processor_handles_[i], &gpu_xcp_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx << "/" << kMAX_ITER_TEST << "]: " + << gpu_xcp_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + amdsmi_get_gpu_partition_metrics_info(processor_handles_[i], &gpu_xcp_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx << "/" << kMAX_ITER_TEST << "]: " + << gpu_xcp_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n"; + std::cout << " ** Note: Values MAX'ed out " + << "(UINTX MAX are unsupported for the version in question) ** " << "\n\n"; + } + 
} + + // Verify api support checking functionality is working + err = amdsmi_get_gpu_partition_metrics_info(processor_handles_[i], nullptr); + if (err !=AMDSMI_STATUS_INVAL) { + DISPLAY_AMDSMI_ERR(err); + } + amdsmi_status_code_to_string(err, &status_string); + std::cout << "\t\t** amdsmi_get_gpu_partition_metrics_info(nullptr check): " << status_string << "\n"; + ASSERT_EQ(err, AMDSMI_STATUS_INVAL); + } +} diff --git a/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.h b/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.h new file mode 100644 index 0000000000..d1cd55aedb --- /dev/null +++ b/projects/amdsmi/tests/amd_smi_test/functional/gpu_partition_metrics_read.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#ifndef TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_PARTITION_METRICS_READ_H_ +#define TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_PARTITION_METRICS_READ_H_ + +#include "../test_base.h" + +class TestGpuPartitionMetricsRead : public TestBase { + public: + TestGpuPartitionMetricsRead(); + + // @Brief: Destructor for test case of TestGpuPartitionMetricsRead + virtual ~TestGpuPartitionMetricsRead(); + + // @Brief: Set up the environment for measurement + virtual void SetUp(); + + // @Brief: Core measurement execution + virtual void Run(); + + // @Brief: Clean up and retrieve the resource + virtual void Close(); + + // @Brief: Display results + virtual void DisplayResults() const; + + // @Brief: Display information about what this test does + virtual void DisplayTestInfo(void); +}; + +#endif // TESTS_AMD_SMI_TEST_FUNCTIONAL_GPU_PARTITION_METRICS_READ_H_ diff --git a/projects/amdsmi/tests/amd_smi_test/main.cc b/projects/amdsmi/tests/amd_smi_test/main.cc index 8e9c549797..c07285dfdb 100644 --- a/projects/amdsmi/tests/amd_smi_test/main.cc +++ b/projects/amdsmi/tests/amd_smi_test/main.cc @@ -37,6 +37,7 @@ #include "functional/process_info_read.h" #include "functional/gpu_busy_read.h" #include "functional/gpu_metrics_read.h" +#include "functional/gpu_partition_metrics_read.h" #include "functional/err_cnt_read.h" #include "functional/power_read.h" #include "functional/power_read_write.h" @@ -224,6 +225,10 @@ TEST(amdsmitstReadOnly, TestGpuMetricsRead) { TestGpuMetricsRead tst; RunGenericTest(&tst); } +TEST(amdsmitstReadOnly, TestGpuPartitionMetricsRead) { + TestGpuPartitionMetricsRead tst; + RunGenericTest(&tst); +} TEST(amdsmitstReadOnly, TestMetricsCounterRead) { TestMetricsCounterRead tst; RunGenericTest(&tst); diff --git a/projects/amdsmi/tests/amd_smi_test/test_base.cc b/projects/amdsmi/tests/amd_smi_test/test_base.cc index 5320c297fd..c2d65262a6 100644 --- a/projects/amdsmi/tests/amd_smi_test/test_base.cc +++ b/projects/amdsmi/tests/amd_smi_test/test_base.cc @@ -282,7 +282,23 @@ void 
TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) { } } - std::cout << std::setbase(10); + amdsmi_kfd_info_t kfd_info; + err = amdsmi_get_gpu_kfd_info(dv_ind, &kfd_info); + if (err == AMDSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**KFD info: " << smi_amdgpu_get_status_string(err, false) << std::endl; + } + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + } else { + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**KFD info: " << std::endl; + std::cout << "\t\t**GPU ID: " << std::dec << kfd_info.kfd_id << std::endl; + std::cout << "\t\t**Node ID: " << std::dec << kfd_info.node_id << std::endl; + std::cout << "\t\t**Partition ID: " + << std::dec << kfd_info.current_partition_id << std::endl; + } + } } void TestBase::Run(void) { std::string label; diff --git a/projects/amdsmi/tests/python_unittest/unit_tests.py b/projects/amdsmi/tests/python_unittest/unit_tests.py index a502b100a7..08532ff985 100755 --- a/projects/amdsmi/tests/python_unittest/unit_tests.py +++ b/projects/amdsmi/tests/python_unittest/unit_tests.py @@ -1581,8 +1581,6 @@ class TestAmdSmiPython(unittest.TestCase): def test_get_gpu_metrics_info(self): self._print_func_name('') - if self.TODO_SKIP_FAIL: - self.skipTest("Skipping test_get_gpu_metrics_info as it fails (MI350X, AMDSMI_STATUS_UNEXPECTED_DATA).") for i, gpu in enumerate(self.processors): msg = f'gpu({i}):' try: @@ -1595,6 +1593,19 @@ class TestAmdSmiPython(unittest.TestCase): raise self.raise_exception return + def test_get_gpu_partition_metrics_info(self): + self._print_func_name('') + for i, gpu in enumerate(self.processors): + try: + msg = f'gpu({i}): ' + ret = amdsmi.amdsmi_get_gpu_partition_metrics_info(gpu) + self._print(msg, ret) + except amdsmi.AmdSmiLibraryException as e: + if self._check_ret(msg, e, self.PASS): + self.raise_exception = e + if self.raise_exception: + raise self.raise_exception + def test_get_gpu_od_volt_curve_regions(self): self._print_func_name('') num_region = 10 @@ -3110,6 
+3121,8 @@ class TestAmdSmiPython(unittest.TestCase): def test_set_gpu_perf_level(self): self._print_func_name('') + if self.TODO_SKIP_NOT_COMPLETE: + self.skipTest("Skipping test_set_gpu_perf_level as it is not complete.") dev_perf_level_current = self.dev_perf_levels[0][1] for i, gpu in enumerate(self.processors): msg = f'gpu({i}):'