From df9d5d3ee5725bd8af4d365aeb5b4b3541cc5544 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Wed, 11 Sep 2024 09:42:32 -0500 Subject: [PATCH] [SWDEV-483526] Fix MI3x partitions not showing all logical nodes Changes: - Updates to amdsmi_asic_info_t structure to include: target_graphics_version, kfd_id, node_id, partition_id - Updates to amd-smi static --asic to display new amdsmi_asic_info_t fields - Updates to gpu enumeration during amdsmi_init() to discover all logical GPUs when in a non-SPX mode (ex. DPX, TPX, QPX, or CPX) - Updates to amdsmi_get_gpu_bdf_id(..) to include partition_id details when in BDF or optional bits. - bits [63:32] = domain - bits [31:28] or bits [2:0] = partition id - bits [27:16] = reserved - bits [15:8] = Bus - bits [7:3] = Device - bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes - C++/Python tests updated to reflect these outputs Change-Id: I4be0ea35bb98f3109ae2ca9e82f6b21baa38de29 Signed-off-by: Charis Poag [ROCm/amdsmi commit: a33e4c9e14219376a9a85ea58c96e11d983fec06] --- projects/amdsmi/CHANGELOG.md | 147 +- projects/amdsmi/include/amd_smi/amdsmi.h | 46 +- .../amdsmi/py-interface/amdsmi_interface.py | 6 +- .../amdsmi/py-interface/amdsmi_wrapper.py | 26 +- projects/amdsmi/pytest/integration_test.py | 8 + .../rocm_smi/example/rocm_smi_example.cc | 242 +-- .../rocm_smi/include/rocm_smi/rocm_smi.h | 1157 ++--------- .../rocm_smi/include/rocm_smi/rocm_smi_kfd.h | 5 + .../include/rocm_smi/rocm_smi_utils.h | 4 + projects/amdsmi/rocm_smi/src/rocm_smi.cc | 1742 +++-------------- .../amdsmi/rocm_smi/src/rocm_smi_device.cc | 2 +- projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc | 71 +- projects/amdsmi/rocm_smi/src/rocm_smi_main.cc | 158 +- .../amdsmi/rocm_smi/src/rocm_smi_utils.cc | 24 +- projects/amdsmi/src/amd_smi/amd_smi.cc | 40 +- projects/amdsmi/src/amd_smi/amd_smi_drm.cc | 34 +- .../amd_smi_test/functional/sys_info_read.cc | 30 +- .../amd_smi_test/functional/temp_read.cc | 3 +- 18 files changed, 971
insertions(+), 2774 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 7fe1b40c5e..40fc986d49 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -175,6 +175,98 @@ Legend: 64,32 = 64 bit and 32 bit atomic support - ``` +- **Added Target_Graphics_Version, KFD_ID, Node_id, and partition id to `amd-smi static --asic`**. +Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers +were placed within the `amdsmi_asic_info_t` struct. These new fields are only available for BM/Guest Linux +devices at this time. + +```C +typedef struct { + char market_name[AMDSMI_256_LENGTH]; + uint32_t vendor_id; //< Use 32 bit to be compatible with other platform. + char vendor_name[AMDSMI_MAX_STRING_LENGTH]; + uint32_t subvendor_id; //< The subsystem vendor id + uint64_t device_id; //< The device id of a GPU + uint32_t rev_id; + char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; + uint32_t oam_id; //< 0xFFFF if not supported + uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported + uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t partition_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[17]; +} amdsmi_asic_info_t; +``` + +```shell +$ amd-smi static --asic --board --bus --partition +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. 
[AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + TARGET_GRAPHICS_VERSION: gfx942 + KFD_ID: 24248 + NODE_ID: 2 + PARTITION_ID: 0 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + BUS: + BDF: 0000:0A:00.0 + MAX_PCIE_WIDTH: 16 + MAX_PCIE_SPEED: 32 GT/s + PCIE_INTERFACE_VERSION: Gen 5 + SLOT_TYPE: PCIE + BOARD: + MODEL_NUMBER: 102-G30218-00 + PRODUCT_SERIAL: 692432000576 + FRU_ID: 113-AMDG302180002-0000000000000 + PRODUCT_NAME: AMD Instinct MI308X OAM + MANUFACTURER_NAME: AMD + PARTITION: + COMPUTE_PARTITION: CPX + MEMORY_PARTITION: NPS4 + +GPU: 1 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + TARGET_GRAPHICS_VERSION: gfx942 + KFD_ID: 41657 + NODE_ID: 3 + PARTITION_ID: 1 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + BUS: + BDF: 0000:0A:00.1 + MAX_PCIE_WIDTH: 16 + MAX_PCIE_SPEED: 32 GT/s + PCIE_INTERFACE_VERSION: Gen 5 + SLOT_TYPE: PCIE + BOARD: + MODEL_NUMBER: 102-G30218-00 + PRODUCT_SERIAL: 692432000576 + FRU_ID: 113-AMDG302180002-0000000000000 + PRODUCT_NAME: AMD Instinct MI308X OAM + MANUFACTURER_NAME: AMD + PARTITION: + COMPUTE_PARTITION: CPX + MEMORY_PARTITION: NPS4 +... +``` + ### Removals @@ -186,7 +278,58 @@ Legend: ### Resolved issues -- N/A +- **Fixed CPX not showing total number of logical GPUs**. +Updates were made to `amdsmi_init()` and `amdsmi_get_gpu_bdf_id(..)`. In order to display all logical devices, we needed a way to provide an order for the enumerated GPUs. This was done +by adding a partition_id within the BDF optional pci_id bits. + +Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `amdsmi_get_gpu_bdf_id(..)`, we provided this fallback to properly retrieve partition ID. We +plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description.
+ + - bits [63:32] = domain + - bits [31:28] or bits [2:0] = partition id + - bits [27:16] = reserved + - bits [15:8] = Bus + - bits [7:3] = Device + - bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + +Previously in non-SPX modes (ex. CPX/TPX/DPX/etc) some MI3x ASICs would not report all logical GPU devices within AMD SMI. + +```shell +$ amd-smi monitor -p -t -v +GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL + 0 248 W 55 °C 48 °C 283 MB 196300 MB + 1 247 W 55 °C 48 °C 283 MB 196300 MB + 2 247 W 55 °C 48 °C 283 MB 196300 MB + 3 247 W 55 °C 48 °C 283 MB 196300 MB + 4 221 W 50 °C 42 °C 283 MB 196300 MB + 5 221 W 50 °C 42 °C 283 MB 196300 MB + 6 222 W 50 °C 42 °C 283 MB 196300 MB + 7 221 W 50 °C 42 °C 283 MB 196300 MB + 8 239 W 53 °C 46 °C 283 MB 196300 MB + 9 239 W 53 °C 46 °C 283 MB 196300 MB + 10 239 W 53 °C 46 °C 283 MB 196300 MB + 11 239 W 53 °C 46 °C 283 MB 196300 MB + 12 219 W 51 °C 48 °C 283 MB 196300 MB + 13 219 W 51 °C 48 °C 283 MB 196300 MB + 14 219 W 51 °C 48 °C 283 MB 196300 MB + 15 219 W 51 °C 48 °C 283 MB 196300 MB + 16 222 W 51 °C 47 °C 283 MB 196300 MB + 17 222 W 51 °C 47 °C 283 MB 196300 MB + 18 222 W 51 °C 47 °C 283 MB 196300 MB + 19 222 W 51 °C 48 °C 283 MB 196300 MB + 20 241 W 55 °C 48 °C 283 MB 196300 MB + 21 241 W 55 °C 48 °C 283 MB 196300 MB + 22 241 W 55 °C 48 °C 283 MB 196300 MB + 23 240 W 55 °C 48 °C 283 MB 196300 MB + 24 211 W 51 °C 45 °C 283 MB 196300 MB + 25 211 W 51 °C 45 °C 283 MB 196300 MB + 26 211 W 51 °C 45 °C 283 MB 196300 MB + 27 211 W 51 °C 45 °C 283 MB 196300 MB + 28 227 W 51 °C 49 °C 283 MB 196300 MB + 29 227 W 51 °C 49 °C 283 MB 196300 MB + 30 227 W 51 °C 49 °C 283 MB 196300 MB + 31 227 W 51 °C 49 °C 283 MB 196300 MB +``` ### Known issues @@ -829,7 +972,7 @@ $ /opt/rocm/bin/amd-smi topology -a -t --json Previously our reset could attempting to reset non-amd GPUS- resuting in "Unable to reset non-amd GPU" error. Fix updates CLI to target only AMD ASICs. 
-- **Fix for `amd-smi metric --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**. +- **Fix for `amd-smi static --pcie` and `amdsmi_get_pcie_info()`Navi32/31 cards**. Updated API to include `amdsmi_card_form_factor_t.AMDSMI_CARD_FORM_FACTOR_CEM`. Prevously, this would report "UNKNOWN". This fix provides the correct board `SLOT_TYPE` associated with these ASICs (and other Navi cards). diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index e9c415339d..1a8f5637a2 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -281,16 +281,16 @@ typedef enum { */ typedef enum { AMDSMI_COMPUTE_PARTITION_INVALID = 0, - AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - AMDSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - AMDSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - AMDSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - AMDSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory } amdsmi_compute_partition_type_t; /** @@ -589,7 +589,11 @@ typedef struct { char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if 
not supported - uint32_t reserved[17]; + uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t partition_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[11]; } amdsmi_asic_info_t; typedef enum { @@ -2233,16 +2237,18 @@ amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, * * The format of @p bdfid will be as follows: * - * BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) | - * ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7) + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((Partition & 0xF) << 28) + * | ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) + * | (FUNCTION & 0x7) * - * | Name | Field | - * ---------- | ------- | - * | Domain | [64:32] | - * | Reserved | [31:16] | - * | Bus | [15: 8] | - * | Device | [ 7: 3] | - * | Function | [ 2: 0] | + * | Name | Field | KFD property KFD -> PCIe ID (uint64_t) + * -------------- | ------- | ---------------- | ---------------------------- | + * | Domain | [63:32] | "domain" | (DOMAIN & 0xFFFFFFFF) << 32 | + * | Partition id | [31:28] | "location id" | (LOCATION & 0xF0000000) | + * | Reserved | [27:16] | "location id" | N/A | + * | Bus | [15: 8] | "location id" | (LOCATION & 0xFF00) | + * | Device | [ 7: 3] | "location id" | (LOCATION & 0xF8) | + * | Function | [ 2: 0] | "location id" | (LOCATION & 0x7) | * * @param[in] processor_handle a processor handle * diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index c91319662a..07188e7902 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -1664,7 +1664,11 @@ def amdsmi_get_gpu_asic_info( "rev_id": _padHexValue(hex(asic_info_struct.rev_id), 2), "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": asic_info_struct.oam_id, - "num_compute_units": 
asic_info_struct.num_of_compute_units + "num_compute_units": asic_info_struct.num_of_compute_units, + "target_graphics_version": "gfx" + str(asic_info_struct.target_graphics_version), + "kfd_id": asic_info_struct.kfd_id, + "node_id": asic_info_struct.node_id, + "partition_id": asic_info_struct.partition_id } string_values = ["market_name", "vendor_name"] diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 8d8b0aa739..15fc5f2ac7 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -380,18 +380,18 @@ amdsmi_clk_type_t = ctypes.c_uint32 # enum # values for enumeration 'amdsmi_compute_partition_type_t' amdsmi_compute_partition_type_t__enumvalues = { 0: 'AMDSMI_COMPUTE_PARTITION_INVALID', - 1: 'AMDSMI_COMPUTE_PARTITION_CPX', - 2: 'AMDSMI_COMPUTE_PARTITION_SPX', - 3: 'AMDSMI_COMPUTE_PARTITION_DPX', - 4: 'AMDSMI_COMPUTE_PARTITION_TPX', - 5: 'AMDSMI_COMPUTE_PARTITION_QPX', + 1: 'AMDSMI_COMPUTE_PARTITION_SPX', + 2: 'AMDSMI_COMPUTE_PARTITION_DPX', + 3: 'AMDSMI_COMPUTE_PARTITION_TPX', + 4: 'AMDSMI_COMPUTE_PARTITION_QPX', + 5: 'AMDSMI_COMPUTE_PARTITION_CPX', } AMDSMI_COMPUTE_PARTITION_INVALID = 0 -AMDSMI_COMPUTE_PARTITION_CPX = 1 -AMDSMI_COMPUTE_PARTITION_SPX = 2 -AMDSMI_COMPUTE_PARTITION_DPX = 3 -AMDSMI_COMPUTE_PARTITION_TPX = 4 -AMDSMI_COMPUTE_PARTITION_QPX = 5 +AMDSMI_COMPUTE_PARTITION_SPX = 1 +AMDSMI_COMPUTE_PARTITION_DPX = 2 +AMDSMI_COMPUTE_PARTITION_TPX = 3 +AMDSMI_COMPUTE_PARTITION_QPX = 4 +AMDSMI_COMPUTE_PARTITION_CPX = 5 amdsmi_compute_partition_type_t = ctypes.c_uint32 # enum # values for enumeration 'amdsmi_memory_partition_type_t' @@ -902,7 +902,13 @@ struct_amdsmi_asic_info_t._fields_ = [ ('asic_serial', ctypes.c_char * 32), ('oam_id', ctypes.c_uint32), ('num_of_compute_units', ctypes.c_uint32), + ('PADDING_0', ctypes.c_ubyte * 4), + ('target_graphics_version', ctypes.c_uint64), + ('kfd_id', ctypes.c_uint64), + ('node_id', ctypes.c_uint32), 
+ ('partition_id', ctypes.c_uint32), ('reserved', ctypes.c_uint32 * 17), + ('PADDING_1', ctypes.c_ubyte * 4), ] amdsmi_asic_info_t = struct_amdsmi_asic_info_t diff --git a/projects/amdsmi/pytest/integration_test.py b/projects/amdsmi/pytest/integration_test.py index 71de1f7114..2a3367323c 100755 --- a/projects/amdsmi/pytest/integration_test.py +++ b/projects/amdsmi/pytest/integration_test.py @@ -509,6 +509,14 @@ def walk_through(self): asic_info['asic_serial'])) print(" asic_info['oam_id'] is: {}\n".format( asic_info['oam_id'])) + print(" asic_info['target_graphics_version'] is: {}\n".format( + asic_info['target_graphics_version'])) + print(" asic_info['kfd_id'] is: {}\n".format( + asic_info['kfd_id'])) + print(" asic_info['node_id'] is: {}\n".format( + asic_info['node_id'])) + print(" asic_info['partition_id'] is: {}\n".format( + asic_info['partition_id'])) print("###Test amdsmi_get_power_cap_info \n") power_info = amdsmi.amdsmi_get_power_cap_info(processors[i]) print(" power_info['dpm_cap'] is: {}".format( diff --git a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc index fa54728b5e..247bdd2aba 100755 --- a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc +++ b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc @@ -53,6 +53,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -730,30 +731,6 @@ template constexpr float convert_mw_to_w(T mw) { return static_cast(mw / 1000.0); } -template -auto print_error_or_value(rsmi_status_t status_code, const T& metric) { - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - if constexpr (std::is_array_v) { - auto idx = uint16_t(0); - auto str_values = std::string(); - const auto num_elems = static_cast(std::end(metric) - std::begin(metric)); - str_values = ("\n\t\t num of values: " + std::to_string(num_elems) + "\n"); - for (const auto& el : metric) { - str_values += "\t\t [" + std::to_string(idx) 
+ "]: " + std::to_string(el) + "\n"; - ++idx; - } - return str_values; - } - else if constexpr ((std::is_same_v) || - (std::is_same_v) || - (std::is_same_v)) { - return std::to_string(metric); - } - } - else { - return ("\n\t\tStatus: [" + std::to_string(status_code) + "] " + "-> " + amd::smi::getRSMIStatusString(status_code)); - } -}; template std::string print_unsigned_int(T value) { @@ -780,6 +757,7 @@ int main() { uint32_t num_monitor_devs = 0; rsmi_gpu_metrics_t gpu_metrics; std::string val_str; + RSMI_POWER_TYPE power_type = RSMI_INVALID_POWER; rsmi_num_monitor_devices(&num_monitor_devs); @@ -791,13 +769,23 @@ int main() { ret = rsmi_dev_revision_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << "\n"; - ret = amd::smi::rsmi_get_gfx_target_version(i , &val_str); - std::cout << "\t**Target Graphics Version: " << val_str << "\n"; - - char pcie_vendor_name[256]; - ret = rsmi_dev_pcie_vendor_name_get(i, pcie_vendor_name, 256); - CHK_RSMI_RET_I(ret) - std::cout << "\t**PCIe vendor name: " << pcie_vendor_name << std::endl; + ret = rsmi_dev_target_graphics_version_get(i, &val_ui64); + std::cout << "\t**Target Graphics Version: " << std::dec + << static_cast(val_ui64) << "\n"; + ret = rsmi_dev_guid_get(i, &val_ui64); + std::cout << "\t**GUID: " << std::dec + << static_cast(val_ui64) << "\n"; + ret = rsmi_dev_node_id_get(i, &val_ui32); + std::cout << "\t**Node ID: " << std::dec + << static_cast(val_ui32) << "\n"; + char vbios_version[256]; + ret = rsmi_dev_vbios_version_get(i, vbios_version, 256); + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << "\t**VBIOS Version: " << vbios_version << "\n"; + } else { + std::cout << "\t**VBIOS Version: " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } char current_compute_partition[256]; current_compute_partition[0] = '\0'; @@ -848,8 +836,9 @@ int main() { // std::cout << "\n"; print_test_header("GPU METRICS: Using static struct (Backwards Compatibility) ", i); - 
print_function_header_with_rsmi_ret(ret, "rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &gpu_metrics)"); - rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics); + ret = rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics); + print_function_header_with_rsmi_ret(ret, "rsmi_dev_gpu_metrics_info_get(" + + std::to_string(i) + ", &gpu_metrics)"); std::cout << "\t**.common_header.format_revision : " << print_unsigned_int(gpu_metrics.common_header.format_revision) << "\n"; @@ -988,173 +977,58 @@ int main() { for (const auto& dclk : gpu_metrics.current_dclk0s) { std::cout << "\t -> " << std::dec << dclk << "\n"; } - std::cout << " ** Note: Values MAX'ed out (UINTX MAX are unsupported for the version in question) ** " << "\n"; + + std::cout << "\n"; + std::cout << "\t ** -> Checking metrics with constant changes ** " << "\n"; + constexpr uint16_t kMAX_ITER_TEST = 10; + rsmi_gpu_metrics_t gpu_metrics_check; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> firmware_timestamp [" << idx + << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.firmware_timestamp << "\n"; + } + + std::cout << "\n"; + for (auto idx = uint16_t(1); idx <= kMAX_ITER_TEST; ++idx) { + rsmi_dev_gpu_metrics_info_get(i, &gpu_metrics_check); + std::cout << "\t\t -> system_clock_counter [" << idx + << "/" << kMAX_ITER_TEST << "]: " << gpu_metrics_check.system_clock_counter << "\n"; + } + + std::cout << "\n\n"; + std::cout << " ** Note: Values MAX'ed out " + "(UINTX MAX are unsupported for the version in question) ** " << "\n"; + std::cout << "\n\n"; print_test_header("GPU METRICS: Using direct APIs (newer)", i); metrics_table_header_t header_values; - GPUMetricTempHbm_t hbm_values; - GPUMetricVcnActivity_t vcn_values; - GPUMetricXgmiReadDataAcc_t xgmi_read_values; - GPUMetricXgmiWriteDataAcc_t xgmi_write_values; - GPUMetricCurrGfxClk_t curr_gfxclk_values; - GPUMetricCurrSocClk_t curr_socclk_values; - 
GPUMetricCurrVClk0_t curr_vclk0_values; - GPUMetricCurrDClk0_t curr_dclk0_values; ret = rsmi_dev_metrics_header_info_get(i, &header_values); std::cout << "\t[Metrics Header]" << "\n"; - std::cout << "\t -> format_revision : " << print_unsigned_int(header_values.format_revision) << "\n"; - std::cout << "\t -> content_revision : " << print_unsigned_int(header_values.content_revision) << "\n"; + std::cout << "\t -> format_revision : " + << print_unsigned_int(header_values.format_revision) << "\n"; + std::cout << "\t -> content_revision : " + << print_unsigned_int(header_values.content_revision) << "\n"; std::cout << "\t--------------------" << "\n"; - std::cout << "\n"; - std::cout << "\t[Temperature]" << "\n"; - ret = rsmi_dev_metrics_temp_edge_get(i, &val_ui16); - std::cout << "\t -> temp_edge(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_hotspot_get(i, &val_ui16); - std::cout << "\t -> temp_hotspot(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_mem_get(i, &val_ui16); - std::cout << "\t -> temp_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrgfx_get(i, &val_ui16); - std::cout << "\t -> temp_vrgfx(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrsoc_get(i, &val_ui16); - std::cout << "\t -> temp_vrsoc(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_vrmem_get(i, &val_ui16); - std::cout << "\t -> temp_vrmem(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_temp_hbm_get(i, &hbm_values); - std::cout << "\t -> temp_hbm(): " << print_error_or_value(ret, hbm_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Power/Energy]" << "\n"; - ret = rsmi_dev_metrics_curr_socket_power_get(i, &val_ui16); - std::cout << "\t -> current_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_energy_acc_get(i, &val_ui64); - std::cout << 
"\t -> energy_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_avg_socket_power_get(i, &val_ui16); - std::cout << "\t -> average_socket_power(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Utilization]" << "\n"; - ret = rsmi_dev_metrics_avg_gfx_activity_get(i, &val_ui16); - std::cout << "\t -> average_gfx_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_umc_activity_get(i, &val_ui16); - std::cout << "\t -> average_umc_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_mm_activity_get(i, &val_ui16); - std::cout << "\t -> average_mm_activity(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_vcn_activity_get(i, &vcn_values); - std::cout << "\t -> vcn_activity(): " << print_error_or_value(ret, vcn_values) << "\n"; - ret = rsmi_dev_metrics_mem_activity_acc_get(i, &val_ui32); - std::cout << "\t -> mem_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; - ret = rsmi_dev_metrics_gfx_activity_acc_get(i, &val_ui32); - std::cout << "\t -> gfx_activity_accum(): " << print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Average Clock]" << "\n"; - ret = rsmi_dev_metrics_avg_gfx_clock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_gfx_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_soc_clock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_soc_clock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_uclock_frequency_get(i, &val_ui16); - std::cout << "\t -> average_uclock_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_vclock0_frequency_get(i, &val_ui16); - std::cout << "\t -> average_vclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = 
rsmi_dev_metrics_avg_dclock0_frequency_get(i, &val_ui16); - std::cout << "\t -> average_dclock0_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_vclock1_frequency_get(i, &val_ui16); - std::cout << "\t -> average_vclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_avg_dclock1_frequency_get(i, &val_ui16); - std::cout << "\t -> average_dclock1_frequency(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Current Clock]" << "\n"; - ret = rsmi_dev_metrics_curr_vclk1_get(i, &val_ui16); - std::cout << "\t -> current_vclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_dclk1_get(i, &val_ui16); - std::cout << "\t -> current_dclock1(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_uclk_get(i, &val_ui16); - std::cout << "\t -> current_uclock(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_curr_dclk0_get(i, &curr_dclk0_values); - std::cout << "\t -> current_dclk0(): " << print_error_or_value(ret, curr_dclk0_values) << "\n"; - ret = rsmi_dev_metrics_curr_gfxclk_get(i, &curr_gfxclk_values); - std::cout << "\t -> current_gfxclk(): " << print_error_or_value(ret, curr_gfxclk_values) << "\n"; - ret = rsmi_dev_metrics_curr_socclk_get(i, &curr_socclk_values); - std::cout << "\t -> current_soc_clock(): " << print_error_or_value(ret, curr_socclk_values) << "\n"; - ret = rsmi_dev_metrics_curr_vclk0_get(i, &curr_vclk0_values); - std::cout << "\t -> current_vclk0(): " << print_error_or_value(ret, curr_vclk0_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Throttle]" << "\n"; - ret = rsmi_dev_metrics_indep_throttle_status_get(i, &val_ui64); - std::cout << "\t -> indep_throttle_status(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_throttle_status_get(i, &val_ui32); - std::cout << "\t -> throttle_status(): " << 
print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Gfx Clock Lock]" << "\n"; - ret = rsmi_dev_metrics_gfxclk_lock_status_get(i, &val_ui32); - std::cout << "\t -> gfxclk_lock_status(): " << print_error_or_value(ret, val_ui32) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Current Fan Speed]" << "\n"; - ret = rsmi_dev_metrics_curr_fan_speed_get(i, &val_ui16); - std::cout << "\t -> current_fan_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Link/Bandwidth/Speed]" << "\n"; - ret = rsmi_dev_metrics_pcie_link_width_get(i, &val_ui16); - std::cout << "\t -> pcie_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_pcie_link_speed_get(i, &val_ui16); - std::cout << "\t -> pcie_link_speed(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_pcie_bandwidth_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_bandwidth_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_bandwidth_inst_get(i, &val_ui64); - std::cout << "\t -> pcie_bandwidth_inst(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_l0_recov_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_l0_recov_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_replay_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_replay_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_pcie_replay_rover_count_acc_get(i, &val_ui64); - std::cout << "\t -> pcie_replay_rollover_count_accum(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_xgmi_link_width_get(i, &val_ui16); - std::cout << "\t -> xgmi_link_width(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_xgmi_link_speed_get(i, &val_ui16); - std::cout << "\t -> xgmi_link_speed(): " << print_error_or_value(ret, val_ui16) << 
"\n"; - ret = rsmi_dev_metrics_xgmi_read_data_get(i, &xgmi_read_values); - std::cout << "\t -> xgmi_read_data(): " << print_error_or_value(ret, xgmi_read_values) << "\n"; - ret = rsmi_dev_metrics_xgmi_write_data_get(i, &xgmi_write_values); - std::cout << "\t -> xgmi_write_data(): " << print_error_or_value(ret, xgmi_write_values) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Voltage]" << "\n"; - ret = rsmi_dev_metrics_volt_soc_get(i, &val_ui16); - std::cout << "\t -> voltage_soc(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_volt_gfx_get(i, &val_ui16); - std::cout << "\t -> voltage_gfx(): " << print_error_or_value(ret, val_ui16) << "\n"; - ret = rsmi_dev_metrics_volt_mem_get(i, &val_ui16); - std::cout << "\t -> voltage_mem(): " << print_error_or_value(ret, val_ui16) << "\n"; - - std::cout << "\n"; - std::cout << "\t[Timestamp]" << "\n"; - ret = rsmi_dev_metrics_system_clock_counter_get(i, &val_ui64); - std::cout << "\t -> system_clock_counter(): " << print_error_or_value(ret, val_ui64) << "\n"; - ret = rsmi_dev_metrics_firmware_timestamp_get(i, &val_ui64); - std::cout << "\t -> firmware_timestamp(): " << print_error_or_value(ret, val_ui64) << "\n"; - std::cout << "\n"; std::cout << "\t[XCD CounterVoltage]" << "\n"; ret = rsmi_dev_metrics_xcd_counter_get(i, &val_ui16); - std::cout << "\t -> xcd_counter(): " << print_error_or_value(ret, val_ui16) << "\n"; + std::cout << "\t -> xcd_counter(): " << val_ui16; std::cout << "\n\n"; - ret = rsmi_dev_perf_level_get(i, &pfl); CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**Performance Level:" << perf_level_string(pfl) << "\n"; ret = rsmi_dev_overdrive_level_get(i, &val_ui32); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**OverDrive Level:" << val_ui32 << "\n"; + std::cout << "\t**OverDrive Level: "; + if (ret == RSMI_STATUS_SUCCESS) { + std::cout << val_ui32 << "\n"; + } else { + CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) + } print_test_header("GPU Clocks", i); for (int 
clkType = static_cast(RSMI_CLK_TYPE_SYS); @@ -1271,9 +1145,6 @@ int main() { } for (uint32_t i = 0; i < num_monitor_devs; ++i) { - ret = test_set_overdrive(i); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - ret = test_set_perf_level(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) @@ -1294,6 +1165,9 @@ int main() { ret = test_set_memory_partition(i); CHK_AND_PRINT_RSMI_ERR_RET(ret) + + ret = test_set_overdrive(i); + CHK_RSMI_NOT_SUPPORTED_RET(ret) } return 0; diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h index 47edd8cc09..e8109a03a7 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi.h @@ -361,7 +361,6 @@ typedef enum { RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, - RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, @@ -415,8 +414,6 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work //!< together with shared memory RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work @@ -425,6 +422,8 @@ typedef enum { //!< work together with shared memory RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. 
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -797,7 +796,6 @@ typedef struct { uint16_t fine_value_count; } rsmi_utilization_counter_t; - /** * @brief Reserved Memory Page Record */ @@ -905,7 +903,7 @@ typedef rsmi_pcie_bandwidth_t rsmi_pcie_bandwidth; */ typedef struct { /* Utilization */ - uint16_t average_gfx_activity; + uint16_t average_gfx_activity; //!< Average graphics activity uint16_t average_umc_activity; //!< memory controller uint16_t average_mm_activity; //!< UVD or VCN } rsmi_activity_metric_counter_t; @@ -1038,10 +1036,6 @@ struct metrics_table_header_t { typedef struct metrics_table_header_t metrics_table_header_t; /// \endcond -/** - * @brief The following structure holds the gpu metrics values for a device. - */ - /** * @brief Unit conversion factor for HBM temperatures */ @@ -1098,7 +1092,7 @@ typedef struct { */ struct metrics_table_header_t common_header; - // Temperature + // Temperature (C) uint16_t temperature_edge; uint16_t temperature_hotspot; uint16_t temperature_mem; @@ -1106,19 +1100,19 @@ typedef struct { uint16_t temperature_vrsoc; uint16_t temperature_vrmem; - // Utilization + // Utilization (%) uint16_t average_gfx_activity; uint16_t average_umc_activity; // memory controller uint16_t average_mm_activity; // UVD or VCN - // Power/Energy + // Power (W) /Energy (15.259uJ per 1ns) uint16_t average_socket_power; uint64_t energy_accumulator; // v1 mod. (32->64) // Driver attached timestamp (in ns) uint64_t system_clock_counter; // v1 mod. 
(moved from top of struct) - // Average clocks + // Average clocks (MHz) uint16_t average_gfxclk_frequency; uint16_t average_socclk_frequency; uint16_t average_uclk_frequency; @@ -1127,7 +1121,7 @@ typedef struct { uint16_t average_vclk1_frequency; uint16_t average_dclk1_frequency; - // Current clocks + // Current clocks (MHz) uint16_t current_gfxclk; uint16_t current_socclk; uint16_t current_uclk; @@ -1139,10 +1133,10 @@ typedef struct { // Throttle status uint32_t throttle_status; - // Fans + // Fans (RPM) uint16_t current_fan_speed; - // Link width/speed + // Link width (number of lanes) /speed (0.1 GT/s) uint16_t pcie_link_width; // v1 mod.(8->16) uint16_t pcie_link_speed; // in 0.1 GT/s; v1 mod. (8->16) @@ -1158,7 +1152,7 @@ typedef struct { /* * v1.2 additions */ - // PMFW attached timestamp (10ns resolution) + // PMFW attached timestamp (10ns resolution) uint64_t firmware_timestamp; @@ -1181,19 +1175,19 @@ typedef struct { uint16_t current_socket_power; // Utilization (%) - uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode) + uint16_t vcn_activity[RSMI_MAX_NUM_VCNS]; // VCN instances activity percent (encode/decode) // Clock Lock Status. 
Each bit corresponds to clock instance uint32_t gfxclk_lock_status; - // XGMI bus width and bitrate (in Gbps) + // XGMI bus width and bitrate (in GB/s) uint16_t xgmi_link_width; uint16_t xgmi_link_speed; // PCIE accumulated bandwidth (GB/sec) uint64_t pcie_bandwidth_acc; - // PCIE instantaneous bandwidth (GB/sec) + // PCIE instantaneous bandwidth (GB/sec) uint64_t pcie_bandwidth_inst; // PCIE L0 to recovery state transition accumulated count @@ -1447,7 +1441,7 @@ rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid * */ -rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, char *sku); +rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); /** * @brief Get the device vendor id associated with the device with provided @@ -1830,13 +1824,62 @@ rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id); * * @param[in] dv_ind a device index * - * @param[inout] revision a pointer to uint32_t to which the XGMI physical id + * @param[inout] id a pointer to uint32_t to which the XGMI physical id * will be written * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. * */ -rsmi_status_t rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id); +rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id); + +/** + * @brief Get the GUID, also known as the GPU device id, + * associated with the provided device index indicated by KFD. + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t + * @p guid, this function will write the KFD GPU id value to the + * uint64_t pointed to by @p guid. + * + * @param[in] dv_ind a device index + * + * @param[inout] guid a pointer to uint64_t to which the KFD gpu id will be + * written. If the @p guid parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. 
If the GPU ID is not supported with + * the device index queried, @p guid will return MAX UINT64 value as an + * argument and ::RSMI_STATUS_NOT_SUPPORTED as a response. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid); + +/** + * @brief Get the node id associated with the provided device index + * indicated by KFD. + * + * @details Given a device index @p dv_ind and a pointer to a uint32_t + * @p node_id, this function will write the KFD node id value to the + * uint32_t pointed to by @p node_id. + * + * @param[in] dv_ind a device index + * + * @param[inout] node_id a pointer to uint32_t to which the KFD node id will be + * written. If the @p node_id parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. If @p node_id is not supported with + * the device index queried, @p node_id will return MAX UINT32 value as an + * argument and ::RSMI_STATUS_NOT_SUPPORTED as a response.
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id); + /** @} */ // end of IDQuer @@ -1877,16 +1920,18 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *bandwidth); * * The format of @p bdfid will be as follows: * - * BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) | - * ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7) + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((Partition & 0xF) << 28) + * | ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) + * | (FUNCTION & 0x7) * - * | Name | Field | - * ---------- | ------- | - * | Domain | [64:32] | - * | Reserved | [31:16] | - * | Bus | [15: 8] | - * | Device | [ 7: 3] | - * | Function | [ 2: 0] | + * | Name | Field | KFD property KFD -> PCIe ID (uint64_t) + * -------------- | ------- | ---------------- | ---------------------------- | + * | Domain | [63:32] | "domain" | (DOMAIN & 0xFFFFFFFF) << 32 | + * | Partition id | [31:28] | "location id" | (LOCATION & 0xF0000000) | + * | Reserved | [27:16] | "location id" | N/A | + * | Bus | [15: 8] | "location id" | (LOCATION & 0xFF00) | + * | Device | [ 7: 3] | "location id" | (LOCATION & 0xF8) | + * | Function | [ 2: 0] | "location id" | (LOCATION & 0x7) | * * @param[in] dv_ind a device index * @@ -2033,6 +2078,11 @@ rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask); * @p power, this function will write the current average power consumption * (in microwatts) to the uint64_t pointed to by @p power. * + * @deprecated ::rsmi_dev_power_get() is preferred due to providing + * backwards compatibility, which looks at both average and current power + * values. Whereas ::rsmi_dev_power_ave_get only looks for average power + * consumption. 
Newer ASICs will support current power only. + * * @param[in] dv_ind a device index * * @param[in] sensor_ind a 0-based sensor index. Normally, this will be 0. @@ -2101,7 +2151,10 @@ rsmi_dev_current_socket_power_get(uint32_t dv_ind, uint64_t *socket_power); * @param[inout] type a pointer to RSMI_POWER_TYPE object. Returns the type * of power retrieved from the device. Current power is ::RSMI_CURRENT_POWER * and average power is ::RSMI_AVERAGE_POWER. If an error occurs, - * returns an invalid power type ::RSMI_INVALID_POWER. + * returns an invalid power type ::RSMI_INVALID_POWER - for example, when the + * device supports neither average power nor current power. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. * * @retval ::RSMI_STATUS_SUCCESS call was successful * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not @@ -3079,7 +3132,6 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, uint64_t clkvalue, rsmi_clk_type_t clkType); - /** * @brief This function sets the clock frequency information * @@ -3625,6 +3677,29 @@ rsmi_status_t rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, uint64_t *fw_version); +/** + * @brief Get the target graphics version for a GPU device + * + * @details Given a device ID @p dv_ind and a uint64_t pointer + * @p gfx_version, this function will write the graphics version. + * + * @param[in] dv_ind a device index + * + * @param[inout] gfx_version The device graphics version number indicated by + * KFD. If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS. If device does not support this value, + * will return ::RSMI_STATUS_NOT_SUPPORTED and a maximum UINT64 value as + * @p gfx_version.
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * + */ +rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind, + uint64_t *gfx_version); + /** @} */ // end of VersQuer /*****************************************************************************/ @@ -4456,6 +4531,30 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); +/** + * @brief Retrieves the partition_id for a desired device + * + * @details + * Given a device index @p dv_ind and a uint32_t pointer @p partition_id, + * this function will attempt to obtain the device's partition ID. + * Upon successful retrieval, the obtained device's partition ID will be stored + * in the passed @p partition_id uint32_t variable. If device does + * not support partitions or is in SPX, a @p partition_id of 0 shall + * be returned. + * + * @param[in] dv_ind a device index + * + * @param[inout] partition_id a uint32_t variable, + * which the device's partition_id will be written to.
+ * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +rsmi_status_t rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id); + /** @} */ // end of ComputePartition /*****************************************************************************/ @@ -4897,995 +4996,6 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind); * @{ */ -/** - * Metric multi-valued counter types - */ -typedef uint16_t GPUMetricTempHbm_t[RSMI_NUM_HBM_INSTANCES]; -typedef uint16_t GPUMetricVcnActivity_t[RSMI_MAX_NUM_VCNS]; -typedef uint16_t GPUMetricJpegActivity_t[RSMI_MAX_NUM_JPEG_ENGS]; -typedef uint64_t GPUMetricXgmiReadDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; -typedef uint64_t GPUMetricXgmiWriteDataAcc_t[RSMI_MAX_NUM_XGMI_LINKS]; -typedef uint16_t GPUMetricCurrGfxClk_t[RSMI_MAX_NUM_GFX_CLKS]; -typedef uint16_t GPUMetricCurrSocClk_t[RSMI_MAX_NUM_CLKS]; -typedef uint16_t GPUMetricCurrVClk0_t[RSMI_MAX_NUM_CLKS]; -typedef uint16_t GPUMetricCurrDClk0_t[RSMI_MAX_NUM_CLKS]; - - -/****** - * Metric single-valued counter types - */ - -/** - * @brief Get the 'temp_hotspot' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_hotspot' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] hotspot_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value); - -/** - * @brief Get the 'temp_mem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_mem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mem_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value); - -/** - * @brief Get the 'temp_vrsoc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrsoc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrsoc_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value); - -/** - * @brief Get the 'curr_socket_power' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'socket_power' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); - -/** - * @brief Get the 'avg_gfx_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'gfx_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfx_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value); - -/** - * @brief Get the 'avg_umc_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'umc_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] umc_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value); - -/** - * @brief Get the 'energy_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'energy_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] energy_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value); - -/** - * @brief Get the 'system_clock_counter' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'system_clock_counter' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] system_clock_counter_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value); - -/** - * @brief Get the 'firmware_timestamp' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'firmware_timestamp' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] firmware_timestamp_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value); - -/** - * @brief Get the 'throttle_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'throttle_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] throttle_status_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value); - -/** - * @brief Get the 'pcie_link_width' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'pcie_link_width' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_link_width_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value); - -/** - * @brief Get the 'pcie_link_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'pcie_link_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_link_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value); - -/** - * @brief Get the 'xgmi_link_width' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'xgmi_link_width' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_link_width_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value); - -/** - * @brief Get the 'xgmi_link_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'xgmi_link_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_link_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value); - -/** - * @brief Get the 'gfxclk_lock_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'gfxclk_lock_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfxclk_lock_status_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value); - -/** - * @brief Get the 'gfx_activity_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'gfx_activity_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] gfx_activity_acc_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value); - -/** - * @brief Get the 'mem_activity_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint32_t in which - * the 'mem_activity_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mem_activity_acc_value a pointer to uint32_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value); - -/** - * @brief Get the 'pcie_bandwidth_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_bandwidth_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_bandwidth_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value); - -/** - * @brief Get the 'pcie_bandwidth_inst' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_bandwidth_inst' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_bandwidth_inst_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value); - -/** - * @brief Get the 'pcie_l0_recov_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_l0_recov_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'pcie_replay_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_replay_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'pcie_replay_rover_count_acc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'pcie_replay_rover_count_acc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] pcie_count_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value); - -/** - * @brief Get the 'curr_uclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_uclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] uclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value); - - -/****** - * Metric multi-valued counter types - */ - -/** - * @brief Get the 'temp_hbm' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_hbm' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] temp_hbm_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_NUM_HBM_INSTANCES) - * element array (GPUMetricTempHbm_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value); - -/** - * @brief Get the 'vcn_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'vcn_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vcn_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_VCNS) - * element array (GPUMetricVcnActivity_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value); - -/** - * @brief Get the 'xgmi_read_data' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'xgmi_read_data' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_read_data_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) - * element array (GPUMetricXgmiReadDataAcc_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value); - -/** - * @brief Get the 'xgmi_write_data' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'xgmi_write_data' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] xgmi_write_data_acc_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_XGMI_LINKS) - * element array (GPUMetricXgmiWriteDataAcc_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value); - -/** - * @brief Get the 'curr_gfxclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'curr_gfxclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_gfxclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding an 8 (RSMI_MAX_NUM_GFX_CLKS) - * element array (GPUMetricCurrGfxClk_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value); - -/** - * @brief Get the 'curr_socclk' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_socclk' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_socclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrSocClk_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value); - -/** - * @brief Get the 'curr_vclk0' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_vclk0' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrVClk0_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value); - -/** - * @brief Get the 'curr_dclk0' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_dclk0' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - This is a multi-valued counter holding a 4 (RSMI_MAX_NUM_CLKS) - * element array (GPUMetricCurrDClk0_t) - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value); - -/** - * @brief Get the 'temp_edge' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_edge' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] edge_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value); - -/** - * @brief Get the 'temp_vrgfx' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrgfx' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrgfx_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value); - -/** - * @brief Get the 'temp_vrmem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'temp_vrmem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] vrmem_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value); - -/** - * @brief Get the 'avg_mm_activity' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_mm_activity' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] mm_activity_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value); - -/** - * @brief Get the 'curr_vclk1' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_vclk1' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_vclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value); - -/** - * @brief Get the 'curr_dclk1' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_dclk1' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] current_dclk_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value); - -/** - * @brief Get the 'indep_throttle_status' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint64_t in which - * the 'indep_throttle_status' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] throttle_status_value a pointer to uint64_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value); - -/** - * @brief Get the 'avg_socket_power' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_socket_power' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] socket_power_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value); - -/** - * @brief Get the 'curr_fan_speed' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'curr_fan_speed' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] fan_speed_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value); - -/** - * @brief Get the 'avg_gfx_clock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_gfx_clock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_soc_clock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_soc_clock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_uclock_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_uclock_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_vclock0_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_vclock0_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_dclock0_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_dclock0_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_vclock1_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_vclock1_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'avg_dclock1_frequency' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'avg_dclock1_frequency' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] clock_frequency_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value); - -/** - * @brief Get the 'volt_soc' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_soc' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value); - -/** - * @brief Get the 'volt_gfx' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_gfx' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. - * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value); - -/** - * @brief Get the 'volt_mem' from the GPU metrics associated with the device - * - * @details Given a device index @p dv_ind and a pointer to a uint16_t in which - * the 'volt_mem' will stored - * - * @param[in] dv_ind a device index - * - * @param[inout] voltage_value a pointer to uint16_t to which the device gpu - * metric unit will be stored - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. 
- * ::RSMI_STATUS_NOT_SUPPORTED is returned in case the metric unit - * does not exist for the given device - * - */ -rsmi_status_t -rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value); - /** * @brief Get the 'metrics_header_info' from the GPU metrics associated with the device * @@ -5938,6 +5048,7 @@ rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value); rsmi_status_t rsmi_dev_metrics_log_get(uint32_t dv_ind); +/** @} */ // end of DevMetricsHeaderInfoGet #ifdef __cplusplus } diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h index 81a76400ce..87338d8d7c 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_kfd.h @@ -94,6 +94,11 @@ class KFDNode { int32_t get_simd_per_cu(uint64_t* simd_per_cu) const; int32_t get_simd_count(uint64_t* simd_count) const; + // Get gpu_id (AKA GUID) version from kfd + int get_gpu_id(uint64_t *gpu_id); + // Get node id from kfd + int get_node_id(uint32_t *node_id); + private: uint32_t node_indx_; uint32_t amdgpu_dev_index_; diff --git a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 36261d89e6..e4130c46ee 100755 --- a/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -48,8 +48,11 @@ #include #include #include +#include #include +#include #include +#include #include #include #include @@ -594,6 +597,7 @@ class TagTextContents_t } } } + }; using TextFileTagContents_t = TagTextContents_t: <|*>" @@ -783,7 +783,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, } ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", reporting " << amd::smi::getRSMIStatusString(ret);; + << ", reporting " << amd::smi::getRSMIStatusString(ret); LOG_TRACE(ss); return ret; CATCH @@ -806,16 
+806,30 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { kfd_node->get_property_value("domain", &domain); - // Replace the 16 bit domain originally set like this: - // BDFID = (( & 0xffff) << 32) | (( & 0xff) << 8) | - // ((device& 0x1f) <<3 ) | (function & 0x7) - // with this: - // BDFID = (( & 0xffffffff) << 32) | (( & 0xff) << 8) | - // ((device& 0x1f) <<3 ) | (function & 0x7) - + /** + * Add domain to full pci_id: + * BDFID = ((DOMAIN & 0xFFFFFFFF) << 32) | ((PARTITION_ID & 0xF) << 28) | + * ((BUS & 0xFF) << 8) | ((DEVICE & 0x1F) <<3 ) | (FUNCTION & 0x7) + * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ assert((domain & 0xFFFFFFFF00000000) == 0); - (*bdfid) &= 0xFFFF; // Clear out the old 16 bit domain - *bdfid |= (domain & 0xFFFFFFFF) << 32; + (*bdfid) &= 0xFFFFFFFF; // keep bottom 32 bits of pci_id + *bdfid |= (domain & 0xFFFFFFFF) << 32; // Add domain to top of pci_id + uint64_t pci_id = *bdfid; + uint32_t node = UINT32_MAX; + rsmi_dev_node_id_get(dv_ind, &node); + ss << __PRETTY_FUNCTION__ << " | kfd node = " + << std::to_string(node) << "\n" + << " returning pci_id = " + << std::to_string(pci_id) << " (" + << amd::smi::print_int_as_hex(pci_id) << ")"; + LOG_INFO(ss); ss << __PRETTY_FUNCTION__ << " | ======= end =======" << ", reporting RSMI_STATUS_SUCCESS"; @@ -957,7 +971,7 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { } rsmi_status_t -rsmi_dev_oam_id_get(uint32_t dv_ind, uint16_t *id) { +rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) { std::ostringstream ss; rsmi_status_t ret; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; @@ -1561,6 +1575,7 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, if (ret != RSMI_STATUS_SUCCESS) { return ret; } + // For clock frequency setting, enter a new value by 
writing a string that // contains "s/m index clock" to the file. The index should be 0 if to set // minimum clock. And 1 if to set maximum clock. E.g., "s 0 500" will update @@ -1585,7 +1600,6 @@ rsmi_status_t rsmi_dev_clk_extremum_set(uint32_t dv_ind, rsmi_freq_ind_t level, CATCH } - rsmi_status_t rsmi_dev_clk_range_set(uint32_t dv_ind, uint64_t minclkvalue, uint64_t maxclkvalue, rsmi_clk_type_t clkType) { @@ -2067,7 +2081,8 @@ rsmi_status_t rsmi_dev_process_isolation_get(uint32_t dv_ind, // the enforce_isolation sysfs is in this format // Get the partition_id. For SPX, the partition_id will be 0. - int partition_id = dev->get_partition_id(); + uint32_t partition_id = 0; + rsmi_dev_partition_id_get(dv_ind, &partition_id); DEVICE_MUTEX @@ -2126,7 +2141,8 @@ rsmi_status_t rsmi_dev_process_isolation_set(uint32_t dv_ind, // To set the values,need to specify the setting for all of the partitions // For two partition // echo "1 0" | sudo tee  /sys/class/drm/cardX/device/enforce_isolation - int partition_id = dev->get_partition_id(); + uint32_t partition_id = 0; + rsmi_dev_partition_id_get(dv_ind, &partition_id); std::string str_val; rsmi_status_t ret = get_dev_value_line(amd::smi::kDevProcessIsolation, dv_ind, &str_val); if (ret == RSMI_STATUS_FILE_ERROR) { @@ -5360,9 +5376,12 @@ rsmi_topo_get_p2p_status(uint32_t dv_ind_src, uint32_t dv_ind_dst, CATCH } -static rsmi_status_t -get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { +static rsmi_status_t get_compute_partition(uint32_t dv_ind, + std::string &compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(compute_partition.c_str()) std::string compute_partition_str; @@ -5386,6 +5405,8 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { return RSMI_STATUS_UNEXPECTED_DATA; } compute_partition = compute_partition_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " 
<< dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -5395,7 +5416,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start =======, dv_ind = " + ss << __PRETTY_FUNCTION__ << " | ======= start =======, dv_ind = " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (compute_partition == nullptr)) { @@ -5431,7 +5452,7 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, return ret; } - std::size_t length = returning_compute_partition.copy(compute_partition, len); + std::size_t length = returning_compute_partition.copy(compute_partition, len-1); compute_partition[length]='\0'; if (len < (returning_compute_partition.size() + 1)) { @@ -5465,20 +5486,47 @@ static rsmi_status_t is_available_compute_partition(uint32_t dv_ind, std::string new_compute_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); DEVICE_MUTEX std::string availableComputePartitions; rsmi_status_t ret = get_dev_value_line(amd::smi::kDevAvailableComputePartition, dv_ind, &availableComputePartitions); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: could not retrieve requested data" + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); return ret; } bool isComputePartitionAvailable = amd::smi::containsString(availableComputePartitions, new_compute_partition); - return (isComputePartitionAvailable) ? RSMI_STATUS_SUCCESS : - RSMI_STATUS_SETTING_UNAVAILABLE; + + ret = ((isComputePartitionAvailable) ? 
RSMI_STATUS_SUCCESS : + RSMI_STATUS_SETTING_UNAVAILABLE); + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " + << amd::smi::Device::get_type_string(amd::smi::kDevAvailableComputePartition) + << " | Data: available_partitions = " << availableComputePartitions + << " | Data: isComputePartitionAvailable = " + << (isComputePartitionAvailable ? "True" : "False") + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_INFO(ss); + return ret; CATCH } @@ -5487,16 +5535,14 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, rsmi_compute_partition_type_t compute_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS if (!amd::smi::is_sudo_user()) { return RSMI_STATUS_PERMISSION; } - DEVICE_MUTEX - std::string newComputePartitionStr - = mapRSMIToStringComputePartitionTypes.at(compute_partition); - std::string currentComputePartition; + std::string currentComputePartition = ""; + std::string newComputePartitionStr = ""; switch (compute_partition) { case RSMI_COMPUTE_PARTITION_CPX: @@ -5504,9 +5550,13 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, case RSMI_COMPUTE_PARTITION_DPX: case RSMI_COMPUTE_PARTITION_TPX: case RSMI_COMPUTE_PARTITION_QPX: + newComputePartitionStr = + mapRSMIToStringComputePartitionTypes.at(compute_partition); break; case RSMI_COMPUTE_PARTITION_INVALID: default: + newComputePartitionStr = + mapRSMIToStringComputePartitionTypes.at(RSMI_COMPUTE_PARTITION_INVALID); ss << __PRETTY_FUNCTION__ << " | ======= end ======= " << " | Fail " @@ -5583,8 +5633,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << "| sizeof string = " << std::dec << sizeof(newComputePartitionStr); LOG_DEBUG(ss); - GET_DEV_FROM_INDX + DEVICE_MUTEX int ret = dev->writeDevInfo(amd::smi::kDevComputePartition, newComputePartitionStr); rsmi_status_t 
returnResponse = amd::smi::ErrnoToRsmiStatus(ret); @@ -5599,7 +5649,6 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << getRSMIStatusString(returnResponse) << " |"; LOG_TRACE(ss); - // TODO(charpoag): investigate providing GPU busy state occurred with return returnResponse; CATCH } @@ -5607,6 +5656,9 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, static rsmi_status_t get_memory_partition(uint32_t dv_ind, std::string &memory_partition) { TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; + LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY(memory_partition.c_str()) std::string val_str; @@ -5630,6 +5682,8 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind, return RSMI_STATUS_UNEXPECTED_DATA; } memory_partition = val_str; + ss << __PRETTY_FUNCTION__ << " | ======= END =======, " << dv_ind; + LOG_TRACE(ss); return RSMI_STATUS_SUCCESS; CATCH } @@ -5639,7 +5693,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, rsmi_memory_partition_type_t memory_partition) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5774,7 +5828,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, uint32_t len) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); if ((len == 0) || (memory_partition == nullptr)) { ss << __PRETTY_FUNCTION__ @@ -5844,7 +5898,7 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5883,7 +5937,7 @@ rsmi_status_t 
rsmi_dev_compute_partition_reset(uint32_t dv_ind) { rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX @@ -5919,6 +5973,168 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { CATCH } +rsmi_status_t +rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << "| ======= start =======, " << dv_ind; + LOG_TRACE(ss); + if (partition_id == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | FAIL" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: nullptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; + } + DEVICE_MUTEX + std::string strCompPartition = "UNKNOWN"; + const uint32_t PARTITION_LEN = 10; + char compute_partition[PARTITION_LEN]; + rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN); + if (ret == RSMI_STATUS_SUCCESS) { + strCompPartition = compute_partition; + } + uint64_t pci_id = UINT64_MAX; + *partition_id = UINT32_MAX; + ret = rsmi_dev_pci_id_get(dv_ind, &pci_id); + if (ret == RSMI_STATUS_SUCCESS) { + *partition_id = static_cast((pci_id >> 28) & 0xf); + } + + /** + * Fall back is required due to driver changes within KFD. + * Some devices may report bits [31:28] or [2:0]. + * With the newly added rsmi_dev_partition_id_get(..), + * we provided this fallback to properly retrieve the partition ID. We + * plan to eventually remove partition ID from the function portion of the + * BDF (Bus Device Function). See below for PCI ID description. 
+ * + * bits [63:32] = domain + * bits [31:28] or bits [2:0] = partition id + * bits [27:16] = reserved + * bits [15:8] = Bus + * bits [7:3] = Device + * bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + */ + if (*partition_id != UINT32_MAX && *partition_id == 0 && + (strCompPartition == "DPX" || strCompPartition == "TPX" + || strCompPartition == "CPX" || strCompPartition == "QPX")) { + *partition_id = static_cast(pci_id & 0x7); + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Success" + << " | Device #: " << dv_ind + << " | Type: partition_id" + << " | Data: " << *partition_id + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"; + LOG_INFO(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_target_graphics_version_get(uint32_t dv_ind, + uint64_t *gfx_version) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED; + std::string version = ""; + const uint64_t undefined_gfx_version = std::numeric_limits::max(); + if (gfx_version == nullptr) { + ret = RSMI_STATUS_INVALID_ARGS; + } else { + *gfx_version = undefined_gfx_version; + ret = amd::smi::rsmi_get_gfx_target_version(dv_ind , &version); + } + if (ret == RSMI_STATUS_SUCCESS) { + version = amd::smi::removeString(version, "gfx"); + *gfx_version = std::stoull(version); + } + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(ret, false) + << " | Device #: " << dv_ind + << " | Type: Target_graphics_version" + << " | Data: " + << ((gfx_version == nullptr) ? 
"nullptr" : + amd::smi::print_unsigned_hex_and_int(*gfx_version)); + LOG_TRACE(ss); + return ret; + CATCH +} + +rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + GET_DEV_AND_KFDNODE_FROM_INDX + uint64_t kgd_gpu_id = 0; + rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED; + int ret = kfd_node->KFDNode::get_gpu_id(&kgd_gpu_id); + resp = amd::smi::ErrnoToRsmiStatus(ret); + + if (guid == nullptr) { + resp = RSMI_STATUS_INVALID_ARGS; + } else { + *guid = kgd_gpu_id; + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(resp, false) + << " | Device #: " << dv_ind + << " | Type: GUID (gpu_id)" + << " | Data: " << ((guid == nullptr) ? "nullptr" : + amd::smi::print_unsigned_hex_and_int(*guid)); + LOG_INFO(ss); + return resp; + CATCH +} + +rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id) { + TRY + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start =======" + << " | Device #: " << dv_ind; + LOG_TRACE(ss); + GET_DEV_AND_KFDNODE_FROM_INDX + uint32_t kfd_node_id = std::numeric_limits::max(); + rsmi_status_t resp = RSMI_STATUS_NOT_SUPPORTED; + int ret = kfd_node->KFDNode::get_node_id(&kfd_node_id); + resp = amd::smi::ErrnoToRsmiStatus(ret); + + if (node_id == nullptr) { + resp = RSMI_STATUS_INVALID_ARGS; + } else { + *node_id = kfd_node_id; + if (kfd_node_id == std::numeric_limits::max()) { + resp = RSMI_STATUS_NOT_SUPPORTED; + } + } + + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Returning: " << getRSMIStatusString(resp, false) + << " | Device #: " << dv_ind + << " | Type: node_id" + << " | Data: " << ((node_id == nullptr) ? 
"nullptr" : + amd::smi::print_unsigned_hex_and_int(*node_id)); + LOG_INFO(ss); + return resp; + CATCH +} + enum iterator_handle_type { FUNC_ITER = 0, VARIANT_ITER, @@ -6379,1455 +6595,6 @@ rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { CATCH } -// -// NOTE: APIs related to new 'GPU Metrics' related work are added here -// so they can be used/tested. -// -rsmi_status_t -rsmi_dev_metrics_temp_edge_get(uint32_t dv_ind, uint16_t* edge_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(edge_value != nullptr); - if (edge_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempEdge); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *edge_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_hotspot_get(uint32_t dv_ind, uint16_t* hotspot_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(hotspot_value != nullptr); - if (hotspot_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHotspot); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *hotspot_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - 
LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_mem_get(uint32_t dv_ind, uint16_t* mem_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mem_value != nullptr); - if (mem_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrgfx_get(uint32_t dv_ind, uint16_t* vrgfx_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrgfx_value != nullptr); - if (vrgfx_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrGfx); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrgfx_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrsoc_get(uint32_t dv_ind, uint16_t* vrsoc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrsoc_value != nullptr); - if (vrsoc_value 
== nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrSoc); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrsoc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_vrmem_get(uint32_t dv_ind, uint16_t* vrmem_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vrmem_value != nullptr); - if (vrmem_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempVrMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *vrmem_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(socket_power_value != nullptr); - if (socket_power_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocketPower); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); - ostrstream << __PRETTY_FUNCTION__ - << " | 
======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_socket_power_get(uint32_t dv_ind, uint16_t* socket_power_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(socket_power_value != nullptr); - if (socket_power_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocketPower); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *socket_power_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_gfx_activity_get(uint32_t dv_ind, uint16_t* gfx_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfx_activity_value != nullptr); - if (gfx_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); 
- - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_umc_activity_get(uint32_t dv_ind, uint16_t* umc_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(umc_activity_value != nullptr); - if (umc_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUmcActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *umc_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_mm_activity_get(uint32_t dv_ind, uint16_t* mm_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mm_activity_value != nullptr); - if (mm_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgMmActivity); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mm_activity_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_energy_acc_get(uint32_t dv_ind, uint64_t* energy_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - 
LOG_TRACE(ostrstream); - - assert(energy_acc_value != nullptr); - if (energy_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricEnergyAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *energy_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_system_clock_counter_get(uint32_t dv_ind, uint64_t* system_clock_counter_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(system_clock_counter_value != nullptr); - if (system_clock_counter_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSClockCounter); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *system_clock_counter_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_firmware_timestamp_get(uint32_t dv_ind, uint64_t* firmware_timestamp_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(firmware_timestamp_value != nullptr); - if (firmware_timestamp_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTSFirmware); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *firmware_timestamp_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_indep_throttle_status_get(uint32_t dv_ind, uint64_t* throttle_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(throttle_status_value != nullptr); - if (throttle_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricIndepThrottleStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_throttle_status_get(uint32_t dv_ind, uint32_t* throttle_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(throttle_status_value != nullptr); - if (throttle_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricThrottleStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *throttle_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | 
======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_fan_speed_get(uint32_t dv_ind, uint16_t* fan_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(fan_speed_value != nullptr); - if (fan_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrFanSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *fan_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_link_width_get(uint32_t dv_ind, uint16_t* pcie_link_width_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_link_width_value != nullptr); - if (pcie_link_width_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkWidth); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_width_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - 
return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_link_speed_get(uint32_t dv_ind, uint16_t* pcie_link_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_link_speed_value != nullptr); - if (pcie_link_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieLinkSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_link_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_link_width_get(uint32_t dv_ind, uint16_t* xgmi_link_width_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_link_width_value != nullptr); - if (xgmi_link_width_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkWidth); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_width_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_link_speed_get(uint32_t dv_ind, uint16_t* xgmi_link_speed_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| 
======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_link_speed_value != nullptr); - if (xgmi_link_speed_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiLinkSpeed); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *xgmi_link_speed_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_gfxclk_lock_status_get(uint32_t dv_ind, uint32_t* gfxclk_lock_status_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfxclk_lock_status_value != nullptr); - if (gfxclk_lock_status_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxClkLockStatus); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfxclk_lock_status_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_gfx_activity_acc_get(uint32_t dv_ind, uint32_t* gfx_activity_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(gfx_activity_acc_value != nullptr); - if (gfx_activity_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const 
auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricGfxActivityAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *gfx_activity_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_mem_activity_acc_get(uint32_t dv_ind, uint32_t* mem_activity_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(mem_activity_acc_value != nullptr); - if (mem_activity_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricMemActivityAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *mem_activity_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_acc_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_bandwidth_acc_value != nullptr); - if (pcie_bandwidth_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_acc_value); - 
ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_bandwidth_inst_get(uint32_t dv_ind, uint64_t* pcie_bandwidth_inst_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_bandwidth_inst_value != nullptr); - if (pcie_bandwidth_inst_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieBandwidthInst); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_bandwidth_inst_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_l0_recov_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieL0RecovCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | 
Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_replay_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_pcie_replay_rover_count_acc_get(uint32_t dv_ind, uint64_t* pcie_count_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(pcie_count_acc_value != nullptr); - if (pcie_count_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricPcieReplayRollOverCountAccumulator); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *pcie_count_acc_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t 
-rsmi_dev_metrics_curr_uclk_get(uint32_t dv_ind, uint16_t* uclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(uclk_value != nullptr); - if (uclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrUClock); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *uclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_temp_hbm_get(uint32_t dv_ind, GPUMetricTempHbm_t* temp_hbm_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(temp_hbm_value != nullptr); - if (temp_hbm_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricTempHbm); - amd::smi::GPUMetricTempHbmTbl_t tmp_hbl_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_hbl_tbl); - const auto max_num_elems = - static_cast(std::end(*temp_hbm_value) - std::begin(*temp_hbm_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_hbl_tbl.size()) ? 
max_num_elems : tmp_hbl_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_hbl_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(temp_hbm_value, 0, sizeof(*temp_hbm_value)); - std::copy_n(std::begin(tmp_hbl_tbl), copy_size, *temp_hbm_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_vcn_activity_get(uint32_t dv_ind, GPUMetricVcnActivity_t* vcn_activity_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(vcn_activity_value != nullptr); - if (vcn_activity_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVcnActivity); - amd::smi::GPUMetricVcnActivityTbl_t tmp_vcn_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_vcn_tbl); - const auto max_num_elems = - static_cast(std::end(*vcn_activity_value) - std::begin(*vcn_activity_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_vcn_tbl.size()) ? 
max_num_elems : tmp_vcn_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_vcn_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(vcn_activity_value, 0, sizeof(*vcn_activity_value)); - std::copy_n(std::begin(tmp_vcn_tbl), copy_size, *vcn_activity_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_read_data_get(uint32_t dv_ind, GPUMetricXgmiReadDataAcc_t* xgmi_read_data_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_read_data_acc_value != nullptr); - if (xgmi_read_data_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiReadDataAccumulator); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - const auto max_num_elems = - static_cast(std::end(*xgmi_read_data_acc_value) - std::begin(*xgmi_read_data_acc_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(xgmi_read_data_acc_value, 0, sizeof(*xgmi_read_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_read_data_acc_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_xgmi_write_data_get(uint32_t dv_ind, GPUMetricXgmiWriteDataAcc_t* xgmi_write_data_acc_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(xgmi_write_data_acc_value != nullptr); - if (xgmi_write_data_acc_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricXgmiWriteDataAccumulator); - amd::smi::GPUMetricXgmiAccTbl_t tmp_xgmi_acc_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_xgmi_acc_tbl); - const auto max_num_elems = - static_cast(std::end(*xgmi_write_data_acc_value) - std::begin(*xgmi_write_data_acc_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_xgmi_acc_tbl.size()) ? 
max_num_elems : tmp_xgmi_acc_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_xgmi_acc_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(xgmi_write_data_acc_value, 0, sizeof(*xgmi_write_data_acc_value)); - std::copy_n(std::begin(tmp_xgmi_acc_tbl), copy_size, *xgmi_write_data_acc_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_gfxclk_get(uint32_t dv_ind, GPUMetricCurrGfxClk_t* current_gfxclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_gfxclk_value != nullptr); - if (current_gfxclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrGfxClock); - amd::smi::GPUMetricCurrGfxClkTbl_t tmp_curr_gfxclk_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_gfxclk_tbl); - const auto max_num_elems = - static_cast(std::end(*current_gfxclk_value) - std::begin(*current_gfxclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_gfxclk_tbl.size()) ? 
max_num_elems : tmp_curr_gfxclk_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_gfxclk_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_gfxclk_value, 0, sizeof(*current_gfxclk_value)); - std::copy_n(std::begin(tmp_curr_gfxclk_tbl), copy_size, *current_gfxclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_socclk_get(uint32_t dv_ind, GPUMetricCurrSocClk_t* current_socclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_socclk_value != nullptr); - if (current_socclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrSocClock); - amd::smi::GPUMetricCurrSocClkTbl_t tmp_curr_socclk_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_socclk_tbl); - const auto max_num_elems = - static_cast(std::end(*current_socclk_value) - std::begin(*current_socclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_socclk_tbl.size()) ? 
max_num_elems : tmp_curr_socclk_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_socclk_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_socclk_value, 0, sizeof(*current_socclk_value)); - std::copy_n(std::begin(tmp_curr_socclk_tbl), copy_size, *current_socclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_vclk0_get(uint32_t dv_ind, GPUMetricCurrVClk0_t* current_vclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_vclk_value != nullptr); - if (current_vclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock0); - amd::smi::GPUMetricCurrVClkTbl_t tmp_curr_vclk0_tbl{}; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_vclk0_tbl); - const auto max_num_elems = - static_cast(std::end(*current_vclk_value) - std::begin(*current_vclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_vclk0_tbl.size()) ? 
max_num_elems : tmp_curr_vclk0_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_vclk0_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_vclk_value, 0, sizeof(*current_vclk_value)); - std::copy_n(std::begin(tmp_curr_vclk0_tbl), copy_size, *current_vclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_vclk1_get(uint32_t dv_ind, uint16_t* current_vclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_vclk_value != nullptr); - if (current_vclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrVClock1); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_vclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_dclk0_get(uint32_t dv_ind, GPUMetricCurrDClk0_t* current_dclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_dclk_value != nullptr); - if (current_dclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto 
gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock0); - amd::smi::GPUMetricCurrDClkTbl_t tmp_curr_dclk0_tbl; - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, tmp_curr_dclk0_tbl); - const auto max_num_elems = - static_cast(std::end(*current_dclk_value) - std::begin(*current_dclk_value)); - const auto copy_size = - static_cast((max_num_elems < tmp_curr_dclk0_tbl.size()) ? max_num_elems : tmp_curr_dclk0_tbl.size()); - ostrstream << __PRETTY_FUNCTION__ - << "\n | ======= end ======= " - << "\n | End Result " - << "\n | Device #: " << dv_ind - << "\n | Metric Type: " << static_cast(gpu_metric_unit) - << "\n | Metric Size: " << tmp_curr_dclk0_tbl.size() - << "\n | Max num of elements: " << max_num_elems - << "\n | Copy size: " << copy_size - << "\n | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - std::memset(current_dclk_value, 0, sizeof(*current_dclk_value)); - std::copy_n(std::begin(tmp_curr_dclk0_tbl), copy_size, *current_dclk_value); - } - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_curr_dclk1_get(uint32_t dv_ind, uint16_t* current_dclk_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(current_dclk_value != nullptr); - if (current_dclk_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricCurrDClock1); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *current_dclk_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - 
CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_gfx_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgGfxClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_soc_clock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgSocClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_uclock_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << 
__PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgUClockFrequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_vclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock0Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_dclock0_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return 
rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock0Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_vclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgVClock1Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_avg_dclock1_frequency_get(uint32_t dv_ind, uint16_t* clock_frequency_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(clock_frequency_value != nullptr); - if (clock_frequency_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricAvgDClock1Frequency); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, 
gpu_metric_unit, *clock_frequency_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_soc_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageSoc); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_gfx_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageGfx); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - 
LOG_INFO(ostrstream); - - return status_code; - CATCH -} - -rsmi_status_t -rsmi_dev_metrics_volt_mem_get(uint32_t dv_ind, uint16_t* voltage_value) -{ - TRY - std::ostringstream ostrstream; - ostrstream << __PRETTY_FUNCTION__ << "| ======= start ======="; - LOG_TRACE(ostrstream); - - assert(voltage_value != nullptr); - if (voltage_value == nullptr) { - return rsmi_status_t::RSMI_STATUS_INVALID_ARGS; - } - - const auto gpu_metric_unit(AMDGpuMetricsUnitType_t::kMetricVoltageMem); - auto status_code = rsmi_dev_gpu_metrics_info_query(dv_ind, gpu_metric_unit, *voltage_value); - ostrstream << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | End Result " - << " | Device #: " << dv_ind - << " | Metric Type: " << static_cast(gpu_metric_unit) - << " | Returning = " << status_code << " " << getRSMIStatusString(status_code) << " |"; - LOG_INFO(ostrstream); - - return status_code; - CATCH -} - rsmi_status_t rsmi_dev_metrics_header_info_get(uint32_t dv_ind, metrics_table_header_t* header_value) { @@ -7871,10 +6638,13 @@ rsmi_dev_metrics_xcd_counter_get(uint32_t dv_ind, uint16_t* xcd_counter_value) } auto xcd_counter = uint16_t(0); - GPUMetricCurrGfxClk_t curr_gfxclk_table{}; - auto status_code = rsmi_dev_metrics_curr_gfxclk_get(dv_ind, &curr_gfxclk_table); + rsmi_gpu_metrics_t gpu_metrics; + auto status_code = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); if (status_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { - for (const auto& gfxclk : curr_gfxclk_table) { + for (const auto& gfxclk : gpu_metrics.current_gfxclks) { + if (gfxclk == UINT16_MAX) { + break; + } if ((gfxclk != 0) && (gfxclk != UINT16_MAX)) { xcd_counter++; } @@ -7916,10 +6686,6 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind) CATCH } -// -// End of: new GPU Metrics related work. -// - // UNDOCUMENTED FUNCTIONS // This functions are not declared in rocm_smi.h. 
They are either not fully diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc index e0ebe8a055..ce3bf33eb2 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_device.cc @@ -490,7 +490,7 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, - {"rsmi_dev_oam_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, + {"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}}, {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, {"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_name_get", {{kDevVendorIDFName, diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc index b319a1fcd4..a21e619c4a 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_kfd.cc @@ -526,7 +526,7 @@ int GetProcessInfoForPID(uint32_t pid, rsmi_process_info_t *proc, // Collect count of compute units cu_count += kfd_node_map[gpu_id]->cu_count(); } else { - //Some GFX revisions do not provide cu_occupancy debugfs method + // Some GFX revisions do not provide cu_occupancy debugfs method proc->cu_occupancy = CU_OCCUPANCY_INVALID; cu_count = 0; } @@ -1067,18 +1067,18 @@ int KFDNode::get_gfx_target_version(uint64_t *gfx_target_version) { *gfx_target_version = gfx_version; ss << __PRETTY_FUNCTION__ << " | File: " << properties_path - << " | Successfully read node #" << std::to_string(this->node_indx_) + << " | Read node: " << std::to_string(this->node_indx_) << " for gfx_target_version" - << " | Data (gfx_target_version) *gfx_target_version = " + << " | Data (*gfx_target_version): " << std::to_string(*gfx_target_version) - << " | return = " << std::to_string(ret) + << " | Return: " + << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false) 
<< " | "; LOG_DEBUG(ss); return ret; } -int32_t KFDNode::get_simd_per_cu(uint64_t* simd_per_cu) const -{ +int32_t KFDNode::get_simd_per_cu(uint64_t* simd_per_cu) const { const std::string properties_path("/sys/class/kfd/kfd/topology/nodes/" + std::to_string(this->node_indx_) + "/properties"); @@ -1090,8 +1090,7 @@ int32_t KFDNode::get_simd_per_cu(uint64_t* simd_per_cu) const return ret; } -int32_t KFDNode::get_simd_count(uint64_t* simd_count) const -{ +int32_t KFDNode::get_simd_count(uint64_t* simd_count) const { const std::string properties_path("/sys/class/kfd/kfd/topology/nodes/" + std::to_string(this->node_indx_) + "/properties"); @@ -1103,6 +1102,62 @@ int32_t KFDNode::get_simd_count(uint64_t* simd_count) const return ret; } +// Public interface for device +// /sys/class/kfd/kfd/topology/nodes/*/gpu_id +int KFDNode::get_gpu_id(uint64_t *gpu_id) { + std::ostringstream ss; + std::string gpuid_path = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(this->node_indx_) + "/gpu_id"; + const uint64_t undefined_gpu_id = std::numeric_limits::max(); + std::string gpu_id_string = ""; + *gpu_id = undefined_gpu_id; + int ret = ReadSysfsStr(gpuid_path, &gpu_id_string); + if (ret != 0 || gpu_id_string.empty()) { + ss << __PRETTY_FUNCTION__ + << " | File: " << gpuid_path + << " | Data (*gpu_id): empty or nullptr" + << " | Issue: Could not read node #" << std::to_string(this->node_indx_) + << ". KFD node was an unsupported node or value read was empty." 
+       << " | Return: "
+       << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false)
+       << " | ";
+    LOG_ERROR(ss);
+    return ret;
+  }
+  *gpu_id = std::stoull(gpu_id_string);
+  if (*gpu_id == 0) {  // CPU node - return not supported
+    *gpu_id = undefined_gpu_id;
+    ret = ENOENT;  // map to RSMI_STATUS_NOT_SUPPORTED
+  }
+  ss << __PRETTY_FUNCTION__
+     << " | File: " << gpuid_path
+     << " | Read node #: " << std::to_string(this->node_indx_)
+     << " | Data (*gpu_id): " << std::to_string(*gpu_id)
+     << " | Return: "
+     << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false)
+     << " | ";
+  LOG_DEBUG(ss);
+  return ret;
+}
+
+// Public interface for device: reports this node's KFD topology index, i.e.
+// its entry under /sys/class/kfd/kfd/topology/nodes/. Always returns 0.
+int KFDNode::get_node_id(uint32_t *node_id) {
+  std::ostringstream ss;
+  int ret = 0;
+  std::string nodeid_path = "/sys/class/kfd/kfd/topology/nodes/"
+                            + std::to_string(this->node_indx_);
+  *node_id = this->node_indx_;  // set first: the log below prints *node_id
+  ss << __PRETTY_FUNCTION__
+     << " | File: " << nodeid_path
+     << " | Read node #: " << std::to_string(this->node_indx_)
+     << " | Data (*node_id): " << std::to_string(*node_id)
+     << " | Return: "
+     << getRSMIStatusString(amd::smi::ErrnoToRsmiStatus(ret), false)
+     << " | ";
+  LOG_DEBUG(ss);
+  return ret;
+}
 }  // namespace smi
 }  // namespace amd
diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc
index 25c32b1f94..eb43d663c1 100755
--- a/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc
+++ b/projects/amdsmi/rocm_smi/src/rocm_smi_main.cc
@@ -235,15 +235,7 @@ RocmSMI::Initialize(uint64_t flags) {
   int i_ret;
   std::ostringstream ss;
-  LOG_ALWAYS("=============== ROCM SMI initialize ================");
-  ROCmLogging::Logger::getInstance()->enableAllLogLevels();
-  // Leaving below to allow developers to check current log settings
-  // std::string logSettings = Logger::getInstance()->getLogSettings();
-  // std::cout << "Current log settings:\n" << logSettings << std::endl;
-  if
(ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { - logSystemDetails(); - } assert(ref_count_ == 1); if (ref_count_ != 1) { @@ -259,6 +251,15 @@ RocmSMI::Initialize(uint64_t flags) { // To help debug env variable issues // debugRSMIEnvVarInfo(); + if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { + ROCmLogging::Logger::getInstance()->enableAllLogLevels(); + LOG_ALWAYS("=============== ROCM SMI initialize ================"); + logSystemDetails(); + } + // Leaving below to allow developers to check current log settings + // std::string logSettings = ROCmLogging::Logger::getInstance()->getLogSettings(); + // std::cout << "Current log settings:\n" << logSettings << std::endl; + while (!std::string(kAMDMonitorTypes[i]).empty()) { amd_monitor_types_.insert(kAMDMonitorTypes[i]); ++i; @@ -283,6 +284,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [before] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | (xgmi node) setting to setting " << "device->set_bdfid(device->bdfid())"; LOG_TRACE(ss); @@ -293,6 +295,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [before] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)"; LOG_TRACE(ss); device->set_bdfid(bdfid); @@ -301,6 +304,7 @@ RocmSMI::Initialize(uint64_t flags) { << " | [after] device->path() = " << device->path() << "\n | bdfid = " << bdfid << "\n | device->bdfid() = " << device->bdfid() + << " (" << print_int_as_hex(device->bdfid()) << ")" << "\n | final update: device->bdfid() holds correct device bdf"; LOG_TRACE(ss); } @@ -312,8 +316,11 @@ RocmSMI::Initialize(uint64_t flags) { for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) { dev = devices_[dv_ind]; uint64_t bdfid = dev->bdfid(); + bdfid 
= bdfid & 0xFFFFFFFF0FFFFFFF;  // clear out partition id in bdf
+    // NOTE: partition_id is not part of bdf (but is part of pci_id)
+    // which is why it is removed in sorting
     dv_to_id.push_back({bdfid, dev});
-    }
+  }
   ss << __PRETTY_FUNCTION__ << " Sort index based on BDF.";
   LOG_DEBUG(ss);
@@ -734,7 +741,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
           continue;
         sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId);
         if (cardId > max_cardId)
-            max_cardId = cardId;
+          max_cardId = cardId;
         count++;
       }
       dentry = readdir(drm_dir);
@@ -748,23 +755,47 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
     uint64_t s_gpu_id = 0;
     uint64_t s_unique_id = 0;
     uint64_t s_location_id = 0;
+    uint64_t s_bdf = 0;
+    uint64_t s_domain = 0;
+    uint8_t s_bus = 0;
+    uint8_t s_device = 0;
+    uint8_t s_function = 0;
+    uint8_t s_partition_id = 0;
+    uint64_t padding = 0;  // padding added in case new changes in future
   };
   // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id,
-  //                                     location_id}
+  //                                     location_id, bdf, domain, bus, device,
+  //                                     partition_id}
   std::multimap allSystemNodes;
   uint32_t node_id = 0;
+  static const int BYTE = 8;
   while (true) {
-    uint64_t gpu_id = 0, unique_id = 0, location_id = 0;
+    uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0;
     int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
     int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
     int ret_loc_id =
       read_node_properties(node_id, "location_id", &location_id);
-    if (ret_gpu_id == 0 || ret_unique_id == 0 || ret_loc_id == 0) {
+    int ret_domain =
+      read_node_properties(node_id, "domain", &domain);
+    if (ret_gpu_id == 0 &&  // logical '!' below: '~' on a bool is never 0
+      !(ret_unique_id != 0 || ret_loc_id != 0 || ret_domain != 0)) {
+      // Do not try to build a node if any of these fields
+      // does not exist in KFD (0 as values okay)
       systemNode myNode;
       myNode.s_node_id = node_id;
       myNode.s_gpu_id = gpu_id;
       myNode.s_unique_id = unique_id;
       myNode.s_location_id = location_id;
+      myNode.s_domain = domain & 0xFFFFFFFF;
+
myNode.s_bdf = (myNode.s_domain << 32) | (myNode.s_location_id); + myNode.s_location_id = myNode.s_bdf; + myNode.s_bdf |= ((domain & 0xFFFFFFFF) << 32); + myNode.s_location_id = myNode.s_bdf; + myNode.s_domain = myNode.s_location_id >> 32; + myNode.s_bus = ((myNode.s_location_id >> 8) & 0xFF); + myNode.s_device = ((myNode.s_location_id >> 3) & 0x1F); + myNode.s_function = myNode.s_location_id & 0x7; + myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF); if (gpu_id != 0) { // only add gpu nodes, 0 = CPU allSystemNodes.emplace(unique_id, myNode); } @@ -780,6 +811,12 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) << "; location_id = " << std::to_string(i.second.s_location_id) + << "; bdf = " << print_int_as_hex(i.second.s_bdf) + << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE) + << "; function = " << std::to_string(i.second.s_function) + << "; partition_id = " << std::to_string(i.second.s_partition_id) << "], "; } ss << "}"; @@ -817,13 +854,67 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { rsmi_status_t ret_unique_id = rsmi_dev_unique_id_get(cardAdded, &device_uuid); auto temp_numb_nodes = allSystemNodes.count(device_uuid); - auto it = allSystemNodes.lower_bound(device_uuid); - if (it != allSystemNodes.end() && doesDeviceSupportPartitions && temp_numb_nodes > 1 + auto primaryBdfId = + allSystemNodes.lower_bound(device_uuid)->second.s_location_id; + auto i = allSystemNodes.lower_bound(device_uuid); + if (doesDeviceSupportPartitions && temp_numb_nodes > 1 && ret_unique_id == RSMI_STATUS_SUCCESS) { - auto primaryBdfId = it->second.s_location_id; // helps identify xgmi nodes (secondary nodes) easier + ss << __PRETTY_FUNCTION__ << " | secondary node add ; " + << " BDF = " << 
std::to_string(primaryBdfId) + << " (" << print_int_as_hex(primaryBdfId) << ")"; + LOG_DEBUG(ss); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && i->second.s_partition_id == 0) { + i->second.s_partition_id = i->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(i->second.s_function) + << "; partition_id = " << std::to_string(i->second.s_partition_id); + LOG_DEBUG(ss); + } + ss << __PRETTY_FUNCTION__ + << " | (secondary node add) B4 AddToDeviceList() -->" + << "\n[node_id = " << std::to_string(i->second.s_node_id) + << "; gpu_id = " << std::to_string(i->second.s_gpu_id) + << "; unique_id = " << std::to_string(i->second.s_unique_id) + << "; location_id = " << std::to_string(i->second.s_location_id) + << "; bdf = " << print_int_as_hex(i->second.s_bdf) + << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE) + << "; function = " << std::to_string(i->second.s_function) + << "; partition_id = " << std::to_string(i->second.s_partition_id) + << "], "; + LOG_DEBUG(ss); AddToDeviceList(d_name, primaryBdfId); } else { + ss << __PRETTY_FUNCTION__ << " | primary node add ; " + << " BDF = " << std::to_string(UINT64_MAX); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && i->second.s_partition_id == 0) { + i->second.s_partition_id = i->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(i->second.s_function) + << "; partition_id = " << std::to_string(i->second.s_partition_id); + LOG_DEBUG(ss); + } + LOG_DEBUG(ss); + ss << __PRETTY_FUNCTION__ + << " | (primary node add) After AddToDeviceList() -->" + << "\n[node_id = " << std::to_string(i->second.s_node_id) + << "; 
gpu_id = " << std::to_string(i->second.s_gpu_id) + << "; unique_id = " << std::to_string(i->second.s_unique_id) + << "; location_id = " << std::to_string(i->second.s_location_id) + << "; bdf = " << print_int_as_hex(i->second.s_bdf) + << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE) + << "; function = " << std::to_string(i->second.s_function) + << "; partition_id = " << std::to_string(i->second.s_partition_id) + << "], "; + LOG_DEBUG(ss); AddToDeviceList(d_name, UINT64_MAX); } @@ -834,6 +925,12 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; gpu_id = " << std::to_string(i.second.s_gpu_id) << "; unique_id = " << std::to_string(i.second.s_unique_id) << "; location_id = " << std::to_string(i.second.s_location_id) + << "; bdf = " << print_int_as_hex(i.second.s_bdf) + << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE) + << "; function = " << std::to_string(i.second.s_function) + << "; partition_id = " << std::to_string(i.second.s_partition_id) << "], "; } ss << "}"; @@ -909,6 +1006,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { auto removalGpuId = it->second.s_gpu_id; auto removalUniqueId = it->second.s_unique_id; auto removalLocId = it->second.s_location_id; + auto removaldomain = it->second.s_domain; auto nodesErased = 1; primary_location_id = removalLocId; allSystemNodes.erase(it++); @@ -919,6 +1017,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { << "; gpu_id = " << std::to_string(removalGpuId) << "; unique_id = " << std::to_string(removalUniqueId) << "; location_id = " << std::to_string(removalLocId) + << "; removaldomain = " << std::to_string(removaldomain) << "]"; LOG_DEBUG(ss); } @@ -926,15 +1025,34 @@ uint32_t 
RocmSMI::DiscoverAmdgpuDevices(void) { break; } auto myBdfId = it->second.s_location_id; - AddToDeviceList(secNode, myBdfId); + ss << __PRETTY_FUNCTION__ << " | secondary node add #2; " + << " BDF = " << std::to_string(myBdfId) + << " (" << print_int_as_hex(myBdfId) << ")"; + LOG_DEBUG(ss); + if (doesDeviceSupportPartitions && strCompPartition != "SPX" + && it->second.s_partition_id == 0) { + it->second.s_partition_id = it->second.s_function; + ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - " + << "detected !SPX && partition_id == 0" + << "; function = " << std::to_string(it->second.s_function) + << "; partition_id = " << std::to_string(it->second.s_partition_id); + LOG_DEBUG(ss); + } ss << __PRETTY_FUNCTION__ - << "\nSECONDARY --> After adding new node; ERASING -> [node_id = " - << std::to_string(it->second.s_node_id) + << " | (secondary node add #2) B4 AddToDeviceList() -->" + << "\n[node_id = " << std::to_string(it->second.s_node_id) << "; gpu_id = " << std::to_string(it->second.s_gpu_id) << "; unique_id = " << std::to_string(it->second.s_unique_id) << "; location_id = " << std::to_string(it->second.s_location_id) - << "]"; + << "; bdf = " << print_int_as_hex(it->second.s_bdf) + << "; domain = " << print_int_as_hex(it->second.s_domain, true, 2*BYTE) + << "; bus = " << print_int_as_hex(it->second.s_bus, true, BYTE) + << "; device = " << print_int_as_hex(it->second.s_device, true, BYTE) + << "; function = " << std::to_string(it->second.s_function) + << "; partition_id = " << std::to_string(it->second.s_partition_id) + << "], "; LOG_DEBUG(ss); + AddToDeviceList(secNode, myBdfId); allSystemNodes.erase(it++); numb_nodes--; cardAdded++; diff --git a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc index 32f0209654..4bb045c6d7 100755 --- a/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/rocm_smi/src/rocm_smi_utils.cc @@ -1113,6 +1113,7 @@ static std::string 
print_pnt(rsmi_od_vddc_point_t *pt) { ss << "\t\t** Voltage: " << pt->voltage << " mV\n"; return ss.str(); } + static std::string pt_vddc_curve(rsmi_od_volt_curve *c) { std::ostringstream ss; if (c == nullptr) { @@ -1182,16 +1183,31 @@ bool is_sudo_user() { return isRunningWithSudo; } -rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, - std::string *gfx_version) { +// string output of gfx_ +rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version) { std::ostringstream ss; uint64_t kfd_gfx_version = 0; GET_DEV_AND_KFDNODE_FROM_INDX int ret = kfd_node->get_gfx_target_version(&kfd_gfx_version); + uint64_t orig_target_version = 0; + uint64_t major = 0; + uint64_t minor = 0; + uint64_t rev = 0; if (ret == 0) { - ss << "gfx" << kfd_gfx_version; - *gfx_version = ss.str(); + orig_target_version = std::stoull(std::to_string(kfd_gfx_version)); + // separate out parts -> put back into normal graphics version format + major = static_cast((orig_target_version / 10000) * 100); + minor = static_cast((orig_target_version % 10000 / 100) * 10); + if (minor == 0) major *= 10; // 0 as a minor is correct, but bump up by 10 + rev = static_cast(orig_target_version % 100); + *gfx_version = "gfx" + std::to_string(major + minor + rev); + ss << __PRETTY_FUNCTION__ + << " | " << std::dec << "kfd_target_version = " << orig_target_version + << "; major = " << major << "; minor = " << minor << "; rev = " + << rev << "\nReporting rsmi_get_gfx_target_version = " << *gfx_version + << "\n"; + LOG_INFO(ss); return RSMI_STATUS_SUCCESS; } else { *gfx_version = "Unknown"; diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 5fe6398f26..de2507cab6 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -753,18 +753,54 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i // default to 0xffff as not supported info->oam_id = std::numeric_limits::max(); 
uint16_t tmp_oam_id = 0; - status = rsmi_wrapper(rsmi_dev_oam_id_get, processor_handle, &(tmp_oam_id)); + status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, &(tmp_oam_id)); info->oam_id = tmp_oam_id; // default to 0xffffffff as not supported info->num_of_compute_units = std::numeric_limits::max(); auto tmp_num_of_compute_units = uint32_t(0); status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, - &tmp_num_of_compute_units); + &(tmp_num_of_compute_units)); if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { info->num_of_compute_units = tmp_num_of_compute_units; } + // default to 0xffffffffffffffff as not supported + info->target_graphics_version = std::numeric_limits::max(); + auto tmp_target_gfx_version = uint64_t(0); + status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, + &(tmp_target_gfx_version)); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + info->target_graphics_version = tmp_target_gfx_version; + } + + // default to 0xffffffffffffffff as not supported + info->kfd_id = std::numeric_limits::max(); + auto tmp_kfd_id = uint64_t(0); + status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, + &(tmp_kfd_id)); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + info->kfd_id = tmp_kfd_id; + } + + // default to 0xffffffff as not supported + info->node_id = std::numeric_limits::max(); + auto tmp_node_id = uint32_t(0); + status = rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, + &(tmp_node_id)); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + info->node_id = tmp_node_id; + } + + // default to 0xffffffff as not supported + info->partition_id = std::numeric_limits::max(); + auto tmp_partition_id = uint32_t(0); + status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, + &(tmp_partition_id)); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { + info->partition_id = tmp_partition_id; + } + return AMDSMI_STATUS_SUCCESS; } diff --git 
a/projects/amdsmi/src/amd_smi/amd_smi_drm.cc b/projects/amdsmi/src/amd_smi/amd_smi_drm.cc index a17ed40843..f68fd96aa6 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi_drm.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi_drm.cc @@ -52,6 +52,8 @@ #include "amd_smi/impl/amd_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_logger.h" namespace amd { namespace smi { @@ -173,10 +175,26 @@ amdsmi_status_t AMDSmiDrm::init() { } has_valid_fds = true; - bdf.function_number = device->businfo.pci->func; - bdf.device_number = device->businfo.pci->dev; - bdf.bus_number = device->businfo.pci->bus; - bdf.domain_number = device->businfo.pci->domain; + std::ostringstream ss; + uint64_t bdf_rocm = 0; + rsmi_dev_pci_id_get(i, &bdf_rocm); + ss << __PRETTY_FUNCTION__ << " | " + << "bdf_rocm | Received bdf: " + << "\nWhole BDF: " << amd::smi::print_unsigned_hex_and_int(bdf_rocm) + << "\nDomain = " + << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xFFFFFFFF00000000) >> 32) + << "; \nBus# = " << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xFF00) >> 8) + << "; \nDevice# = "<< amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0xF8) >> 3) + << "; \nFunction# = " << amd::smi::print_unsigned_hex_and_int((bdf_rocm & 0x7)); + LOG_INFO(ss); + bdf.function_number = ((bdf_rocm & 0x7)); + bdf.device_number = ((bdf_rocm & 0xF8) >> 3); + bdf.bus_number = ((bdf_rocm & 0xFF00) >> 8); + bdf.domain_number = ((bdf_rocm & 0xFFFFFFFF00000000) >> 32); + ss << __PRETTY_FUNCTION__ << " | " << "Received bdf: Domain = " << bdf.domain_number + << "; Bus# = " << bdf.bus_number << "; Device# = "<< bdf.device_number + << "; Function# = " << bdf.function_number; + LOG_INFO(ss); vendor_id = device->deviceinfo.pci->vendor_id; @@ -309,6 +327,14 @@ amdsmi_status_t AMDSmiDrm::get_drm_fd_by_index(uint32_t gpu_index, uint32_t *fd_ amdsmi_status_t AMDSmiDrm::get_bdf_by_index(uint32_t gpu_index, amdsmi_bdf_t *bdf_info) const 
{ if (gpu_index + 1 > drm_bdfs_.size()) return AMDSMI_STATUS_NOT_SUPPORTED; *bdf_info = drm_bdfs_[gpu_index]; + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | gpu_index = " << gpu_index + << "; \nreceived bdf: Domain = " << bdf_info->domain_number + << "; \nBus# = " << bdf_info->bus_number + << "; \nDevice# = " << bdf_info->device_number + << "; \nFunction# = " << bdf_info->function_number + << "\nReturning = AMDSMI_STATUS_SUCCESS"; + LOG_INFO(ss); return AMDSMI_STATUS_SUCCESS; } diff --git a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc index aa337bc210..1dcc38aebe 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc @@ -48,6 +48,7 @@ #include #include +#include #include #include "amd_smi/amdsmi.h" @@ -58,7 +59,9 @@ TestSysInfoRead::TestSysInfoRead() : TestBase() { set_title("AMDSMI System Info Read Test"); set_description("This test verifies that system information such as the " - "BDFID, AMDSMI version, VBIOS version, etc. can be read properly."); + "BDFID, AMDSMI version, VBIOS version, " + "vendor_id, unique_id, target_gfx_version, kfd_id, node_id, partition_id, etc. 
" + "can be read properly."); } TestSysInfoRead::~TestSysInfoRead(void) { @@ -150,22 +153,39 @@ void TestSysInfoRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + // vendor_id, unique_id, target_gfx_version, kfd_id, node_id, partition_id + amdsmi_asic_info_t asci_info = {}; + err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asci_info); if (err == AMDSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**amdsmi_dev_unique_id() is not supported" " on this machine" << std::endl; + EXPECT_EQ(asci_info.target_graphics_version, std::numeric_limits::max()); + EXPECT_EQ(asci_info.kfd_id, std::numeric_limits::max()); + EXPECT_EQ(asci_info.node_id, std::numeric_limits::max()); + EXPECT_EQ(asci_info.partition_id, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); } else { if (err == AMDSMI_STATUS_SUCCESS) { IF_VERB(STANDARD) { - std:: cout << "\t**GPU PCIe Vendor : " + std:: cout << "\t**GPU PCIe Vendor : " << asci_info.vendor_name << std::endl; + std::cout << "\t**Target GFX version: " << std::dec + << asci_info.target_graphics_version << "\n"; + std::cout << "\t**KFD ID: " << std::dec + << asci_info.kfd_id << "\n"; + std::cout << "\t**Node ID: " << std::dec + << asci_info.node_id << "\n"; + std::cout << "\t**Partition ID: " << std::dec + << asci_info.partition_id << "\n"; } + EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(asci_info.target_graphics_version, std::numeric_limits::max()); + EXPECT_NE(asci_info.kfd_id, std::numeric_limits::max()); + EXPECT_NE(asci_info.node_id, std::numeric_limits::max()); + EXPECT_NE(asci_info.partition_id, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, 
AMDSMI_STATUS_INVAL); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/temp_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/temp_read.cc index 5169047767..097771c653 100755 --- a/projects/amdsmi/tests/amd_smi_test/functional/temp_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/temp_read.cc @@ -137,8 +137,7 @@ void TestTempRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); IF_VERB(STANDARD) { - std::cout << "\t**" << label << ": " << val_i64/1000 << - "C" << std::endl; + std::cout << "\t**" << label << ": " << val_i64 << "C" << std::endl; } }; for (type = AMDSMI_TEMPERATURE_TYPE_FIRST; type <= AMDSMI_TEMPERATURE_TYPE__MAX; ++type) {