From 82096d7f74810ca1b3dce0b6138929b975171f52 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 17 Sep 2024 04:54:41 -0500 Subject: [PATCH] Moved KFD information to separate structure and API Signed-off-by: Maisam Arif Change-Id: If6eaea589edc704cf408d6391b5f2154134035e7 [ROCm/amdsmi commit: 3b7f661e71c057700d106fd8cc71afff80329547] --- projects/amdsmi/CHANGELOG.md | 103 ++++++++---------- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 48 +++++--- projects/amdsmi/amdsmi_cli/amdsmi_parser.py | 5 +- .../docs/how-to/using-amdsmi-for-python.md | 47 ++++++-- projects/amdsmi/include/amd_smi/amdsmi.h | 29 ++++- projects/amdsmi/py-interface/README.md | 47 ++++++-- projects/amdsmi/py-interface/__init__.py | 1 + .../amdsmi/py-interface/amdsmi_interface.py | 74 ++++++++----- .../amdsmi/py-interface/amdsmi_wrapper.py | 25 ++++- projects/amdsmi/src/amd_smi/amd_smi.cc | 49 ++++++--- .../amd_smi_test/functional/id_info_read.cc | 6 +- .../functional/mutual_exclusion.cc | 4 +- .../amd_smi_test/functional/sys_info_read.cc | 49 ++++++--- .../tests/python_unittest/integration_test.py | 10 +- projects/amdsmi/tools/amdsmi_quick_start.py | 6 +- 15 files changed, 328 insertions(+), 175 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 5f1e04f8ff..45d8ceb3a1 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -7,6 +7,7 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr ## amd_smi_lib for ROCm 6.3.0 ### Changes + - **Moved python tests directory path install location**. - `/opt//share/amd_smi/pytest/..` to `/opt//share/amd_smi/tests/python_unittest/..` - On amd-smi-lib-tests uninstall, the amd_smi tests folder is removed. @@ -43,14 +44,14 @@ If no topology argument is provided all topology information will be displayed. Topology arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] 
Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:0c:00.0 | UUID: 5fff74a1-0000-1000-808c-324a4d24b37e - ID: 1 | BDF: 0000:22:00.0 | UUID: 06ff74a1-0000-1000-80d3-f5e97636ae62 - ID: 2 | BDF: 0000:38:00.0 | UUID: 87ff74a1-0000-1000-80a0-d0a45576c5ed - ID: 3 | BDF: 0000:5c:00.0 | UUID: 5dff74a1-0000-1000-8054-a29c595fd7f3 - ID: 4 | BDF: 0000:9f:00.0 | UUID: a8ff74a1-0000-1000-805b-92615ca9e7b4 - ID: 5 | BDF: 0000:af:00.0 | UUID: ddff74a1-0000-1000-809e-5a98a60013bd - ID: 6 | BDF: 0000:bf:00.0 | UUID: 9aff74a1-0000-1000-80e8-cbefaf9f72c3 - ID: 7 | BDF: 0000:df:00.0 | UUID: 48ff74a1-0000-1000-806e-3c0b30d78e00 + ID: 0 | BDF: 0000:0c:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 1 | BDF: 0000:22:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 2 | BDF: 0000:38:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 3 | BDF: 0000:5c:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 4 | BDF: 0000:9f:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 5 | BDF: 0000:af:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 6 | BDF: 0000:bf:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 7 | BDF: 0000:df:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx all | Selects all devices @@ -184,7 +185,37 @@ Legend: 64,32 = 64 bit and 32 bit atomic support - ``` -- **Added Target_Graphics_Version, KFD_ID, Node_id, and partition id to `amd-smi static --asic`**. + +- **Created new amdsmi_kfd_info_t and added information under `amd-smi list`**. + - Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers were added to a new `amdsmi_kfd_info_t` struct, which is populated via the `amdsmi_get_gpu_kfd_info` API. + - This info has been added to the `amd-smi list` output. + - These new fields are only available for BM/Guest Linux devices at this time.
+ +```C +typedef struct { + uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t node_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[13]; +} amdsmi_kfd_info_t; +``` + +```shell +$ amd-smi list +GPU: 0 + BDF: 0000:23:00.0 + UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + KFD_ID: 45412 + NODE_ID: 1 + +GPU: 1 + BDF: 0000:26:00.0 + UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + KFD_ID: 59881 + NODE_ID: 2 +``` + +- **Added Target_Graphics_Version and partition id to `amd-smi static --asic`**. + Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers were placed within the `amdsmi_asic_info_t` struct. These new fields are only available for BM/Guest Linux devices at this time. @@ -201,15 +232,13 @@ typedef struct { uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported - uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported - uint32_t node_id; //< 0xFFFFFFFF if not supported uint32_t partition_id; //< 0xFFFFFFFF if not supported - uint32_t reserved[17]; + uint32_t reserved[14]; } amdsmi_asic_info_t; ``` ```shell -$ amd-smi static --asic --board --bus --partition +$ amd-smi static --asic --partition GPU: 0 ASIC: MARKET_NAME: MI308X @@ -226,57 +255,11 @@ GPU: 0 ASIC_SERIAL: OAM_ID: 5 NUM_COMPUTE_UNITS: 20 - BUS: - BDF: 0000:0A:00.0 - MAX_PCIE_WIDTH: 16 - MAX_PCIE_SPEED: 32 GT/s - PCIE_INTERFACE_VERSION: Gen 5 - SLOT_TYPE: PCIE - BOARD: - MODEL_NUMBER: 102-G30218-00 - PRODUCT_SERIAL: 692432000576 - FRU_ID: 113-AMDG302180002-0000000000000 - PRODUCT_NAME: AMD Instinct MI308X OAM - MANUFACTURER_NAME: AMD PARTITION: COMPUTE_PARTITION: CPX MEMORY_PARTITION: NPS4 - -GPU: 1 - ASIC: - MARKET_NAME: MI308X - VENDOR_ID: 0x1002 - VENDOR_NAME: Advanced Micro Devices Inc. 
[AMD/ATI] - SUBVENDOR_ID: 0x1002 - DEVICE_ID: 0x74a2 - TARGET_GRAPHICS_VERSION: gfx942 - KFD_ID: 41657 - NODE_ID: 3 - PARTITION_ID: 1 - SUBSYSTEM_ID: 0x74a2 - REV_ID: 0x00 - ASIC_SERIAL: - OAM_ID: 5 - NUM_COMPUTE_UNITS: 20 - BUS: - BDF: 0000:0A:00.1 - MAX_PCIE_WIDTH: 16 - MAX_PCIE_SPEED: 32 GT/s - PCIE_INTERFACE_VERSION: Gen 5 - SLOT_TYPE: PCIE - BOARD: - MODEL_NUMBER: 102-G30218-00 - PRODUCT_SERIAL: 692432000576 - FRU_ID: 113-AMDG302180002-0000000000000 - PRODUCT_NAME: AMD Instinct MI308X OAM - MANUFACTURER_NAME: AMD - PARTITION: - COMPUTE_PARTITION: CPX - MEMORY_PARTITION: NPS4 -... ``` - ### Removals - **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. @@ -1022,7 +1005,7 @@ Use the watch arguments to run continuously Monitor Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: 4eff74a0-0000-1000-802d-1d762a397f73 + ID: 0 | BDF: 0000:01:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx all | Selects all devices -U, --cpu CPU [CPU ...] 
Select a CPU ID from the possible choices: ID: 0 diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 21a7d73753..4915354fa9 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -157,6 +157,9 @@ class AMDSMICommands(): args.gpu = device_handle + # Get gpu_id for logging + gpu_id = self.helpers.get_gpu_id_from_device_handle(args.gpu) + try: bdf = amdsmi_interface.amdsmi_get_gpu_device_bdf(args.gpu) except amdsmi_exception.AmdSmiLibraryException as e: @@ -167,13 +170,25 @@ class AMDSMICommands(): except amdsmi_exception.AmdSmiLibraryException as e: uuid = e.get_error_info() + try: + kfd_info = amdsmi_interface.amdsmi_get_gpu_kfd_info(args.gpu) + kfd_id = kfd_info['kfd_id'] + node_id = kfd_info['node_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + kfd_id = node_id = e.get_error_info() + logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + # CSV format is intentionally aligned with Host if self.logger.is_csv_format(): self.logger.store_output(args.gpu, 'gpu_bdf', bdf) self.logger.store_output(args.gpu, 'gpu_uuid', uuid) + self.logger.store_output(args.gpu, 'kfd_id', kfd_id) + self.logger.store_output(args.gpu, 'node_id', node_id) else: self.logger.store_output(args.gpu, 'bdf', bdf) self.logger.store_output(args.gpu, 'uuid', uuid) + self.logger.store_output(args.gpu, 'kfd_id', kfd_id) + self.logger.store_output(args.gpu, 'node_id', node_id) if multiple_devices: self.logger.store_multiple_device_output() @@ -354,28 +369,35 @@ class AMDSMICommands(): # Populate static dictionary for each enabled argument static_dict = {} if args.asic: + asic_dict = { + "market_name" : "N/A", + "vendor_id" : "N/A", + "vendor_name" : "N/A", + "subvendor_id" : "N/A", + "device_id" : "N/A", + "subsystem_id" : "N/A", + "rev_id" : "N/A", + "asic_serial" : "N/A", + "oam_id" : "N/A", + "num_compute_units" : "N/A", + 
"target_graphics_version" : "N/A", + "partition_id" : "N/A" + } + try: asic_info = amdsmi_interface.amdsmi_get_gpu_asic_info(args.gpu) - static_dict["asic"] = asic_info + for key, value in asic_info.items(): + asic_dict[key] = value except amdsmi_exception.AmdSmiLibraryException as e: - static_dict["asic"] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) try: subsystem_id = amdsmi_interface.amdsmi_get_gpu_subsystem_id(args.gpu) - if static_dict["asic"] != "N/A": - # Reorder asic to include subsystem_id after device_id - static_dict["asic"]["subsystem_id"] = subsystem_id - static_dict["asic"]["rev_id"] = static_dict["asic"].pop("rev_id") - static_dict["asic"]["asic_serial"] = static_dict["asic"].pop("asic_serial") - static_dict["asic"]["oam_id"] = static_dict["asic"].pop("oam_id") - static_dict["asic"]["num_compute_units"] = static_dict["asic"].pop("num_compute_units") - else: - static_dict["asic"]["subsystem_id"] = subsystem_id + asic_dict["subsystem_id"] = subsystem_id except amdsmi_exception.AmdSmiLibraryException as e: - if static_dict["asic"] != "N/A": - static_dict["asic"]["subsystem_id"] = "N/A" logging.debug("Failed to get asic info for gpu %s | %s", gpu_id, e.get_error_info()) + + static_dict['asic'] = asic_dict if args.bus: bus_info = { 'bdf': "N/A", diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py index 917f9731c4..be58f7b0fe 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_parser.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_parser.py @@ -566,9 +566,8 @@ class AMDSMIParser(argparse.ArgumentParser): # Subparser help text list_help = "List GPU information" - list_subcommand_help = "Lists all the devices on the system and the links between devices.\ - \nLists all the sockets and for each socket, GPUs and/or CPUs associated to\ - \nthat socket alongside some basic information for each device.\ + list_subcommand_help = "Lists all detected devices on the system\ + 
\nLists the BDF, UUID, KFD_ID, and NODE_ID for each GPU and/or CPUs\ \nIn virtualization environments, it can also list VFs associated to each\ \nGPU with some basic information for each VF." diff --git a/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md b/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md index 797486ca8e..40edc84f8c 100644 --- a/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md +++ b/projects/amdsmi/docs/how-to/using-amdsmi-for-python.md @@ -377,6 +377,8 @@ Field | Content `rev_id` | revision id `asic_serial` | asic serial `oam_id` | oam id +`num_compute_units` | number of compute units on asic +`target_graphics_version` | hardware graphics version Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function: @@ -394,13 +396,44 @@ try: else: for device in devices: asic_info = amdsmi_get_gpu_asic_info(device) - print(asic_info['market_name']) - print(hex(asic_info['vendor_id'])) - print(asic_info['vendor_name']) - print(hex(asic_info['device_id'])) - print(hex(asic_info['rev_id'])) - print(asic_info['asic_serial']) - print(asic_info['oam_id']) + print(asic_info) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_gpu_kfd_info + +Description: Returns KFD (Kernel Fusion Driver) information for the given GPU. +The `kfd_id` field corresponds to the GUID reported by rocm-smi. + +Input parameters: + +* `processor_handle` device which to query + +Output: Dictionary with fields + +Field | Content +---|--- +`kfd_id` | KFD's unique GPU identifier +`node_id` | KFD's internal GPU index + +Exceptions that can be thrown by `amdsmi_get_gpu_kfd_info` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + kfd_info = amdsmi_get_gpu_kfd_info(device) + print(kfd_info) except AmdSmiException as e: print(e) ``` diff --git
a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 6b75e7e475..cd0dceff70 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -590,11 +590,15 @@ typedef struct { uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported + uint32_t partition_id; //< 0xFFFFFFFF if not supported + uint32_t reserved[14]; +} amdsmi_asic_info_t; + +typedef struct { uint64_t kfd_id; //< 0xFFFFFFFFFFFFFFFF if not supported uint32_t node_id; //< 0xFFFFFFFF if not supported - uint32_t partition_id; //< 0xFFFFFFFF if not supported - uint32_t reserved[11]; -} amdsmi_asic_info_t; + uint32_t reserved[13]; +} amdsmi_kfd_info_t; typedef enum { AMDSMI_LINK_TYPE_PCIE, @@ -4716,6 +4720,25 @@ amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_handle, amdsmi_driv amdsmi_status_t amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info); +/** + * @brief Returns the KFD (Kernel Fusion Driver) information for the device + * + * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf} + * + * @details This function returns KFD information populated into the amdsmi_kfd_info_t. + * This contains the kfd_id and node_id which allow for the ID and + * index of this device in the KFD. + * + * @param[in] processor_handle Device which to query + * + * @param[out] info Reference to kfd information structure. + * Must be allocated by user. 
+ * + * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail + */ +amdsmi_status_t +amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle, amdsmi_kfd_info_t *info); + /** * @brief Returns vram info * diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 797486ca8e..40edc84f8c 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -377,6 +377,8 @@ Field | Content `rev_id` | revision id `asic_serial` | asic serial `oam_id` | oam id +`num_compute_units` | number of compute units on asic +`target_graphics_version` | hardware graphics version Exceptions that can be thrown by `amdsmi_get_gpu_asic_info` function: @@ -394,13 +396,44 @@ try: else: for device in devices: asic_info = amdsmi_get_gpu_asic_info(device) - print(asic_info['market_name']) - print(hex(asic_info['vendor_id'])) - print(asic_info['vendor_name']) - print(hex(asic_info['device_id'])) - print(hex(asic_info['rev_id'])) - print(asic_info['asic_serial']) - print(asic_info['oam_id']) + print(asic_info) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_get_gpu_kfd_info + +Description: Returns KFD (Kernel Fusion Driver) information for the given GPU. +The `kfd_id` field corresponds to the GUID reported by rocm-smi. + +Input parameters: + +* `processor_handle` device which to query + +Output: Dictionary with fields + +Field | Content +---|--- +`kfd_id` | KFD's unique GPU identifier +`node_id` | KFD's internal GPU index + +Exceptions that can be thrown by `amdsmi_get_gpu_kfd_info` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + kfd_info = amdsmi_get_gpu_kfd_info(device) + print(kfd_info) except AmdSmiException as e: print(e) ``` diff --git a/projects/amdsmi/py-interface/__init__.py
b/projects/amdsmi/py-interface/__init__.py index 13e3221401..12822f7f77 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ b/projects/amdsmi/py-interface/__init__.py @@ -89,6 +89,7 @@ from .amdsmi_interface import amdsmi_get_gpu_driver_info # # ASIC and Bus Static Information from .amdsmi_interface import amdsmi_get_gpu_asic_info +from .amdsmi_interface import amdsmi_get_gpu_kfd_info from .amdsmi_interface import amdsmi_get_power_cap_info from .amdsmi_interface import amdsmi_get_gpu_vram_info from .amdsmi_interface import amdsmi_get_gpu_cache_info diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index dd5c676a01..9fb9a83173 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -575,7 +575,7 @@ def _make_amdsmi_bdf_from_list(bdf): amdsmi_bdf.struct_amdsmi_bdf_t.domain_number = bdf[0] return amdsmi_bdf -def _padHexValue(value, length): +def _pad_hex_value(value, length): """ Pad a hexadecimal value with a given length of zeros :param value: A hexadecimal value to be padded with zeros @@ -590,23 +590,23 @@ def _padHexValue(value, length): return '0x' + value[2:].zfill(length) return value -class UIntegerTypes(IntEnum): +class MaxUIntegerTypes(IntEnum): UINT8_T = 0xFF UINT16_T = 0xFFFF UINT32_T = 0xFFFFFFFF UINT64_T = 0xFFFFFFFFFFFFFFFF -def _validateIfMaxUint(valToCheck, uintType: UIntegerTypes): +def _validate_if_max_uint(value, uint_type: MaxUIntegerTypes): return_val = "N/A" - if not isinstance(valToCheck, list): - if valToCheck == uintType: + if not isinstance(value, list): + if value == uint_type: return return_val else: - return valToCheck + return value else: - return_val = valToCheck - for idx, v in enumerate(valToCheck): - if v == uintType: + return_val = value + for idx, v in enumerate(value): + if v == uint_type: return_val[idx] = "N/A" return return_val @@ -1656,18 +1656,16 @@ def amdsmi_get_gpu_asic_info( ) asic_info = { 
- "market_name": _padHexValue(asic_info_struct.market_name.decode("utf-8"), 4), + "market_name": _pad_hex_value(asic_info_struct.market_name.decode("utf-8"), 4), "vendor_id": asic_info_struct.vendor_id, "vendor_name": asic_info_struct.vendor_name.decode("utf-8"), "subvendor_id": asic_info_struct.subvendor_id, "device_id": asic_info_struct.device_id, - "rev_id": _padHexValue(hex(asic_info_struct.rev_id), 2), + "rev_id": _pad_hex_value(hex(asic_info_struct.rev_id), 2), "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": asic_info_struct.oam_id, "num_compute_units": asic_info_struct.num_of_compute_units, "target_graphics_version": "gfx" + str(asic_info_struct.target_graphics_version), - "kfd_id": asic_info_struct.kfd_id, - "node_id": asic_info_struct.node_id, "partition_id": asic_info_struct.partition_id } @@ -1705,6 +1703,28 @@ def amdsmi_get_gpu_asic_info( return asic_info +def amdsmi_get_gpu_kfd_info( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle, +) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + kfd_info_struct = amdsmi_wrapper.amdsmi_kfd_info_t() + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_kfd_info( + processor_handle, ctypes.byref(kfd_info_struct)) + ) + + kfd_info = { + "kfd_id": _validate_if_max_uint(kfd_info_struct.kfd_id, MaxUIntegerTypes.UINT64_T), + "node_id": _validate_if_max_uint(kfd_info_struct.node_id, MaxUIntegerTypes.UINT32_T) + } + + return kfd_info + + def amdsmi_get_power_cap_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -1999,10 +2019,10 @@ def amdsmi_get_gpu_board_info( ) board_info_dict = { - "model_number": _padHexValue(board_info.model_number.decode("utf-8").strip(), 4), + "model_number": _pad_hex_value(board_info.model_number.decode("utf-8").strip(), 4), "product_serial":
board_info.product_serial.decode("utf-8").strip(), "fru_id": board_info.fru_id.decode("utf-8").strip(), - "product_name": _padHexValue(board_info.product_name.decode("utf-8").strip(), 4), + "product_name": _pad_hex_value(board_info.product_name.decode("utf-8").strip(), 4), "manufacturer_name": board_info.manufacturer_name.decode("utf-8").strip() } @@ -2301,20 +2321,20 @@ def amdsmi_get_pcie_info( pcie_info_dict = { "pcie_static": { - "max_pcie_width": _validateIfMaxUint(pcie_info.pcie_static.max_pcie_width, UIntegerTypes.UINT16_T), - "max_pcie_speed": _validateIfMaxUint(pcie_info.pcie_static.max_pcie_speed, UIntegerTypes.UINT32_T), - "pcie_interface_version": _validateIfMaxUint(pcie_info.pcie_static.pcie_interface_version, UIntegerTypes.UINT32_T), + "max_pcie_width": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_width, MaxUIntegerTypes.UINT16_T), + "max_pcie_speed": _validate_if_max_uint(pcie_info.pcie_static.max_pcie_speed, MaxUIntegerTypes.UINT32_T), + "pcie_interface_version": _validate_if_max_uint(pcie_info.pcie_static.pcie_interface_version, MaxUIntegerTypes.UINT32_T), "slot_type": pcie_info.pcie_static.slot_type, }, "pcie_metric": { - "pcie_width": _validateIfMaxUint(pcie_info.pcie_metric.pcie_width, UIntegerTypes.UINT16_T), - "pcie_speed": _validateIfMaxUint(pcie_info.pcie_metric.pcie_speed, UIntegerTypes.UINT32_T), - "pcie_bandwidth": _validateIfMaxUint(pcie_info.pcie_metric.pcie_bandwidth, UIntegerTypes.UINT32_T), - "pcie_replay_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_replay_count, UIntegerTypes.UINT64_T), - "pcie_l0_to_recovery_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_l0_to_recovery_count, UIntegerTypes.UINT64_T), - "pcie_replay_roll_over_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_replay_roll_over_count, UIntegerTypes.UINT64_T), - "pcie_nak_sent_count": _validateIfMaxUint(pcie_info.pcie_metric.pcie_nak_sent_count, UIntegerTypes.UINT64_T), - "pcie_nak_received_count": 
_validateIfMaxUint(pcie_info.pcie_metric.pcie_nak_received_count, UIntegerTypes.UINT64_T), + "pcie_width": _validate_if_max_uint(pcie_info.pcie_metric.pcie_width, MaxUIntegerTypes.UINT16_T), + "pcie_speed": _validate_if_max_uint(pcie_info.pcie_metric.pcie_speed, MaxUIntegerTypes.UINT32_T), + "pcie_bandwidth": _validate_if_max_uint(pcie_info.pcie_metric.pcie_bandwidth, MaxUIntegerTypes.UINT32_T), + "pcie_replay_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_count, MaxUIntegerTypes.UINT64_T), + "pcie_l0_to_recovery_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_l0_to_recovery_count, MaxUIntegerTypes.UINT64_T), + "pcie_replay_roll_over_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_replay_roll_over_count, MaxUIntegerTypes.UINT64_T), + "pcie_nak_sent_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_sent_count, MaxUIntegerTypes.UINT64_T), + "pcie_nak_received_count": _validate_if_max_uint(pcie_info.pcie_metric.pcie_nak_received_count, MaxUIntegerTypes.UINT64_T), } } @@ -2407,7 +2427,7 @@ def amdsmi_get_gpu_subsystem_id(processor_handle: amdsmi_wrapper.amdsmi_processo processor_handle, ctypes.byref(id)) ) - return _padHexValue(hex(id.value), 4) + return _pad_hex_value(hex(id.value), 4) def amdsmi_get_gpu_subsystem_name(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index c2b3f8f5b3..24c59370c4 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -904,14 +904,23 @@ struct_amdsmi_asic_info_t._fields_ = [ ('num_of_compute_units', ctypes.c_uint32), ('PADDING_0', ctypes.c_ubyte * 4), ('target_graphics_version', ctypes.c_uint64), - ('kfd_id', ctypes.c_uint64), - ('node_id', ctypes.c_uint32), ('partition_id', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 17), + ('reserved', ctypes.c_uint32 * 14), ('PADDING_1', ctypes.c_ubyte * 4), ] 
amdsmi_asic_info_t = struct_amdsmi_asic_info_t +class struct_amdsmi_kfd_info_t(Structure): + pass + +struct_amdsmi_kfd_info_t._pack_ = 1 # source:False +struct_amdsmi_kfd_info_t._fields_ = [ + ('kfd_id', ctypes.c_uint64), + ('node_id', ctypes.c_uint32), + ('reserved', ctypes.c_uint32 * 13), +] + +amdsmi_kfd_info_t = struct_amdsmi_kfd_info_t # values for enumeration 'amdsmi_link_type_t' amdsmi_link_type_t__enumvalues = { @@ -2265,6 +2274,9 @@ amdsmi_get_gpu_driver_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(s amdsmi_get_gpu_asic_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_asic_info amdsmi_get_gpu_asic_info.restype = amdsmi_status_t amdsmi_get_gpu_asic_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_asic_info_t)] +amdsmi_get_gpu_kfd_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_kfd_info +amdsmi_get_gpu_kfd_info.restype = amdsmi_status_t +amdsmi_get_gpu_kfd_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_kfd_info_t)] amdsmi_get_gpu_vram_info = _libraries['libamd_smi.so'].amdsmi_get_gpu_vram_info amdsmi_get_gpu_vram_info.restype = amdsmi_status_t amdsmi_get_gpu_vram_info.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_vram_info_t)] @@ -2696,7 +2708,8 @@ __all__ = \ 'amdsmi_get_gpu_ecc_enabled', 'amdsmi_get_gpu_ecc_status', 'amdsmi_get_gpu_event_notification', 'amdsmi_get_gpu_fan_rpms', 'amdsmi_get_gpu_fan_speed', 'amdsmi_get_gpu_fan_speed_max', - 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_mem_overdrive_level', + 'amdsmi_get_gpu_id', 'amdsmi_get_gpu_kfd_info', + 'amdsmi_get_gpu_mem_overdrive_level', 'amdsmi_get_gpu_memory_partition', 'amdsmi_get_gpu_memory_reserved_pages', 'amdsmi_get_gpu_memory_total', 'amdsmi_get_gpu_memory_usage', @@ -2742,7 +2755,7 @@ __all__ = \ 'amdsmi_init', 'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification', 'amdsmi_io_bw_encoding_t', 'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible', - 'amdsmi_is_gpu_power_management_enabled', + 
'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t', 'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t', 'amdsmi_link_type_t', 'amdsmi_memory_page_status_t', 'amdsmi_memory_partition_type_t', 'amdsmi_memory_type_t', @@ -2802,7 +2815,7 @@ __all__ = \ 'struct_amdsmi_freq_volt_region_t', 'struct_amdsmi_frequencies_t', 'struct_amdsmi_frequency_range_t', 'struct_amdsmi_fw_info_t', 'struct_amdsmi_gpu_cache_info_t', 'struct_amdsmi_gpu_metrics_t', - 'struct_amdsmi_hsmp_metrics_table_t', + 'struct_amdsmi_hsmp_metrics_table_t', 'struct_amdsmi_kfd_info_t', 'struct_amdsmi_link_id_bw_type_t', 'struct_amdsmi_link_metrics_t', 'struct_amdsmi_name_value_t', 'struct_amdsmi_od_vddc_point_t', 'struct_amdsmi_od_volt_curve_t', diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index b8195d7a6b..47144f4ee4 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -774,24 +774,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->target_graphics_version = tmp_target_gfx_version; } - // default to 0xffffffffffffffff as not supported - info->kfd_id = std::numeric_limits<uint64_t>::max(); - auto tmp_kfd_id = uint64_t(0); - status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, - &(tmp_kfd_id)); - if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { - info->kfd_id = tmp_kfd_id; - } - - // default to 0xffffffff as not supported - info->node_id = std::numeric_limits<uint32_t>::max(); - auto tmp_node_id = uint32_t(0); - status = rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, - &(tmp_node_id)); - if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { - info->node_id = tmp_node_id; - } - // default to 0xffffffff as not supported info->partition_id = std::numeric_limits<uint32_t>::max(); auto tmp_partition_id = uint32_t(0); @@ -804,6 +786,37 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i return AMDSMI_STATUS_SUCCESS; } +amdsmi_status_t
amdsmi_get_gpu_kfd_info(amdsmi_processor_handle processor_handle, + amdsmi_kfd_info_t *info) { + AMDSMI_CHECK_INIT(); + + if (info == nullptr) { + return AMDSMI_STATUS_INVAL; + } + + amdsmi_status_t status; + // default to 0xffffffffffffffff as not supported + info->kfd_id = std::numeric_limits<uint64_t>::max(); + auto tmp_kfd_id = uint64_t(0); + status = rsmi_wrapper(rsmi_dev_guid_get, processor_handle, &(tmp_kfd_id)); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } else { + info->kfd_id = tmp_kfd_id; + } + + // default to 0xffffffff as not supported + info->node_id = std::numeric_limits<uint32_t>::max(); + auto tmp_node_id = uint32_t(0); + status = rsmi_wrapper(rsmi_dev_node_id_get, processor_handle, &(tmp_node_id)); + if (status != AMDSMI_STATUS_SUCCESS) { + return status; + } else { + info->node_id = tmp_node_id; + } + + return AMDSMI_STATUS_SUCCESS; +} amdsmi_status_t amdsmi_get_gpu_subsystem_id(amdsmi_processor_handle processor_handle, uint16_t *id) { diff --git a/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc index 2f7236a1b2..de92faf5cd 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/id_info_read.cc @@ -123,8 +123,8 @@ void TestIdInfoRead::Run(void) { } // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + amdsmi_asic_info_t asic_info; + err = amdsmi_get_gpu_asic_info(processor_handles_[0], &asic_info); CHK_ERR_ASRT(err) // device name, brand, serial_number @@ -215,7 +215,7 @@ void TestIdInfoRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**Sub-system Vendor ID: 0x" << std::hex << - asci_info.subvendor_id << std::endl; + asic_info.subvendor_id << std::endl; } err = amdsmi_get_gpu_vendor_name(processor_handles_[i], buffer, kBufferLen); diff --git a/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc
b/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc index 48c692fcdd..f7bef7ce5d 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/mutual_exclusion.cc @@ -200,8 +200,8 @@ void TestMutualExclusion::Run(void) { ret = amdsmi_get_gpu_id(processor_handles_[0], &dmy_ui16); // vendor_id, unique_id - amdsmi_asic_info_t asci_info; - ret = amdsmi_get_gpu_asic_info(processor_handles_[0], &asci_info); + amdsmi_asic_info_t asic_info; + ret = amdsmi_get_gpu_asic_info(processor_handles_[0], &asic_info); CHECK_RET(ret, AMDSMI_STATUS_BUSY); // device name, brand, serial_number diff --git a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc index b64d37f1e7..6c4a4b7717 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc @@ -153,17 +153,15 @@ void TestSysInfoRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - // vendor_id, unique_id, target_gfx_version, kfd_id, node_id, partition_id - amdsmi_asic_info_t asci_info = {}; - err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asci_info); + // vendor_id, unique_id, target_gfx_version, partition_id + amdsmi_asic_info_t asic_info = {}; + err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asic_info); if (err == AMDSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**amdsmi_dev_unique_id() is not supported" " on this machine" << std::endl; - EXPECT_EQ(asci_info.target_graphics_version, std::numeric_limits<uint64_t>::max()); - EXPECT_EQ(asci_info.kfd_id, std::numeric_limits<uint64_t>::max()); - EXPECT_EQ(asci_info.node_id, std::numeric_limits<uint32_t>::max()); - EXPECT_EQ(asci_info.partition_id, std::numeric_limits<uint32_t>::max()); + EXPECT_EQ(asic_info.target_graphics_version, std::numeric_limits<uint64_t>::max()); + EXPECT_EQ(asic_info.partition_id, std::numeric_limits<uint32_t>::max()); // Verify api support checking
functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); @@ -171,21 +169,15 @@ void TestSysInfoRead::Run(void) { if (err == AMDSMI_STATUS_SUCCESS) { IF_VERB(STANDARD) { std:: cout << "\t**GPU PCIe Vendor : " - << asci_info.vendor_name << std::endl; + << asic_info.vendor_name << std::endl; std::cout << "\t**Target GFX version: " << std::dec - << asci_info.target_graphics_version << "\n"; - std::cout << "\t**KFD ID: " << std::dec - << asci_info.kfd_id << "\n"; - std::cout << "\t**Node ID: " << std::dec - << asci_info.node_id << "\n"; + << asic_info.target_graphics_version << "\n"; std::cout << "\t**Partition ID: " << std::dec - << asci_info.partition_id << "\n"; + << asic_info.partition_id << "\n"; } EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); - EXPECT_NE(asci_info.target_graphics_version, std::numeric_limits::max()); - EXPECT_NE(asci_info.kfd_id, std::numeric_limits::max()); - EXPECT_NE(asci_info.node_id, std::numeric_limits::max()); - EXPECT_NE(asci_info.partition_id, std::numeric_limits::max()); + EXPECT_NE(asic_info.target_graphics_version, std::numeric_limits::max()); + EXPECT_NE(asic_info.partition_id, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); @@ -195,6 +187,27 @@ void TestSysInfoRead::Run(void) { } } + // kfd_id, node_id + amdsmi_kfd_info_t kfd_info = {}; + err = amdsmi_get_gpu_kfd_info(processor_handles_[i], &kfd_info); + if (err != AMDSMI_STATUS_SUCCESS) { + EXPECT_EQ(kfd_info.kfd_id, std::numeric_limits::max()); + EXPECT_EQ(kfd_info.node_id, std::numeric_limits::max()); + } else { + IF_VERB(STANDARD) { + std::cout << "\t**KFD ID: " << std::dec + << kfd_info.kfd_id << "\n"; + std::cout << "\t**Node ID: " << std::dec + << kfd_info.node_id << "\n"; + } + EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); + EXPECT_NE(kfd_info.kfd_id, 
std::numeric_limits::max()); + EXPECT_NE(kfd_info.node_id, std::numeric_limits::max()); + } + // Verify api support checking functionality is working + err = amdsmi_get_gpu_kfd_info(processor_handles_[i], nullptr); + ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); + err = amdsmi_get_lib_version(&ver); CHK_ERR_ASRT(err) diff --git a/projects/amdsmi/tests/python_unittest/integration_test.py b/projects/amdsmi/tests/python_unittest/integration_test.py index 2a3367323c..9e1f12bbe3 100755 --- a/projects/amdsmi/tests/python_unittest/integration_test.py +++ b/projects/amdsmi/tests/python_unittest/integration_test.py @@ -511,12 +511,14 @@ def walk_through(self): asic_info['oam_id'])) print(" asic_info['target_graphics_version'] is: {}\n".format( asic_info['target_graphics_version'])) - print(" asic_info['kfd_id'] is: {}\n".format( - asic_info['kfd_id'])) - print(" asic_info['node_id'] is: {}\n".format( - asic_info['node_id'])) print(" asic_info['partition_id'] is: {}\n".format( asic_info['partition_id'])) + print("\n###Test amdsmi_get_gpu_kfd_info \n") + kfd_info = amdsmi.amdsmi_get_gpu_kfd_info(processors[i]) + print(" kfd_info['kfd_id'] is: {}\n".format( + kfd_info['kfd_id'])) + print(" kfd_info['node_id'] is: {}\n".format( + kfd_info['node_id'])) print("###Test amdsmi_get_power_cap_info \n") power_info = amdsmi.amdsmi_get_power_cap_info(processors[i]) print(" power_info['dpm_cap'] is: {}".format( diff --git a/projects/amdsmi/tools/amdsmi_quick_start.py b/projects/amdsmi/tools/amdsmi_quick_start.py index fb08e6d1e6..4cec5be4a1 100644 --- a/projects/amdsmi/tools/amdsmi_quick_start.py +++ b/projects/amdsmi/tools/amdsmi_quick_start.py @@ -23,15 +23,13 @@ # This is not meant to serve best practices for development. 
# Run this post install with python3 -i quick_start.py - -from amdsmi import * -from pathlib import Path - import atexit import logging import signal import sys +from amdsmi import * +from pathlib import Path # Make exit & quit work without parens because it's annoying type(exit).__repr__ = sys.exit