diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 45d8ceb3a1..1135bd1838 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -16,18 +16,15 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - **Added more supported utilization count types to `amdsmi_get_utilization_count()`**. - **Added `amd-smi set -L/--clk-limit ...` command**. -Equivalent to rocm-smi's '--extremum' command which sets sclk's or mclk's soft minimum or soft maximum clock frequency. + - Equivalent to rocm-smi's '--extremum' command which sets sclk's or mclk's soft minimum or soft maximum clock frequency. - **Added Pytest functionality to test amdsmi API calls in Python**. - **Changed the `power` parameter in `amdsmi_get_energy_count()` to `energy_accumulator`**. -Changes propagate forwards into the python interface as well, however we are maintaing backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. + - Changes propagate forwards into the python interface as well, however we are maintaining backwards compatibility and keeping the `power` field in the python API until ROCm 6.4. - **Added GPU memory overdrive percentage to `amd-smi metric -o`**. -Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. - -- **Added Subsystem Device ID to `amd-smi static --asic`**. -No underlying changes to amdsmi_get_gpu_asic_info + - Added `amdsmi_get_gpu_mem_overdrive_level()` function to amd-smi C and Python Libraries. - **Added retrieving connection type and P2P capabilities between two GPUs**. - Added `amdsmi_topo_get_p2p_status` function to amd-smi C and Python Libraries. @@ -44,14 +41,14 @@ If no topology argument is provided all topology information will be displayed. Topology arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] 
Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:0c:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 1 | BDF: 0000:22:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 2 | BDF: 0000:38:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 3 | BDF: 0000:5c:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 4 | BDF: 0000:9f:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 5 | BDF: 0000:af:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 6 | BDF: 0000:bf:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx - ID: 7 | BDF: 0000:df:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 0 | BDF: 0000:0c:00.0 | UUID: + ID: 1 | BDF: 0000:22:00.0 | UUID: + ID: 2 | BDF: 0000:38:00.0 | UUID: + ID: 3 | BDF: 0000:5c:00.0 | UUID: + ID: 4 | BDF: 0000:9f:00.0 | UUID: + ID: 5 | BDF: 0000:af:00.0 | UUID: + ID: 6 | BDF: 0000:bf:00.0 | UUID: + ID: 7 | BDF: 0000:df:00.0 | UUID: all | Selects all devices @@ -75,62 +72,7 @@ Command Modifiers: ``` ```shell -$ amd-smi topology -ACCESS TABLE: - 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 -0000:0c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:22:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:38:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:5c:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:9f:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:af:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:bf:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED -0000:df:00.0 ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED ENABLED - -WEIGHT TABLE: - 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 -0000:0c:00.0 0 15 15 15 15 15 15 15 -0000:22:00.0 15 0 15 15 15 15 15 15 -0000:38:00.0 15 15 0 15 15 15 
15 15 -0000:5c:00.0 15 15 15 0 15 15 15 15 -0000:9f:00.0 15 15 15 15 0 15 15 15 -0000:af:00.0 15 15 15 15 15 0 15 15 -0000:bf:00.0 15 15 15 15 15 15 0 15 -0000:df:00.0 15 15 15 15 15 15 15 0 - -HOPS TABLE: - 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 -0000:0c:00.0 0 1 1 1 1 1 1 1 -0000:22:00.0 1 0 1 1 1 1 1 1 -0000:38:00.0 1 1 0 1 1 1 1 1 -0000:5c:00.0 1 1 1 0 1 1 1 1 -0000:9f:00.0 1 1 1 1 0 1 1 1 -0000:af:00.0 1 1 1 1 1 0 1 1 -0000:bf:00.0 1 1 1 1 1 1 0 1 -0000:df:00.0 1 1 1 1 1 1 1 0 - -LINK TYPE TABLE: - 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 -0000:0c:00.0 SELF XGMI XGMI XGMI XGMI XGMI XGMI XGMI -0000:22:00.0 XGMI SELF XGMI XGMI XGMI XGMI XGMI XGMI -0000:38:00.0 XGMI XGMI SELF XGMI XGMI XGMI XGMI XGMI -0000:5c:00.0 XGMI XGMI XGMI SELF XGMI XGMI XGMI XGMI -0000:9f:00.0 XGMI XGMI XGMI XGMI SELF XGMI XGMI XGMI -0000:af:00.0 XGMI XGMI XGMI XGMI XGMI SELF XGMI XGMI -0000:bf:00.0 XGMI XGMI XGMI XGMI XGMI XGMI SELF XGMI -0000:df:00.0 XGMI XGMI XGMI XGMI XGMI XGMI XGMI SELF - -NUMA BW TABLE: - 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 -0000:0c:00.0 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 -0000:22:00.0 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 -0000:38:00.0 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 -0000:5c:00.0 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 50000-50000 -0000:9f:00.0 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 50000-50000 -0000:af:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 50000-50000 -0000:bf:00.0 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 50000-50000 N/A 50000-50000 -0000:df:00.0 50000-50000 50000-50000 50000-50000 
50000-50000 50000-50000 50000-50000 50000-50000 N/A - +$ amd-smi topology -cndz CACHE COHERANCY TABLE: 0000:0c:00.0 0000:22:00.0 0000:38:00.0 0000:5c:00.0 0000:9f:00.0 0000:af:00.0 0000:bf:00.0 0000:df:00.0 0000:0c:00.0 SELF C NC NC C C C NC @@ -203,22 +145,40 @@ typedef struct { $ amd-smi list GPU: 0 BDF: 0000:23:00.0 - UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + UUID: KFD_ID: 45412 NODE_ID: 1 + PARTITION_ID: 0 GPU: 1 BDF: 0000:26:00.0 - UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + UUID: KFD_ID: 59881 NODE_ID: 2 + PARTITION_ID: 0 ``` -- **Added Target_Graphics_Version and partition id to `amd-smi static --asic`**. +- **Added Subsystem Device ID to `amd-smi static --asic`**. + - No underlying changes to amdsmi_get_gpu_asic_info -Due to fixes needed to properly enumerate all logical GPUs in CPX, new device identifiers -were placed within the `amdsmi_asic_info_t` struct. These new fields are only available for BM/Guest Linux -devices at this time. +```shell +$ amd-smi static --asic +GPU: 0 + ASIC: + MARKET_NAME: MI308X + VENDOR_ID: 0x1002 + VENDOR_NAME: Advanced Micro Devices Inc. [AMD/ATI] + SUBVENDOR_ID: 0x1002 + DEVICE_ID: 0x74a2 + SUBSYSTEM_ID: 0x74a2 + REV_ID: 0x00 + ASIC_SERIAL: + OAM_ID: 5 + NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +- **Added Target_Graphics_Version to `amd-smi static --asic` and `amdsmi_get_gpu_asic_info()`**. ```C typedef struct { @@ -232,13 +192,12 @@ typedef struct { uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported - uint32_t partition_id; //< 0xFFFFFFFF if not supported - uint32_t reserved[14]; + uint32_t reserved[15]; } amdsmi_asic_info_t; ``` ```shell -$ amd-smi static --asic --partition +$ amd-smi static --asic GPU: 0 ASIC: MARKET_NAME: MI308X @@ -246,47 +205,102 @@ GPU: 0 VENDOR_NAME: Advanced Micro Devices Inc. 
[AMD/ATI] SUBVENDOR_ID: 0x1002 DEVICE_ID: 0x74a2 - TARGET_GRAPHICS_VERSION: gfx942 - KFD_ID: 24248 - NODE_ID: 2 - PARTITION_ID: 0 SUBSYSTEM_ID: 0x74a2 REV_ID: 0x00 ASIC_SERIAL: OAM_ID: 5 NUM_COMPUTE_UNITS: 20 + TARGET_GRAPHICS_VERSION: gfx942 +``` + +- **Updated Partition APIs and struct information and added partition_id to `amd-smi static --partition` & `amd-smi list`**. + - As part of an overhaul to partition information, some partition information will be made available in the `amdsmi_accelerator_partition_profile_t`. + - This struct will be filled out by a new API, `amdsmi_get_gpu_accelerator_partition_profile()`. + - Future data from these APIs will eventually get added to `static --partition`. + +```C +#define AMDSMI_MAX_ACCELERATOR_PROFILE 32 +#define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 +#define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 + +/** + * @brief Accelerator Partition. This enum is used to identify + * various accelerator partitioning settings. + */ +typedef enum { + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory +} amdsmi_accelerator_partition_type_t; + +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, the length of resources array + uint32_t profile_index; // The index in the profiles array in amdsmi_compute_partition_profile_t + uint32_t num_resources; // length of 
index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint32_t reserved[12]; +} amdsmi_accelerator_partition_profile_t; +``` + +```shell +$ amd-smi static --partition +GPU: 0 PARTITION: COMPUTE_PARTITION: CPX MEMORY_PARTITION: NPS4 + PARTITION_ID: 0 + +$ amd-smi list +GPU: 0 + BDF: 0000:23:00.0 + UUID: + KFD_ID: 45412 + NODE_ID: 1 + PARTITION_ID: 0 + +GPU: 1 + BDF: 0000:26:00.0 + UUID: + KFD_ID: 59881 + NODE_ID: 2 + PARTITION_ID: 0 ``` ### Removals - **Removed usage of _validate_positive in Parser and replaced with _positive_int and _not_negative_int as appropriate**. -This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) + - This will allow 0 to be a valid input for several options in setting CPUs where appropriate (for example, as a mode or NBIOID) ### Optimizations - **Adjusted ordering of gpu_metrics calls to ensure that pcie_bw values remain stable in `amd-smi metric` & `amd-smi monitor`**. -With this change additional padding was added to PCIE_BW `amd-smi monitor --pcie` + - With this change additional padding was added to PCIE_BW `amd-smi monitor --pcie` ### Resolved issues - **Improved Offline install process & lowered dependency for PyYAML**. - **Fixed CPX not showing total number of logical GPUs**. -Updates were made to `amdsmi_init()` and `amdsmi_get_gpu_bdf_id(..)`. In order to display all logical devices, we needed a way to provide order to GPU's enumerated. This was done + - Updates were made to `amdsmi_init()` and `amdsmi_get_gpu_bdf_id(..)`. In order to display all logical devices, we needed a way to provide order to GPU's enumerated. This was done by adding a partition_id within the BDF optional pci_id bits. - -Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `amdsmi_get_gpu_bdf_id(..)`, we provided this fallback to properly retreive partition ID. 
We + - Due to driver changes in KFD, some devices may report bits [31:28] or [2:0]. With the newly added `amdsmi_get_gpu_bdf_id(..)`, we provided this fallback to properly retrieve partition ID. We plan to eventually remove partition ID from the function portion of the BDF (Bus Device Function). See below for PCI ID description. - - bits [63:32] = domain - - bits [31:28] or bits [2:0] = partition id - - bits [27:16] = reserved - - bits [15:8] = Bus - - bits [7:3] = Device - - bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes + - bits [63:32] = domain + - bits [31:28] or bits [2:0] = partition id + - bits [27:16] = reserved + - bits [15:8] = Bus + - bits [7:3] = Device + - bits [2:0] = Function (partition id may be in bits [2:0]) <-- Fallback for non SPX modes Previously in non-SPX modes (ex. CPX/TPX/DPX/etc) some MI3x ASICs would not report all logical GPU devices within AMD SMI. @@ -329,6 +343,8 @@ GPU POWER GPU_TEMP MEM_TEMP VRAM_USED VRAM_TOTAL - **Fixed incorrect implementation of the Python API `amdsmi_get_gpu_metrics_header_info()`**. +- **`amd-smi static --partition` will have updates with additional partition information from `amdsmi_get_gpu_accelerator_partition_profile()`**. + ### Known issues - N/A @@ -1005,7 +1021,7 @@ Use the watch arguments to run continuously Monitor Arguments: -h, --help show this help message and exit -g, --gpu GPU [GPU ...] Select a GPU ID, BDF, or UUID from the possible choices: - ID: 0 | BDF: 0000:01:00.0 | UUID: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + ID: 0 | BDF: 0000:01:00.0 | UUID: all | Selects all devices -U, --cpu CPU [CPU ...] 
Select a CPU ID from the possible choices: ID: 0 diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 4915354fa9..6bd534413f 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -175,20 +175,29 @@ class AMDSMICommands(): kfd_id = kfd_info['kfd_id'] node_id = kfd_info['node_id'] except amdsmi_exception.AmdSmiLibraryException as e: - kfd_id = node_id = e.get_error_info() + kfd_id = node_id = "N/A" logging.debug("Failed to get kfd info for gpu %s | %s", gpu_id, e.get_error_info()) + try: + partition_info = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) + partition_id = partition_info['partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id = "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + # CSV format is intentionally aligned with Host if self.logger.is_csv_format(): self.logger.store_output(args.gpu, 'gpu_bdf', bdf) self.logger.store_output(args.gpu, 'gpu_uuid', uuid) self.logger.store_output(args.gpu, 'kfd_id', kfd_id) self.logger.store_output(args.gpu, 'node_id', node_id) + self.logger.store_output(args.gpu, 'partition_id', partition_id) else: self.logger.store_output(args.gpu, 'bdf', bdf) self.logger.store_output(args.gpu, 'uuid', uuid) self.logger.store_output(args.gpu, 'kfd_id', kfd_id) self.logger.store_output(args.gpu, 'node_id', node_id) + self.logger.store_output(args.gpu, 'partition_id', partition_id) if multiple_devices: self.logger.store_multiple_device_output() @@ -380,8 +389,7 @@ class AMDSMICommands(): "asic_serial" : "N/A", "oam_id" : "N/A", "num_compute_units" : "N/A", - "target_graphics_version" : "N/A", - "partition_id" : "N/A" + "target_graphics_version" : "N/A" } try: @@ -679,8 +687,16 @@ class AMDSMICommands(): memory_partition = "N/A" logging.debug("Failed to get memory partition info for gpu %s | %s", gpu_id, 
e.get_error_info()) + try: + partition_info = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) + partition_id = partition_info['partition_id'] + except amdsmi_exception.AmdSmiLibraryException as e: + partition_id = "N/A" + logging.debug("Failed to get partition ID for gpu %s | %s", gpu_id, e.get_error_info()) + static_dict['partition'] = {"compute_partition": compute_partition, - "memory_partition": memory_partition} + "memory_partition": memory_partition, + "partition_id": partition_id} if 'soc_pstate' in current_platform_args: if args.soc_pstate: try: @@ -4996,4 +5012,4 @@ class AMDSMICommands(): except Exception as e: print(e) - listener.stop() \ No newline at end of file + listener.stop() diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index cd0dceff70..0c681ee62b 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -87,6 +87,9 @@ typedef enum { #define AMDSMI_MAX_CONTAINER_TYPE 2 #define AMDSMI_MAX_CACHE_TYPES 10 #define AMDSMI_MAX_NUM_XGMI_PHYSICAL_LINK 64 +#define AMDSMI_MAX_ACCELERATOR_PROFILE 32 +#define AMDSMI_MAX_CP_PROFILE_RESOURCES 32 +#define AMDSMI_MAX_ACCELERATOR_PARTITIONS 8 #define AMDSMI_GPU_UUID_SIZE 38 @@ -275,6 +278,24 @@ typedef enum { AMDSMI_CLK_TYPE__MAX = AMDSMI_CLK_TYPE_DCLK1 } amdsmi_clk_type_t; +/** + * @brief Accelerator Partition. This enum is used to identify + * various accelerator partitioning settings. 
+ */ +typedef enum { + AMDSMI_ACCELERATOR_PARTITION_INVALID = 0, + AMDSMI_ACCELERATOR_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + AMDSMI_ACCELERATOR_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory + AMDSMI_ACCELERATOR_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory +} amdsmi_accelerator_partition_type_t; + /** * @brief Compute Partition. This enum is used to identify * various compute partitioning settings. @@ -590,8 +611,7 @@ typedef struct { uint32_t oam_id; //< 0xFFFF if not supported uint32_t num_of_compute_units; //< 0xFFFFFFFF if not supported uint64_t target_graphics_version; //< 0xFFFFFFFFFFFFFFFF if not supported - uint32_t partition_id; //< 0xFFFFFFFF if not supported - uint32_t reserved[14]; + uint32_t reserved[15]; } amdsmi_asic_info_t; typedef struct { @@ -600,6 +620,15 @@ typedef struct { uint32_t reserved[13]; } amdsmi_kfd_info_t; +typedef struct { + amdsmi_accelerator_partition_type_t profile_type; // SPX, DPX, QPX, CPX and so on + uint32_t num_partitions; // On MI300X, SPX: 1, DPX: 2, QPX: 4, CPX: 8, the length of resources array + uint32_t profile_index; // The index in the profiles array in amdsmi_accelerator_partition_profile_t + uint32_t num_resources; // length of index_of_resources_profile + uint32_t resources[AMDSMI_MAX_ACCELERATOR_PARTITIONS][AMDSMI_MAX_CP_PROFILE_RESOURCES]; + uint64_t reserved[6]; +} amdsmi_accelerator_partition_profile_t; + typedef enum { AMDSMI_LINK_TYPE_PCIE, AMDSMI_LINK_TYPE_XGMI, @@ -4517,6 +4546,23 @@ amdsmi_status_t amdsmi_reset_gpu_memory_partition(amdsmi_processor_handle proces /** @} */ // end of memory_partition 
+/*****************************************************************************/ +/** @defgroup accelerator_partition_profile Accelerator Partition Profile Functions + * These functions are used to configure and query the device's + * accelerator partition profile setting. + * @{ + */ +// TODO: declare rest of partition profile functions and complete doc commentary. +/* + Get the current accelerator partition profile. The function will return current profile. +*/ +amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_t *profile, + uint32_t *partition_id); + +/** @} */ // end of accelerator_partition_profile + /*****************************************************************************/ /** @defgroup EvntNotif Event Notification Functions * These functions are used to configure for and get asynchronous event diff --git a/projects/amdsmi/py-interface/README.md b/projects/amdsmi/py-interface/README.md index 40edc84f8c..dc4001403e 100644 --- a/projects/amdsmi/py-interface/README.md +++ b/projects/amdsmi/py-interface/README.md @@ -2102,6 +2102,7 @@ except AmdSmiException as e: ``` ### amdsmi_set_gpu_process_isolation + Description: Enable/disable the system Process Isolation for the given device handle. Input parameters: @@ -2132,6 +2133,7 @@ except AmdSmiException as e: ``` ### amdsmi_clean_gpu_local_data + Description: Clear the SRAM data of the given device. This can be called between user logins to prevent information leak. 
Input parameters: @@ -2160,7 +2162,6 @@ except AmdSmiException as e: print(e) ``` - ### amdsmi_get_gpu_overdrive_level Description: Get the overdrive percent associated with the device with provided @@ -3826,6 +3827,44 @@ except AmdSmiException as e: print(e) ``` +### amdsmi_get_gpu_accelerator_partition_profile + +**Note: CURRENTLY HARDCODED TO RETURN EMPTY VALUES** + +Description: Get partition information for target device + +Input parameters: + +* `processor_handle` the device handle + +Output: Dictionary with fields: + +Field | Description +---|--- +`partition_id` | ID of the partition on the GPU provided +`partition_profile` | Dict containing partition data (TBD) + +Exceptions that can be thrown by `amdsmi_get_gpu_accelerator_partition_profile` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + partition_id = amdsmi_get_gpu_accelerator_partition_profile(device)["partition_id"] + print(partition_id) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_get_xgmi_info Description: Returns XGMI information for the GPU. 
diff --git a/projects/amdsmi/py-interface/__init__.py b/projects/amdsmi/py-interface/__init__.py index 12822f7f77..e0ffcd2c28 100644 --- a/projects/amdsmi/py-interface/__init__.py +++ b/projects/amdsmi/py-interface/__init__.py @@ -224,6 +224,7 @@ from .amdsmi_interface import amdsmi_reset_gpu_compute_partition from .amdsmi_interface import amdsmi_get_gpu_memory_partition from .amdsmi_interface import amdsmi_set_gpu_memory_partition from .amdsmi_interface import amdsmi_reset_gpu_memory_partition +from .amdsmi_interface import amdsmi_get_gpu_accelerator_partition_profile # # Individual GPU Metrics Functions from .amdsmi_interface import amdsmi_get_gpu_metrics_header_info diff --git a/projects/amdsmi/py-interface/amdsmi_interface.py b/projects/amdsmi/py-interface/amdsmi_interface.py index 9fb9a83173..43bffa1d92 100644 --- a/projects/amdsmi/py-interface/amdsmi_interface.py +++ b/projects/amdsmi/py-interface/amdsmi_interface.py @@ -1665,8 +1665,7 @@ def amdsmi_get_gpu_asic_info( "asic_serial": asic_info_struct.asic_serial.decode("utf-8"), "oam_id": asic_info_struct.oam_id, "num_compute_units": asic_info_struct.num_of_compute_units, - "target_graphics_version": "gfx" + str(asic_info_struct.target_graphics_version), - "partition_id": asic_info_struct.partition_id + "target_graphics_version": "gfx" + str(asic_info_struct.target_graphics_version) } string_values = ["market_name", "vendor_name"] @@ -1746,6 +1745,7 @@ def amdsmi_get_power_cap_info( "min_power_cap": power_info.min_power_cap, "max_power_cap": power_info.max_power_cap} + def amdsmi_get_gpu_pm_metrics_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -1773,6 +1773,7 @@ def amdsmi_get_gpu_pm_metrics_info( amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) return results + def amdsmi_get_gpu_reg_table_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, reg_type: amdsmi_wrapper.amdsmi_reg_type_t, @@ -1801,6 +1802,7 @@ def amdsmi_get_gpu_reg_table_info( 
amdsmi_wrapper.amdsmi_free_name_value_pairs(pm_metrics) return results + def amdsmi_get_gpu_vram_info( processor_handle: amdsmi_wrapper.amdsmi_processor_handle, ) -> Dict[str, Any]: @@ -2564,6 +2566,7 @@ def amdsmi_topo_get_link_type( return {"hops": hops.value, "type": type.value} + def amdsmi_topo_get_p2p_status( processor_handle_src: amdsmi_wrapper.amdsmi_processor_handle, processor_handle_dst: amdsmi_wrapper.amdsmi_processor_handle, @@ -2716,6 +2719,36 @@ def amdsmi_reset_gpu_memory_partition(processor_handle: amdsmi_wrapper.amdsmi_pr _check_res(amdsmi_wrapper.amdsmi_reset_gpu_memory_partition(processor_handle)) +def amdsmi_get_gpu_accelerator_partition_profile( + processor_handle: amdsmi_wrapper.amdsmi_processor_handle + ) -> Dict[str, Any]: + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + partition_id = ctypes.c_uint32() + profile = amdsmi_wrapper.amdsmi_accelerator_partition_profile_t() + + _check_res( + amdsmi_wrapper.amdsmi_get_gpu_accelerator_partition_profile(processor_handle, + ctypes.byref(profile), + ctypes.byref(partition_id)) + ) + + partition_profile_dict = { + "profile_type" : profile.profile_type, + "num_partitions" : profile.num_partitions, + "profile_index" : profile.profile_index, + "num_resources" : profile.num_resources, + "resources" : "N/A" + } + + return { + "partition_id" : partition_id.value, + "partition_profile" : partition_profile_dict + } + + def amdsmi_get_xgmi_info(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( diff --git a/projects/amdsmi/py-interface/amdsmi_wrapper.py b/projects/amdsmi/py-interface/amdsmi_wrapper.py index 24c59370c4..8089a2abbf 100644 --- a/projects/amdsmi/py-interface/amdsmi_wrapper.py +++ b/projects/amdsmi/py-interface/amdsmi_wrapper.py @@ -377,6 +377,23 @@ 
AMDSMI_CLK_TYPE_DCLK1 = 9 AMDSMI_CLK_TYPE__MAX = 9 amdsmi_clk_type_t = ctypes.c_uint32 # enum +# values for enumeration 'amdsmi_accelerator_partition_type_t' +amdsmi_accelerator_partition_type_t__enumvalues = { + 0: 'AMDSMI_ACCELERATOR_PARTITION_INVALID', + 1: 'AMDSMI_ACCELERATOR_PARTITION_SPX', + 2: 'AMDSMI_ACCELERATOR_PARTITION_DPX', + 3: 'AMDSMI_ACCELERATOR_PARTITION_TPX', + 4: 'AMDSMI_ACCELERATOR_PARTITION_QPX', + 5: 'AMDSMI_ACCELERATOR_PARTITION_CPX', +} +AMDSMI_ACCELERATOR_PARTITION_INVALID = 0 +AMDSMI_ACCELERATOR_PARTITION_SPX = 1 +AMDSMI_ACCELERATOR_PARTITION_DPX = 2 +AMDSMI_ACCELERATOR_PARTITION_TPX = 3 +AMDSMI_ACCELERATOR_PARTITION_QPX = 4 +AMDSMI_ACCELERATOR_PARTITION_CPX = 5 +amdsmi_accelerator_partition_type_t = ctypes.c_uint32 # enum + # values for enumeration 'amdsmi_compute_partition_type_t' amdsmi_compute_partition_type_t__enumvalues = { 0: 'AMDSMI_COMPUTE_PARTITION_INVALID', @@ -759,19 +776,6 @@ amdsmi_card_form_factor_t = ctypes.c_uint32 # enum class struct_amdsmi_pcie_info_t(Structure): pass -class struct_pcie_static_(Structure): - pass - -struct_pcie_static_._pack_ = 1 # source:False -struct_pcie_static_._fields_ = [ - ('max_pcie_width', ctypes.c_uint16), - ('PADDING_0', ctypes.c_ubyte * 2), - ('max_pcie_speed', ctypes.c_uint32), - ('pcie_interface_version', ctypes.c_uint32), - ('slot_type', amdsmi_card_form_factor_t), - ('reserved', ctypes.c_uint64 * 10), -] - class struct_pcie_metric_(Structure): pass @@ -790,6 +794,19 @@ struct_pcie_metric_._fields_ = [ ('reserved', ctypes.c_uint64 * 13), ] +class struct_pcie_static_(Structure): + pass + +struct_pcie_static_._pack_ = 1 # source:False +struct_pcie_static_._fields_ = [ + ('max_pcie_width', ctypes.c_uint16), + ('PADDING_0', ctypes.c_ubyte * 2), + ('max_pcie_speed', ctypes.c_uint32), + ('pcie_interface_version', ctypes.c_uint32), + ('slot_type', amdsmi_card_form_factor_t), + ('reserved', ctypes.c_uint64 * 10), +] + struct_amdsmi_pcie_info_t._pack_ = 1 # source:False 
struct_amdsmi_pcie_info_t._fields_ = [ ('pcie_static', struct_pcie_static_), @@ -904,8 +921,7 @@ struct_amdsmi_asic_info_t._fields_ = [ ('num_of_compute_units', ctypes.c_uint32), ('PADDING_0', ctypes.c_ubyte * 4), ('target_graphics_version', ctypes.c_uint64), - ('partition_id', ctypes.c_uint32), - ('reserved', ctypes.c_uint32 * 14), + ('reserved', ctypes.c_uint32 * 15), ('PADDING_1', ctypes.c_ubyte * 4), ] @@ -921,6 +937,20 @@ struct_amdsmi_kfd_info_t._fields_ = [ ] amdsmi_kfd_info_t = struct_amdsmi_kfd_info_t +class struct_amdsmi_accelerator_partition_profile_t(Structure): + pass + +struct_amdsmi_accelerator_partition_profile_t._pack_ = 1 # source:False +struct_amdsmi_accelerator_partition_profile_t._fields_ = [ + ('profile_type', amdsmi_accelerator_partition_type_t), + ('num_partitions', ctypes.c_uint32), + ('profile_index', ctypes.c_uint32), + ('num_resources', ctypes.c_uint32), + ('resources', ctypes.c_uint32 * 32 * 8), + ('reserved', ctypes.c_uint64 * 6), +] + +amdsmi_accelerator_partition_profile_t = struct_amdsmi_accelerator_partition_profile_t # values for enumeration 'amdsmi_link_type_t' amdsmi_link_type_t__enumvalues = { @@ -2250,6 +2280,9 @@ amdsmi_set_gpu_memory_partition.argtypes = [amdsmi_processor_handle, amdsmi_memo amdsmi_reset_gpu_memory_partition = _libraries['libamd_smi.so'].amdsmi_reset_gpu_memory_partition amdsmi_reset_gpu_memory_partition.restype = amdsmi_status_t amdsmi_reset_gpu_memory_partition.argtypes = [amdsmi_processor_handle] +amdsmi_get_gpu_accelerator_partition_profile = _libraries['libamd_smi.so'].amdsmi_get_gpu_accelerator_partition_profile +amdsmi_get_gpu_accelerator_partition_profile.restype = amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile.argtypes = [amdsmi_processor_handle, ctypes.POINTER(struct_amdsmi_accelerator_partition_profile_t), ctypes.POINTER(ctypes.c_uint32)] amdsmi_init_gpu_event_notification = _libraries['libamd_smi.so'].amdsmi_init_gpu_event_notification amdsmi_init_gpu_event_notification.restype = 
amdsmi_status_t amdsmi_init_gpu_event_notification.argtypes = [amdsmi_processor_handle] @@ -2447,7 +2480,12 @@ amdsmi_get_esmi_err_msg = _libraries['libamd_smi.so'].amdsmi_get_esmi_err_msg amdsmi_get_esmi_err_msg.restype = amdsmi_status_t amdsmi_get_esmi_err_msg.argtypes = [amdsmi_status_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))] __all__ = \ - ['AGG_BW0', 'AMDSMI_AVERAGE_POWER', + ['AGG_BW0', 'AMDSMI_ACCELERATOR_PARTITION_CPX', + 'AMDSMI_ACCELERATOR_PARTITION_DPX', + 'AMDSMI_ACCELERATOR_PARTITION_INVALID', + 'AMDSMI_ACCELERATOR_PARTITION_QPX', + 'AMDSMI_ACCELERATOR_PARTITION_SPX', + 'AMDSMI_ACCELERATOR_PARTITION_TPX', 'AMDSMI_AVERAGE_POWER', 'AMDSMI_CACHE_PROPERTY_CPU_CACHE', 'AMDSMI_CACHE_PROPERTY_DATA_CACHE', 'AMDSMI_CACHE_PROPERTY_ENABLED', @@ -2651,21 +2689,23 @@ __all__ = \ 'AMDSMI_XGMI_STATUS_MULTIPLE_ERRORS', 'AMDSMI_XGMI_STATUS_NO_ERRORS', 'CLK_LIMIT_MAX', 'CLK_LIMIT_MIN', 'RD_BW0', 'WR_BW0', 'amd_metrics_table_header_t', - 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', - 'amdsmi_board_info_t', 'amdsmi_cache_property_type_t', - 'amdsmi_card_form_factor_t', 'amdsmi_clean_gpu_local_data', - 'amdsmi_clk_info_t', 'amdsmi_clk_limit_type_t', - 'amdsmi_clk_type_t', 'amdsmi_compute_partition_type_t', - 'amdsmi_container_types_t', 'amdsmi_counter_command_t', - 'amdsmi_counter_value_t', 'amdsmi_cpu_apb_disable', - 'amdsmi_cpu_apb_enable', 'amdsmi_cpusocket_handle', - 'amdsmi_ddr_bw_metrics_t', 'amdsmi_dev_perf_level_t', - 'amdsmi_dimm_power_t', 'amdsmi_dimm_thermal_t', - 'amdsmi_dpm_level_t', 'amdsmi_dpm_policy_entry_t', - 'amdsmi_dpm_policy_t', 'amdsmi_driver_info_t', - 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', - 'amdsmi_event_group_t', 'amdsmi_event_handle_t', - 'amdsmi_event_type_t', 'amdsmi_evt_notification_data_t', + 'amdsmi_accelerator_partition_profile_t', + 'amdsmi_accelerator_partition_type_t', 'amdsmi_asic_info_t', + 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', + 'amdsmi_cache_property_type_t', 
'amdsmi_card_form_factor_t', + 'amdsmi_clean_gpu_local_data', 'amdsmi_clk_info_t', + 'amdsmi_clk_limit_type_t', 'amdsmi_clk_type_t', + 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', + 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', + 'amdsmi_cpu_apb_disable', 'amdsmi_cpu_apb_enable', + 'amdsmi_cpusocket_handle', 'amdsmi_ddr_bw_metrics_t', + 'amdsmi_dev_perf_level_t', 'amdsmi_dimm_power_t', + 'amdsmi_dimm_thermal_t', 'amdsmi_dpm_level_t', + 'amdsmi_dpm_policy_entry_t', 'amdsmi_dpm_policy_t', + 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', + 'amdsmi_error_count_t', 'amdsmi_event_group_t', + 'amdsmi_event_handle_t', 'amdsmi_event_type_t', + 'amdsmi_evt_notification_data_t', 'amdsmi_evt_notification_type_t', 'amdsmi_first_online_core_on_cpu_socket', 'amdsmi_free_name_value_pairs', 'amdsmi_freq_ind_t', @@ -2695,6 +2735,7 @@ __all__ = \ 'amdsmi_get_cpu_socket_temperature', 'amdsmi_get_cpucore_handles', 'amdsmi_get_cpusocket_handles', 'amdsmi_get_energy_count', 'amdsmi_get_esmi_err_msg', 'amdsmi_get_fw_info', + 'amdsmi_get_gpu_accelerator_partition_profile', 'amdsmi_get_gpu_activity', 'amdsmi_get_gpu_asic_info', 'amdsmi_get_gpu_available_counters', 'amdsmi_get_gpu_bad_page_info', 'amdsmi_get_gpu_bdf_id', @@ -2804,6 +2845,7 @@ __all__ = \ 'amdsmi_vram_vendor_type_t', 'amdsmi_xgmi_info_t', 'amdsmi_xgmi_status_t', 'processor_type_t', 'size_t', 'struct__links', 'struct_amd_metrics_table_header_t', + 'struct_amdsmi_accelerator_partition_profile_t', 'struct_amdsmi_asic_info_t', 'struct_amdsmi_board_info_t', 'struct_amdsmi_clk_info_t', 'struct_amdsmi_counter_value_t', 'struct_amdsmi_ddr_bw_metrics_t', 'struct_amdsmi_dimm_power_t', diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 47144f4ee4..7b7eda3a2d 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -774,15 +774,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i 
info->target_graphics_version = tmp_target_gfx_version; } - // default to 0xffffffff as not supported - info->partition_id = std::numeric_limits::max(); - auto tmp_partition_id = uint32_t(0); - status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, - &(tmp_partition_id)); - if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) { - info->partition_id = tmp_partition_id; - } - return AMDSMI_STATUS_SUCCESS; } @@ -1168,6 +1159,24 @@ amdsmi_reset_gpu_memory_partition(amdsmi_processor_handle processor_handle) { return rsmi_wrapper(rsmi_dev_memory_partition_reset, processor_handle); } +amdsmi_status_t +amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_handle, + amdsmi_accelerator_partition_profile_t *profile, + uint32_t *partition_id) { + AMDSMI_CHECK_INIT(); + // TODO: also fill out profile later + // default to 0xffffffff if not supported + *partition_id = std::numeric_limits::max(); + auto tmp_partition_id = uint32_t(0); + + amdsmi_status_t status = rsmi_wrapper(rsmi_dev_partition_id_get, processor_handle, &tmp_partition_id); + if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS){ + *partition_id = tmp_partition_id; + } + + return status; +} + // TODO(bliu) : other xgmi related information amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info) { @@ -1303,8 +1312,8 @@ void amdsmi_free_name_value_pairs(void *p) { amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle, - uint32_t sensor_ind, - amdsmi_power_cap_info_t *info) { + uint32_t sensor_ind, + amdsmi_power_cap_info_t *info) { AMDSMI_CHECK_INIT(); if (info == nullptr) diff --git a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc index 6c4a4b7717..7c9b7fd6b7 100644 --- a/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc +++ b/projects/amdsmi/tests/amd_smi_test/functional/sys_info_read.cc @@ -60,7 +60,7 @@ 
TestSysInfoRead::TestSysInfoRead() : TestBase() { set_title("AMDSMI System Info Read Test"); set_description("This test verifies that system information such as the " "BDFID, AMDSMI version, VBIOS version, " - "vendor_id, unique_id, target_gfx_version, kfd_id, node_id, partition_id, etc. " + "vendor_id, unique_id, target_gfx_version, kfd_id, node_id, etc. " "can be read properly."); } @@ -153,7 +153,7 @@ void TestSysInfoRead::Run(void) { ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - // vendor_id, unique_id, target_gfx_version, partition_id + // vendor_id, unique_id, target_gfx_version amdsmi_asic_info_t asic_info = {}; err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asic_info); if (err == AMDSMI_STATUS_NOT_SUPPORTED) { @@ -161,7 +161,6 @@ void TestSysInfoRead::Run(void) { "\t**amdsmi_dev_unique_id() is not supported" " on this machine" << std::endl; EXPECT_EQ(asic_info.target_graphics_version, std::numeric_limits::max()); - EXPECT_EQ(asic_info.partition_id, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED); @@ -172,12 +171,9 @@ void TestSysInfoRead::Run(void) { << asic_info.vendor_name << std::endl; std::cout << "\t**Target GFX version: " << std::dec << asic_info.target_graphics_version << "\n"; - std::cout << "\t**Partition ID: " << std::dec - << asic_info.partition_id << "\n"; } EXPECT_EQ(err, AMDSMI_STATUS_SUCCESS); EXPECT_NE(asic_info.target_graphics_version, std::numeric_limits::max()); - EXPECT_NE(asic_info.partition_id, std::numeric_limits::max()); // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); diff --git a/projects/amdsmi/tests/python_unittest/integration_test.py b/projects/amdsmi/tests/python_unittest/integration_test.py index 9e1f12bbe3..7c829e87bc 100755 --- 
a/projects/amdsmi/tests/python_unittest/integration_test.py +++ b/projects/amdsmi/tests/python_unittest/integration_test.py @@ -511,8 +511,6 @@ def walk_through(self): asic_info['oam_id'])) print(" asic_info['target_graphics_version'] is: {}\n".format( asic_info['target_graphics_version'])) - print(" asic_info['partition_id'] is: {}\n".format( - asic_info['partition_id'])) print("\n###Test amdsmi_get_gpu_kfd_info \n") kfd_info = amdsmi.amdsmi_get_gpu_kfd_info(processors[i]) print(" kfd_info['kfd_id'] is: {}\n".format(