From 1f8d9cb9efdbbb05dabf50fec974e8fd7d12a208 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Fri, 13 Oct 2023 01:41:14 -0500 Subject: [PATCH] Added memory & compute partitions to amd-smi lib Signed-off-by: Maisam Arif Change-Id: If3acea6ad281298f1f05785b2e6d8e70fae8d89b --- include/amd_smi/amdsmi.h | 302 +++++++++++++++++++++++---- py-interface/README.md | 187 +++++++++++++++++ py-interface/amdsmi_interface.py | 109 ++++++++++ py-interface/amdsmi_wrapper.py | 105 ++++++++-- rocm_smi/include/rocm_smi/rocm_smi.h | 26 +-- src/amd_smi/amd_smi.cc | 46 ++++ 6 files changed, 703 insertions(+), 72 deletions(-) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index d35b8621b6..4199547127 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -221,6 +221,45 @@ typedef enum { CLK_TYPE__MAX = CLK_TYPE_DCLK1 } amdsmi_clk_type_t; +/** + * @brief Compute Partition. This enum is used to identify + * various compute partitioning settings. + */ +typedef enum { + COMPUTE_PARTITION_INVALID = 0, + COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory +} amdsmi_compute_partition_type_t; + +/** + * @brief Memory Partitions. This enum is used to identify various + * memory partition types. + */ +typedef enum { + MEMORY_PARTITION_UNKNOWN = 0, + MEMORY_PARTITION_NPS1, //!< NPS1 - All CCD & XCD data is interleaved + //!< across all 8 HBM stacks (all stacks/1). + MEMORY_PARTITION_NPS2, //!< NPS2 - 2 sets of CCDs or 4 XCD interleaved + //!< across the 4 HBM stacks per AID pair + //!< (8 stacks/2).
+ MEMORY_PARTITION_NPS4, //!< NPS4 - Each XCD data is interleaved across + //!< 2 (or single) HBM stacks + //!< (8 stacks/8 or 8 stacks/4). + MEMORY_PARTITION_NPS8, //!< NPS8 - Each XCD uses a single HBM stack + //!< (8 stacks/8). Or each XCD uses a single + //!< HBM stack & CCDs share 2 non-interleaved + //!< HBM stacks on its AID + //!< (AID[1,2,3] = 6 stacks/6). +} amdsmi_memory_partition_type_t; + /** * @brief This enumeration is used to indicate from which part of the device a * temperature reading should be obtained. @@ -1525,7 +1564,8 @@ amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_hand * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, amdsmi_processor_handle* processor_handle); +amdsmi_status_t amdsmi_get_processor_handle_from_bdf(amdsmi_bdf_t bdf, + amdsmi_processor_handle* processor_handle); /** @} End DiscQueries */ @@ -1715,7 +1755,8 @@ amdsmi_get_gpu_subsystem_name(amdsmi_processor_handle processor_handle, char *na * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t -amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, amdsmi_pcie_bandwidth_t *bandwidth); +amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, + amdsmi_pcie_bandwidth_t *bandwidth); /** * @brief Get the unique PCI device identifier associated for a device @@ -1770,7 +1811,8 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id(amdsmi_processor_handle processor_handle, * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(amdsmi_processor_handle processor_handle, int32_t *numa_node); +amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(amdsmi_processor_handle processor_handle, + int32_t *numa_node); /** * @brief Get PCIe traffic information.
It is not supported on virtual machine guest @@ -1816,7 +1858,7 @@ amdsmi_status_t amdsmi_get_gpu_pci_throughput(amdsmi_processor_handle processor_ * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_pci_replay_counter(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_pci_replay_counter(amdsmi_processor_handle processor_handle, uint64_t *counter); /** @} End PCIeQuer */ @@ -1857,7 +1899,8 @@ amdsmi_status_t amdsmi_get_gpu_pci_replay_counter(amdsmi_processor_handle proce * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, uint64_t bw_bitmask); +amdsmi_status_t amdsmi_set_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, + uint64_t bw_bitmask); /** @} End PCIeCont */ @@ -2021,7 +2064,8 @@ amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_mem * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t -amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info); +amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, + amdsmi_retired_page_record_t *info); /** * @brief Returns RAS features info. @@ -2059,8 +2103,9 @@ amdsmi_status_t amdsmi_get_gpu_ras_feature_info( * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t -amdsmi_get_gpu_ras_block_features_enabled(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, - amdsmi_ras_err_state_t *state); +amdsmi_get_gpu_ras_block_features_enabled(amdsmi_processor_handle processor_handle, + amdsmi_gpu_block_t block, + amdsmi_ras_err_state_t *state); /** * @brief Get information about reserved ("retired") memory pages. 
It is not supported on @@ -2094,8 +2139,9 @@ amdsmi_get_gpu_ras_block_features_enabled(amdsmi_processor_handle processor_hand * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t -amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, uint32_t *num_pages, - amdsmi_retired_page_record_t *records); +amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, + uint32_t *num_pages, + amdsmi_retired_page_record_t *records); /** @} End MemQuer */ @@ -2127,8 +2173,8 @@ amdsmi_get_gpu_memory_reserved_pages(amdsmi_processor_handle processor_handle, u * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_fan_rpms(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, - int64_t *speed); +amdsmi_status_t amdsmi_get_gpu_fan_rpms(amdsmi_processor_handle processor_handle, + uint32_t sensor_ind, int64_t *speed); /** * @brief Get the fan speed for the specified device as a value relative to @@ -2208,7 +2254,7 @@ amdsmi_status_t amdsmi_get_gpu_fan_speed_max(amdsmi_processor_handle processor_h * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle, amdsmi_temperature_type_t sensor_type, amdsmi_temperature_metric_t metric, int64_t *temperature); @@ -2252,7 +2298,7 @@ amdsmi_status_t amdsmi_get_gpu_cache_info( * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle, amdsmi_voltage_type_t sensor_type, amdsmi_voltage_metric_t metric, int64_t *voltage); @@ -2354,7 +2400,8 @@ amdsmi_get_utilization_count(amdsmi_processor_handle 
processor_handle, * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_pcie_link_status(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info); +amdsmi_status_t amdsmi_get_pcie_link_status(amdsmi_processor_handle processor_handle, + amdsmi_pcie_info_t *info); /** * @brief Get max PCIe capabilities of the device with provided processor handle. @@ -2368,7 +2415,8 @@ amdsmi_status_t amdsmi_get_pcie_link_status(amdsmi_processor_handle processor_ha * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_pcie_link_caps(amdsmi_processor_handle processor_handle, amdsmi_pcie_info_t *info); +amdsmi_status_t amdsmi_get_pcie_link_caps(amdsmi_processor_handle processor_handle, + amdsmi_pcie_info_t *info); /** * @brief Get the performance level of the device. It is not supported on virtual @@ -2433,7 +2481,8 @@ amdsmi_set_gpu_perf_determinism_mode(amdsmi_processor_handle processor_handle, u * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_overdrive_level(amdsmi_processor_handle processor_handle, uint32_t *od); +amdsmi_status_t amdsmi_get_gpu_overdrive_level(amdsmi_processor_handle processor_handle, + uint32_t *od); /** * @brief Get the list of possible system clock speeds of device for a @@ -2454,7 +2503,7 @@ amdsmi_status_t amdsmi_get_gpu_overdrive_level(amdsmi_processor_handle processor * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, amdsmi_frequencies_t *f); /** @@ -2487,7 +2536,7 @@ amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle); * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ 
-amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_handle, amdsmi_od_volt_freq_data_t *odv); /** @@ -2508,7 +2557,7 @@ amdsmi_status_t amdsmi_get_gpu_od_volt_info(amdsmi_processor_handle processor_h * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_metrics_info(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_metrics_info(amdsmi_processor_handle processor_handle, amdsmi_gpu_metrics_t *pgpu_metrics); /** @@ -2531,9 +2580,10 @@ amdsmi_status_t amdsmi_get_gpu_metrics_info(amdsmi_processor_handle processor_h * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_gpu_clk_range(amdsmi_processor_handle processor_handle, uint64_t minclkvalue, - uint64_t maxclkvalue, - amdsmi_clk_type_t clkType); +amdsmi_status_t amdsmi_set_gpu_clk_range(amdsmi_processor_handle processor_handle, + uint64_t minclkvalue, + uint64_t maxclkvalue, + amdsmi_clk_type_t clkType); /** * @brief This function sets the clock frequency information. It is not supported on @@ -2555,9 +2605,10 @@ amdsmi_status_t amdsmi_set_gpu_clk_range(amdsmi_processor_handle processor_handl * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_handle, amdsmi_freq_ind_t level, - uint64_t clkvalue, - amdsmi_clk_type_t clkType); +amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_handle, + amdsmi_freq_ind_t level, + uint64_t clkvalue, + amdsmi_clk_type_t clkType); /** * @brief This function sets 1 of the 3 voltage curve points. 
It is not supported @@ -2578,8 +2629,10 @@ amdsmi_status_t amdsmi_set_gpu_od_clk_info(amdsmi_processor_handle processor_ha * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_handle, uint32_t vpoint, - uint64_t clkvalue, uint64_t voltvalue); +amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_handle, + uint32_t vpoint, + uint64_t clkvalue, + uint64_t voltvalue); /** * @brief This function will retrieve the current valid regions in the @@ -2616,7 +2669,7 @@ amdsmi_status_t amdsmi_set_gpu_od_volt_info(amdsmi_processor_handle processor_h * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_od_volt_curve_regions(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_od_volt_curve_regions(amdsmi_processor_handle processor_handle, uint32_t *num_regions, amdsmi_freq_volt_region_t *buffer); /** @@ -2720,7 +2773,7 @@ amdsmi_status_t * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor_handle, uint32_t od); +amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processor_handle, uint32_t od); /** * @brief Control the set of allowed frequencies that can be used for the @@ -2754,7 +2807,7 @@ amdsmi_status_t amdsmi_set_gpu_overdrive_level(amdsmi_processor_handle processo * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, uint64_t freq_bitmask); /** @} End PerfCont */ @@ -2811,7 +2864,7 @@ amdsmi_get_lib_version(amdsmi_version_t *version); * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on 
success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, amdsmi_error_count_t *ec); /** @@ -2838,7 +2891,7 @@ amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_hand * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_handle, +amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_handle, uint64_t *enabled_blocks); /** @@ -2863,7 +2916,8 @@ amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_ha * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, +amdsmi_status_t amdsmi_get_gpu_ecc_status(amdsmi_processor_handle processor_handle, + amdsmi_gpu_block_t block, amdsmi_ras_err_state_t *state); /** @@ -3317,8 +3371,10 @@ amdsmi_topo_get_link_weight(amdsmi_processor_handle processor_handle_src, amdsmi * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t - amdsmi_get_minmax_bandwidth_between_processors(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst, - uint64_t *min_bandwidth, uint64_t *max_bandwidth); + amdsmi_get_minmax_bandwidth_between_processors(amdsmi_processor_handle processor_handle_src, + amdsmi_processor_handle processor_handle_dst, + uint64_t *min_bandwidth, + uint64_t *max_bandwidth); /** * @brief Retrieve the hops and the connection type between 2 GPUs @@ -3366,11 +3422,181 @@ amdsmi_topo_get_link_type(amdsmi_processor_handle processor_handle_src, * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ amdsmi_status_t 
-amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst, - bool *accessible); +amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src, + amdsmi_processor_handle processor_handle_dst, + bool *accessible); /** @} End HWTopo */ +/*****************************************************************************/ +/** @defgroup compute_partition Compute Partition Functions + * These functions are used to configure and query the device's + * compute partition setting. + * @{ + */ + +/** + * @brief Retrieves the current compute partitioning for a desired device + * + * @details + * Given a processor handle @p processor_handle and a string @p compute_partition , + * and uint32 @p len , this function will attempt to obtain the device's + * current compute partition setting string. Upon successful retrieval, + * the obtained device's compute partition settings string shall be stored in + * the passed @p compute_partition char string variable. + * + * @param[in] processor_handle a processor handle + * + * @param[inout] compute_partition a pointer to a char string variable, + * which the device's current compute partition will be written to. + * + * @param[in] len the length of the caller provided buffer @p compute_partition + * , suggested length is 4 or greater. + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::AMDSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::AMDSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire compute partition value. In this case, + * only @p len bytes will be written.
+ * + */ +amdsmi_status_t +amdsmi_dev_compute_partition_get(amdsmi_processor_handle processor_handle, + char *compute_partition, uint32_t len); + +/** + * @brief Modifies a selected device's compute partition setting. + * + * @details Given a processor handle @p processor_handle and a type of compute partition + * @p compute_partition, this function will attempt to update the selected + * device's compute partition setting. + * + * @param[in] processor_handle a processor handle + * + * @param[in] compute_partition using enum ::amdsmi_compute_partition_type_t, + * define what the selected device's compute partition setting should be + * updated to. + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_PERMISSION function requires root access + * @retval ::AMDSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::AMDSMI_STATUS_SETTING_UNAVAILABLE the provided setting is + * unavailable for current device + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +amdsmi_status_t +amdsmi_dev_compute_partition_set(amdsmi_processor_handle processor_handle, + amdsmi_compute_partition_type_t compute_partition); + +/** + * @brief Reverts a selected device's compute partition setting back to its + * boot state. + * + * @details Given a processor handle @p processor_handle , this function will attempt to + * revert its compute partition setting back to its boot state.
+ * + * @param[in] processor_handle a processor handle + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_PERMISSION function requires root access + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * + */ +amdsmi_status_t amdsmi_dev_compute_partition_reset(amdsmi_processor_handle processor_handle); + +/** @} */ // end of compute_partition + +/*****************************************************************************/ +/** @defgroup memory_partition Memory Partition Functions + * These functions are used to query and set the device's current memory + * partition. + * @{ + */ + +/** + * @brief Retrieves the current memory partition for a desired device + * + * @details + * Given a processor handle @p processor_handle and a string @p memory_partition , + * and uint32 @p len , this function will attempt to obtain the device's + * memory partition string. Upon successful retrieval, the obtained device's + * memory partition string shall be stored in the passed @p memory_partition + * char string variable. + * + * @param[in] processor_handle a processor handle + * + * @param[inout] memory_partition a pointer to a char string variable, + * which the device's memory partition will be written to. + * + * @param[in] len the length of the caller provided buffer @p memory_partition , + * suggested length is 5 or greater. + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::AMDSMI_STATUS_UNEXPECTED_DATA data provided to function is not valid + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::AMDSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire memory partition value. In this case, + * only @p len bytes will be written.
+ * + */ +amdsmi_status_t +amdsmi_dev_memory_partition_get(amdsmi_processor_handle processor_handle, + char *memory_partition, uint32_t len); + +/** + * @brief Modifies a selected device's current memory partition setting. + * + * @details Given a processor handle @p processor_handle and a type of memory partition + * @p memory_partition, this function will attempt to update the selected + * device's memory partition setting. + * + * @param[in] processor_handle a processor handle + * + * @param[in] memory_partition using enum ::amdsmi_memory_partition_type_t, + * define what the selected device's memory partition setting should be updated to. + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_PERMISSION function requires root access + * @retval ::AMDSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::AMDSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart + * the amdgpu driver + * + */ +amdsmi_status_t +amdsmi_dev_memory_partition_set(amdsmi_processor_handle processor_handle, + amdsmi_memory_partition_type_t memory_partition); + +/** + * @brief Reverts a selected device's memory partition setting back to its + * boot state. + * + * @details Given a processor handle @p processor_handle , this function will attempt to + * revert its current memory partition setting back to its boot state.
+ * + * @param[in] processor_handle a processor handle + * + * @retval ::AMDSMI_STATUS_SUCCESS call was successful + * @retval ::AMDSMI_STATUS_PERMISSION function requires root access + * @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function + * @retval ::AMDSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart + * the amdgpu driver + * + */ +amdsmi_status_t amdsmi_dev_memory_partition_reset(amdsmi_processor_handle processor_handle); + +/** @} */ // end of memory_partition + /*****************************************************************************/ /** @defgroup EvntNotif Event Notification Functions * These functions are used to configure for and get asynchronous event diff --git a/py-interface/README.md b/py-interface/README.md index c57b63074c..25845075c1 100644 --- a/py-interface/README.md +++ b/py-interface/README.md @@ -3268,6 +3268,193 @@ except AmdSmiException as e: print(e) ``` + +### amdsmi_dev_compute_partition_get + +Description: Get the compute partition from the given GPU + +Input parameters: + +* `processor_handle` the device handle + +Output: String of the partition type + +Exceptions that can be thrown by `amdsmi_dev_compute_partition_get` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + compute_partition_type = amdsmi_dev_compute_partition_get(device) + print(compute_partition_type) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_dev_compute_partition_set + +Description: Set the compute partition to the given GPU + +Input parameters: + +* `processor_handle` the device handle +* `compute_partition` the type of compute_partition to set + +Output: None + +Exceptions that can be thrown by `amdsmi_dev_compute_partition_set` function: + +* `AmdSmiLibraryException`
+* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + compute_partition = AmdSmiComputePartitionType.SPX + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_dev_compute_partition_set(device, compute_partition) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_dev_compute_partition_reset + +Description: Reset the compute partitioning on the given GPU + +Input parameters: + +* `processor_handle` the device handle + +Output: None + +Exceptions that can be thrown by `amdsmi_dev_compute_partition_reset` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_dev_compute_partition_reset(device) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_dev_memory_partition_get + +Description: Get the memory partition from the given GPU + +Input parameters: + +* `processor_handle` the device handle + +Output: String of the partition type + +Exceptions that can be thrown by `amdsmi_dev_memory_partition_get` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + memory_partition_type = amdsmi_dev_memory_partition_get(device) + print(memory_partition_type) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_dev_memory_partition_set + +Description: Set the memory partition to the given GPU + +Input parameters: + +* `processor_handle` the device handle +* `memory_partition` the type of memory_partition to set + +Output: None + +Exceptions that can be thrown by
`amdsmi_dev_memory_partition_set` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + memory_partition = AmdSmiMemoryPartitionType.NPS1 + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_dev_memory_partition_set(device, memory_partition) +except AmdSmiException as e: + print(e) +``` + +### amdsmi_dev_memory_partition_reset + +Description: Reset the memory partitioning on the given GPU + +Input parameters: + +* `processor_handle` the device handle + +Output: None + +Exceptions that can be thrown by `amdsmi_dev_memory_partition_reset` function: + +* `AmdSmiLibraryException` +* `AmdSmiRetryException` +* `AmdSmiParameterException` + +Example: + +```python +try: + devices = amdsmi_get_processor_handles() + if len(devices) == 0: + print("No GPUs on machine") + else: + for device in devices: + amdsmi_dev_memory_partition_reset(device) +except AmdSmiException as e: + print(e) +``` + ### amdsmi_get_xgmi_info Description: Returns XGMI information for the GPU.
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py index 01ce67a793..b39b94d1b1 100644 --- a/py-interface/amdsmi_interface.py +++ b/py-interface/amdsmi_interface.py @@ -244,6 +244,23 @@ class AmdSmiVoltageType(IntEnum): INVALID = amdsmi_wrapper.AMDSMI_VOLT_TYPE_INVALID +class AmdSmiComputePartitionType(IntEnum): + CPX = amdsmi_wrapper.COMPUTE_PARTITION_CPX + SPX = amdsmi_wrapper.COMPUTE_PARTITION_SPX + DPX = amdsmi_wrapper.COMPUTE_PARTITION_DPX + TPX = amdsmi_wrapper.COMPUTE_PARTITION_TPX + QPX = amdsmi_wrapper.COMPUTE_PARTITION_QPX + INVALID = amdsmi_wrapper.COMPUTE_PARTITION_INVALID + + +class AmdSmiMemoryPartitionType(IntEnum): + NPS1 = amdsmi_wrapper.MEMORY_PARTITION_NPS1 + NPS2 = amdsmi_wrapper.MEMORY_PARTITION_NPS2 + NPS4 = amdsmi_wrapper.MEMORY_PARTITION_NPS4 + NPS8 = amdsmi_wrapper.MEMORY_PARTITION_NPS8 + UNKNOWN = amdsmi_wrapper.MEMORY_PARTITION_UNKNOWN + + class AmdSmiPowerProfilePresetMasks(IntEnum): CUSTOM_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_CUSTOM_MASK VIDEO_MASK = amdsmi_wrapper.AMDSMI_PWR_PROF_PRST_VIDEO_MASK @@ -1374,6 +1391,98 @@ def amdsmi_is_P2P_accessible( return accessible.value +def amdsmi_dev_compute_partition_get(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + length = ctypes.c_uint32() + length.value = _AMDSMI_STRING_LENGTH + + compute_partition = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) + + _check_res( + amdsmi_wrapper.amdsmi_dev_compute_partition_get( + processor_handle, compute_partition, length + ) + ) + + return compute_partition.value.decode("utf-8") + + +def amdsmi_dev_compute_partition_set(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + compute_partition: AmdSmiComputePartitionType): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise 
AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + if not isinstance(compute_partition, AmdSmiComputePartitionType): + raise AmdSmiParameterException(compute_partition, AmdSmiComputePartitionType) + + _check_res( + amdsmi_wrapper.amdsmi_dev_compute_partition_set( + processor_handle, compute_partition + ) + ) + + +def amdsmi_dev_compute_partition_reset(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + _check_res(amdsmi_wrapper.amdsmi_dev_compute_partition_reset(processor_handle)) + + +def amdsmi_dev_memory_partition_get(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + length = ctypes.c_uint32() + length.value = _AMDSMI_STRING_LENGTH + + memory_partition = ctypes.create_string_buffer(_AMDSMI_STRING_LENGTH) + + _check_res( + amdsmi_wrapper.amdsmi_dev_memory_partition_get( + processor_handle, memory_partition, length + ) + ) + + return memory_partition.value.decode("utf-8") + + +def amdsmi_dev_memory_partition_set(processor_handle: amdsmi_wrapper.amdsmi_processor_handle, + memory_partition: AmdSmiMemoryPartitionType): + if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + if not isinstance(memory_partition, AmdSmiMemoryPartitionType): + raise AmdSmiParameterException(memory_partition, AmdSmiMemoryPartitionType) + + _check_res( + amdsmi_wrapper.amdsmi_dev_memory_partition_set( + processor_handle, memory_partition + ) + ) + + +def amdsmi_dev_memory_partition_reset(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): + if not 
isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): + raise AmdSmiParameterException( + processor_handle, amdsmi_wrapper.amdsmi_processor_handle + ) + + _check_res(amdsmi_wrapper.amdsmi_dev_memory_partition_reset(processor_handle)) + + def amdsmi_get_xgmi_info(processor_handle: amdsmi_wrapper.amdsmi_processor_handle): if not isinstance(processor_handle, amdsmi_wrapper.amdsmi_processor_handle): raise AmdSmiParameterException( diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 0b702ace1b..447bcbd540 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -368,6 +368,38 @@ CLK_TYPE_DCLK1 = 9 CLK_TYPE__MAX = 9 amdsmi_clk_type_t = ctypes.c_uint32 # enum +# values for enumeration 'amdsmi_compute_partition_type_t' +amdsmi_compute_partition_type_t__enumvalues = { + 0: 'COMPUTE_PARTITION_INVALID', + 1: 'COMPUTE_PARTITION_CPX', + 2: 'COMPUTE_PARTITION_SPX', + 3: 'COMPUTE_PARTITION_DPX', + 4: 'COMPUTE_PARTITION_TPX', + 5: 'COMPUTE_PARTITION_QPX', +} +COMPUTE_PARTITION_INVALID = 0 +COMPUTE_PARTITION_CPX = 1 +COMPUTE_PARTITION_SPX = 2 +COMPUTE_PARTITION_DPX = 3 +COMPUTE_PARTITION_TPX = 4 +COMPUTE_PARTITION_QPX = 5 +amdsmi_compute_partition_type_t = ctypes.c_uint32 # enum + +# values for enumeration 'amdsmi_memory_partition_type_t' +amdsmi_memory_partition_type_t__enumvalues = { + 0: 'MEMORY_PARTITION_UNKNOWN', + 1: 'MEMORY_PARTITION_NPS1', + 2: 'MEMORY_PARTITION_NPS2', + 3: 'MEMORY_PARTITION_NPS4', + 4: 'MEMORY_PARTITION_NPS8', +} +MEMORY_PARTITION_UNKNOWN = 0 +MEMORY_PARTITION_NPS1 = 1 +MEMORY_PARTITION_NPS2 = 2 +MEMORY_PARTITION_NPS4 = 3 +MEMORY_PARTITION_NPS8 = 4 +amdsmi_memory_partition_type_t = ctypes.c_uint32 # enum + # values for enumeration 'amdsmi_temperature_type_t' amdsmi_temperature_type_t__enumvalues = { 0: 'TEMPERATURE_TYPE_EDGE', @@ -1737,6 +1769,24 @@ amdsmi_topo_get_link_type.argtypes = [amdsmi_processor_handle, amdsmi_processor_ amdsmi_is_P2P_accessible = 
_libraries['libamd_smi.so'].amdsmi_is_P2P_accessible amdsmi_is_P2P_accessible.restype = amdsmi_status_t amdsmi_is_P2P_accessible.argtypes = [amdsmi_processor_handle, amdsmi_processor_handle, ctypes.POINTER(ctypes.c_bool)] +amdsmi_dev_compute_partition_get = _libraries['libamd_smi.so'].amdsmi_dev_compute_partition_get +amdsmi_dev_compute_partition_get.restype = amdsmi_status_t +amdsmi_dev_compute_partition_get.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t] +amdsmi_dev_compute_partition_set = _libraries['libamd_smi.so'].amdsmi_dev_compute_partition_set +amdsmi_dev_compute_partition_set.restype = amdsmi_status_t +amdsmi_dev_compute_partition_set.argtypes = [amdsmi_processor_handle, amdsmi_compute_partition_type_t] +amdsmi_dev_compute_partition_reset = _libraries['libamd_smi.so'].amdsmi_dev_compute_partition_reset +amdsmi_dev_compute_partition_reset.restype = amdsmi_status_t +amdsmi_dev_compute_partition_reset.argtypes = [amdsmi_processor_handle] +amdsmi_dev_memory_partition_get = _libraries['libamd_smi.so'].amdsmi_dev_memory_partition_get +amdsmi_dev_memory_partition_get.restype = amdsmi_status_t +amdsmi_dev_memory_partition_get.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_char), uint32_t] +amdsmi_dev_memory_partition_set = _libraries['libamd_smi.so'].amdsmi_dev_memory_partition_set +amdsmi_dev_memory_partition_set.restype = amdsmi_status_t +amdsmi_dev_memory_partition_set.argtypes = [amdsmi_processor_handle, amdsmi_memory_partition_type_t] +amdsmi_dev_memory_partition_reset = _libraries['libamd_smi.so'].amdsmi_dev_memory_partition_reset +amdsmi_dev_memory_partition_reset.restype = amdsmi_status_t +amdsmi_dev_memory_partition_reset.argtypes = [amdsmi_processor_handle] amdsmi_init_gpu_event_notification = _libraries['libamd_smi.so'].amdsmi_init_gpu_event_notification amdsmi_init_gpu_event_notification.restype = amdsmi_status_t amdsmi_init_gpu_event_notification.argtypes = [amdsmi_processor_handle] @@ -1920,6 +1970,9 @@ 
__all__ = \ 'CLK_TYPE_DCLK1', 'CLK_TYPE_DF', 'CLK_TYPE_FIRST', 'CLK_TYPE_GFX', 'CLK_TYPE_MEM', 'CLK_TYPE_PCIE', 'CLK_TYPE_SOC', 'CLK_TYPE_SYS', 'CLK_TYPE_VCLK0', 'CLK_TYPE_VCLK1', 'CLK_TYPE__MAX', + 'COMPUTE_PARTITION_CPX', 'COMPUTE_PARTITION_DPX', + 'COMPUTE_PARTITION_INVALID', 'COMPUTE_PARTITION_QPX', + 'COMPUTE_PARTITION_SPX', 'COMPUTE_PARTITION_TPX', 'CONTAINER_DOCKER', 'CONTAINER_LXC', 'FW_ID_ASD', 'FW_ID_CP_CE', 'FW_ID_CP_ME', 'FW_ID_CP_MEC1', 'FW_ID_CP_MEC2', 'FW_ID_CP_MEC_JT1', 'FW_ID_CP_MEC_JT2', 'FW_ID_CP_MES', @@ -1947,21 +2000,30 @@ __all__ = \ 'FW_ID_SDMA6', 'FW_ID_SDMA7', 'FW_ID_SDMA_TH0', 'FW_ID_SDMA_TH1', 'FW_ID_SEC_POLICY_STAGE2', 'FW_ID_SMC', 'FW_ID_SMU', 'FW_ID_TA_RAS', 'FW_ID_UVD', 'FW_ID_VCE', 'FW_ID_VCN', - 'FW_ID_XGMI', 'FW_ID__MAX', 'NON_AMD_CPU', 'NON_AMD_GPU', - 'TEMPERATURE_TYPE_EDGE', 'TEMPERATURE_TYPE_FIRST', - 'TEMPERATURE_TYPE_HBM_0', 'TEMPERATURE_TYPE_HBM_1', - 'TEMPERATURE_TYPE_HBM_2', 'TEMPERATURE_TYPE_HBM_3', - 'TEMPERATURE_TYPE_HOTSPOT', 'TEMPERATURE_TYPE_JUNCTION', - 'TEMPERATURE_TYPE_PLX', 'TEMPERATURE_TYPE_VRAM', - 'TEMPERATURE_TYPE__MAX', 'UNKNOWN', 'VRAM_TYPE_DDR2', - 'VRAM_TYPE_DDR3', 'VRAM_TYPE_DDR4', 'VRAM_TYPE_GDDR1', - 'VRAM_TYPE_GDDR3', 'VRAM_TYPE_GDDR4', 'VRAM_TYPE_GDDR5', - 'VRAM_TYPE_GDDR6', 'VRAM_TYPE_HBM', 'VRAM_TYPE_UNKNOWN', - 'VRAM_TYPE__MAX', 'amd_metrics_table_header_t', - 'amdsmi_asic_info_t', 'amdsmi_bdf_t', 'amdsmi_bit_field_t', - 'amdsmi_board_info_t', 'amdsmi_clk_info_t', 'amdsmi_clk_type_t', - 'amdsmi_container_types_t', 'amdsmi_counter_command_t', - 'amdsmi_counter_value_t', 'amdsmi_dev_perf_level_t', + 'FW_ID_XGMI', 'FW_ID__MAX', 'MEMORY_PARTITION_NPS1', + 'MEMORY_PARTITION_NPS2', 'MEMORY_PARTITION_NPS4', + 'MEMORY_PARTITION_NPS8', 'MEMORY_PARTITION_UNKNOWN', + 'NON_AMD_CPU', 'NON_AMD_GPU', 'TEMPERATURE_TYPE_EDGE', + 'TEMPERATURE_TYPE_FIRST', 'TEMPERATURE_TYPE_HBM_0', + 'TEMPERATURE_TYPE_HBM_1', 'TEMPERATURE_TYPE_HBM_2', + 'TEMPERATURE_TYPE_HBM_3', 'TEMPERATURE_TYPE_HOTSPOT', + 
'TEMPERATURE_TYPE_JUNCTION', 'TEMPERATURE_TYPE_PLX', + 'TEMPERATURE_TYPE_VRAM', 'TEMPERATURE_TYPE__MAX', 'UNKNOWN', + 'VRAM_TYPE_DDR2', 'VRAM_TYPE_DDR3', 'VRAM_TYPE_DDR4', + 'VRAM_TYPE_GDDR1', 'VRAM_TYPE_GDDR3', 'VRAM_TYPE_GDDR4', + 'VRAM_TYPE_GDDR5', 'VRAM_TYPE_GDDR6', 'VRAM_TYPE_HBM', + 'VRAM_TYPE_UNKNOWN', 'VRAM_TYPE__MAX', + 'amd_metrics_table_header_t', 'amdsmi_asic_info_t', + 'amdsmi_bdf_t', 'amdsmi_bit_field_t', 'amdsmi_board_info_t', + 'amdsmi_clk_info_t', 'amdsmi_clk_type_t', + 'amdsmi_compute_partition_type_t', 'amdsmi_container_types_t', + 'amdsmi_counter_command_t', 'amdsmi_counter_value_t', + 'amdsmi_dev_compute_partition_get', + 'amdsmi_dev_compute_partition_reset', + 'amdsmi_dev_compute_partition_set', + 'amdsmi_dev_memory_partition_get', + 'amdsmi_dev_memory_partition_reset', + 'amdsmi_dev_memory_partition_set', 'amdsmi_dev_perf_level_t', 'amdsmi_driver_info_t', 'amdsmi_engine_usage_t', 'amdsmi_error_count_t', 'amdsmi_event_group_t', 'amdsmi_event_handle_t', 'amdsmi_event_type_t', @@ -2017,12 +2079,13 @@ __all__ = \ 'amdsmi_init_gpu_event_notification', 'amdsmi_io_link_type_t', 'amdsmi_is_P2P_accessible', 'amdsmi_is_gpu_power_management_enabled', - 'amdsmi_memory_page_status_t', 'amdsmi_memory_type_t', - 'amdsmi_mm_ip_t', 'amdsmi_od_vddc_point_t', - 'amdsmi_od_volt_curve_t', 'amdsmi_od_volt_freq_data_t', - 'amdsmi_pcie_bandwidth_t', 'amdsmi_pcie_info_t', - 'amdsmi_pcie_slot_type_t', 'amdsmi_power_cap_info_t', - 'amdsmi_power_info_t', 'amdsmi_power_profile_preset_masks_t', + 'amdsmi_memory_page_status_t', 'amdsmi_memory_partition_type_t', + 'amdsmi_memory_type_t', 'amdsmi_mm_ip_t', + 'amdsmi_od_vddc_point_t', 'amdsmi_od_volt_curve_t', + 'amdsmi_od_volt_freq_data_t', 'amdsmi_pcie_bandwidth_t', + 'amdsmi_pcie_info_t', 'amdsmi_pcie_slot_type_t', + 'amdsmi_power_cap_info_t', 'amdsmi_power_info_t', + 'amdsmi_power_profile_preset_masks_t', 'amdsmi_power_profile_status_t', 'amdsmi_power_type_t', 'amdsmi_proc_info_t', 'amdsmi_process_handle_t', 
'amdsmi_process_info_t', 'amdsmi_processor_handle', diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 67607fa186..fef797cc5f 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -377,16 +377,16 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in docs. typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -3902,7 +3902,7 @@ rsmi_is_P2P_accessible(uint32_t dv_ind_src, uint32_t dv_ind_dst, /** @} */ // end of HWTopo /*****************************************************************************/ -/** @defgroup ComputePartition Compute Partition Functions +/** @defgroup compute_partition Compute Partition Functions * These functions are used to configure and query the device's * compute parition setting. 
* @{ @@ -3983,10 +3983,10 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); -/** @} */ // end of ComputePartition +/** @} */ // end of compute_partition /*****************************************************************************/ -/** @defgroup memory_partition The Memory Partition Functions +/** @defgroup memory_partition Memory Partition Functions * These functions are used to query and set the device's current memory * partition. * @{ diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 39903358d6..9253722628 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -986,6 +986,52 @@ amdsmi_is_P2P_accessible(amdsmi_processor_handle processor_handle_src, return amd::smi::rsmi_to_amdsmi_status(rstatus); } +// Compute Partition functions +amdsmi_status_t +amdsmi_dev_compute_partition_get(amdsmi_processor_handle processor_handle, + char *compute_partition, uint32_t len) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_compute_partition_get, processor_handle, + compute_partition, len); +} + +amdsmi_status_t +amdsmi_dev_compute_partition_set(amdsmi_processor_handle processor_handle, + amdsmi_compute_partition_type_t compute_partition) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_compute_partition_set, processor_handle, + static_cast<rsmi_compute_partition_type_t>(compute_partition)); +} + +amdsmi_status_t +amdsmi_dev_compute_partition_reset(amdsmi_processor_handle processor_handle) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_compute_partition_reset, processor_handle); +} + +// Memory Partition functions +amdsmi_status_t +amdsmi_dev_memory_partition_get(amdsmi_processor_handle processor_handle, + char *memory_partition, uint32_t len) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, + memory_partition, len); +} + +amdsmi_status_t +amdsmi_dev_memory_partition_set(amdsmi_processor_handle processor_handle, + 
amdsmi_memory_partition_type_t memory_partition) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, + static_cast<rsmi_memory_partition_type_t>(memory_partition)); +} + +amdsmi_status_t +amdsmi_dev_memory_partition_reset(amdsmi_processor_handle processor_handle) { + AMDSMI_CHECK_INIT(); + return rsmi_wrapper(rsmi_dev_memory_partition_reset, processor_handle); +} + // TODO(bliu) : other xgmi related information amdsmi_status_t amdsmi_get_xgmi_info(amdsmi_processor_handle processor_handle, amdsmi_xgmi_info_t *info) {