From e014342896248edcaa0f88a7ff70b91a42b207b7 Mon Sep 17 00:00:00 2001 From: "Poag, Charis" Date: Thu, 20 Mar 2025 17:23:01 -0500 Subject: [PATCH] [SWDEV-513807] Fix amd-smi partition --accelerator not returning AMDSMI_STATUS_NO_PERM (#192) * [SWDEV-513807] Fix amd-smi partition --accelerator not returning AMDSMI_STATUS_NO_PERM Changes: - Fixed amdsmi_get_gpu_accelerator_partition_profile_config() from not returning AMDSMI_STATUS_NO_PERM - Changed amd-smi partition --accelerator to provide users with a warning if they do not use sudo or root permissions. - Updated changelog for fixes planned for 6.4.1 release Signed-off-by: Charis Poag [ROCm/amdsmi commit: 0402bb4d7597bffde0c8f4142865adc50a4eae86] --- projects/amdsmi/CHANGELOG.md | 109 +++++++++++++++++- projects/amdsmi/amdsmi_cli/amdsmi_commands.py | 12 +- projects/amdsmi/amdsmi_cli/amdsmi_logger.py | 5 + projects/amdsmi/include/amd_smi/amdsmi.h | 3 +- projects/amdsmi/src/amd_smi/amd_smi.cc | 3 + 5 files changed, 127 insertions(+), 5 deletions(-) diff --git a/projects/amdsmi/CHANGELOG.md b/projects/amdsmi/CHANGELOG.md index 7d637843aa..c3be9bc037 100644 --- a/projects/amdsmi/CHANGELOG.md +++ b/projects/amdsmi/CHANGELOG.md @@ -18,8 +18,101 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - Increasing available JPEG engines to 40. Current ASICs may not support all 40. These will be indicated as UINT16_MAX or N/A in CLI. + ## amd_smi_lib for ROCm 6.4.1 +### Added + +- **Added `amdsmi_get_power_info_v2()` with `sensor_ind`**. + +### Changed + +- **Changed amd-smi partition --accelerator & `amdsmi_get_gpu_accelerator_partition_profile_config()` to detect users running without root/sudo privileges** + - Updated `amdsmi_get_gpu_accelerator_partition_profile_config()` to return `AMDSMI_STATUS_NO_PERM` immediately + if users run without root/sudo permissions. 
+ - Updated `amd-smi partition --accelerator` to provide a warning for users without root/sudo permissions (see example below, ***output subject to change***). +```shell +$ amd-smi partition --accelerator + +ACCELERATOR_PARTITION_PROFILES: + +*************************************************************************** +** WARNING: ** +** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. ** +** Please run the command with sudo permissions to get accurate results. ** +*************************************************************************** + +GPU_ID PROFILE_INDEX MEMORY_PARTITION_CAPS ACCELERATOR_TYPE PARTITION_ID NUM_PARTITIONS NUM_RESOURCES RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A +N/A N/A N/A N/A 0 N/A N/A N/A N/A N/A N/A + +ACCELERATOR_PARTITION_RESOURCES: +RESOURCE_INDEX RESOURCE_TYPE RESOURCE_INSTANCES RESOURCES_SHARED +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A +N/A N/A N/A N/A + + +Legend: + * = Current mode +``` + +- **Changed `amd-smi partition --current`, `amd-smi partition --accelerator`, and `amdsmi_get_gpu_accelerator_partition_profile()` to display partition ID for each individual partition** + - Host will continue to display in the full array format, they do not display the individual partitions as Baremetal/Guest setups. + - Baremetal and Guest MI3x setups will change to +reflect each individual partition ID, now provided in `partition_id[0]` location (as seen in other amd-smi CLI commands). 
+This change was needed for BM/Guest setups due to other related partition outputs seen in (`amd-smi list` and `amd-smi static --partition`) and individual logical partition devices displayed. ***See examples below for reference.*** + +Previous output: +```shell +$ amd-smi partition --current + +CURRENT_PARTITION: +GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID +0 NPS1 CPX 3 0,1,2,3,4,5,6,7 +1 NPS1 CPX 3 N/A +2 NPS1 CPX 3 N/A +3 NPS1 CPX 3 N/A +4 NPS1 CPX 3 N/A +5 NPS1 CPX 3 N/A +6 NPS1 CPX 3 N/A +7 NPS1 CPX 3 N/A +8 NPS1 CPX 3 0,1,2,3,4,5,6,7 +9 NPS1 CPX 3 N/A +10 NPS1 CPX 3 N/A +... +``` + +New output: +```shell +amd-smi partition --current +CURRENT_PARTITION: +GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID +0 NPS1 CPX 3 0 +1 NPS1 CPX 3 1 +2 NPS1 CPX 3 2 +3 NPS1 CPX 3 3 +4 NPS1 CPX 3 4 +5 NPS1 CPX 3 5 +6 NPS1 CPX 3 6 +7 NPS1 CPX 3 7 +8 NPS1 CPX 3 0 +9 NPS1 CPX 3 1 +10 NPS1 CPX 3 2 +... +``` + ### Removed - **Removed `sensor_ind` in `amdsmi_get_power_info()` for backwards compatibility**. @@ -27,9 +120,21 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr - Python API still accepts `sensor_ind` as an optional argument - Changed AMDSMI version from 25.2 to 25.3 -### Added +### Optimized -- **Added `amdsmi_get_power_info_v2()` with `sensor_ind`**. 
+- N/A + +### Resolved issues + +- **Fixed `amd-smi static --partition` for guest systems with MIx ASICs being unable to run** + +### Upcoming changes + +- N/A + +### Known issues + +- N/A ## amd_smi_lib for ROCm 6.4.0 diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py index 02b7401734..5f1bbdc879 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_commands.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_commands.py @@ -5882,7 +5882,7 @@ class AMDSMICommands(): tabular_output.append(tabular_output_dict) self.logger.multiple_device_output = tabular_output - self.logger.table_title = "\nCURRENT_PARTITION" + self.logger.table_title = "CURRENT_PARTITION" self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) self.logger.clear_multiple_devices_output() @@ -6054,8 +6054,18 @@ class AMDSMICommands(): self.logger.multiple_device_output = tabular_output self.logger.table_title = "\nACCELERATOR_PARTITION_PROFILES" + # only display warning message if not running as root or with sudo + if os.geteuid() != 0: + self.logger.warning_message = """ +*************************************************************************** +** WARNING: ** +** ACCELERATOR_PARTITION_PROFILES requires sudo/root permissions to run. ** +** Please run the command with sudo permissions to get accurate results. 
** +*************************************************************************** +""" self.logger.print_output(multiple_device_enabled=True, tabular=True, dynamic=True) self.logger.clear_multiple_devices_output() + self.logger.warning_message = "" # clear the warning message ######################################### # print accelerator partition resources # diff --git a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py index a056360ea8..ec50ece980 100644 --- a/projects/amdsmi/amdsmi_cli/amdsmi_logger.py +++ b/projects/amdsmi/amdsmi_cli/amdsmi_logger.py @@ -40,6 +40,7 @@ class AMDSMILogger(): self.table_header = "" self.secondary_table_title = "" self.secondary_table_header = "" + self.warning_message = "" self.helpers = AMDSMIHelpers() @@ -827,6 +828,8 @@ class AMDSMILogger(): primary_table_heading = '' if self.table_title: primary_table_heading = self.table_title + ':\n' + if self.warning_message: # Add warning message below the table title + primary_table_heading += self.warning_message + '\n' primary_table_heading += self.table_header + '\n' primary_table = primary_table_heading + primary_table @@ -884,6 +887,8 @@ class AMDSMILogger(): primary_table_heading = '' if self.table_title: primary_table_heading = self.table_title + ':\n' + if self.warning_message: # Add warning message below the table title + primary_table_heading += self.warning_message + '\n' primary_table_heading += self.table_header + '\n' primary_table = primary_table_heading + primary_table diff --git a/projects/amdsmi/include/amd_smi/amdsmi.h b/projects/amdsmi/include/amd_smi/amdsmi.h index 08e6203f5d..0e5dcad53d 100644 --- a/projects/amdsmi/include/amd_smi/amdsmi.h +++ b/projects/amdsmi/include/amd_smi/amdsmi.h @@ -5536,8 +5536,7 @@ amdsmi_set_gpu_memory_partition_mode(amdsmi_processor_handle processor_handle, /** * @brief Version 2.0: Returns gpu accelerator partition caps as currently configured in the system * User must use admin/sudo privledges to run 
this API, or API will not be able to - * read resources. Otherwise, API will fill in the structure with as much information as - * it can. + * read resources. * * @ingroup tagAcceleratorPartition * diff --git a/projects/amdsmi/src/amd_smi/amd_smi.cc b/projects/amdsmi/src/amd_smi/amd_smi.cc index 7b0af47977..101e03dedb 100644 --- a/projects/amdsmi/src/amd_smi/amd_smi.cc +++ b/projects/amdsmi/src/amd_smi/amd_smi.cc @@ -2003,6 +2003,9 @@ amdsmi_status_t amdsmi_get_gpu_accelerator_partition_profile_config(amdsmi_processor_handle processor_handle, amdsmi_accelerator_partition_profile_config_t *profile_config) { AMDSMI_CHECK_INIT(); + if (!amd::smi::is_sudo_user()) { + return AMDSMI_STATUS_NO_PERM; + } std::ostringstream ss; ss << __PRETTY_FUNCTION__ << " | START ";