From 7d061f9ae475c9c230b09ff2ee688b2a9288fe3d Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 21 Nov 2024 22:35:43 -0600 Subject: [PATCH] [SWDEV-499029] Fix unable to change memory partition modes Changes: * [API] Removed checking board name, fixes for other MI ASICs * [API] Fixed unable to restart AMD GPU, libdrm blocked doing this operation * [API] Added ability to unload/reload libdrm from within AMD SMI APIs * [CLI] Increased progress bar to change memory partition modes to 140 seconds, since driver reload is variable per system Change-Id: I52f227f2ab850c4a6332ff3ecdc899903b1080f1 Signed-off-by: Charis Poag --- amdsmi_cli/amdsmi_commands.py | 118 +++++++++++++++------- amdsmi_cli/amdsmi_helpers.py | 2 +- include/amd_smi/impl/amd_smi_lib_loader.h | 1 + include/amd_smi/impl/amd_smi_system.h | 5 + rocm_smi/src/rocm_smi.cc | 33 +----- rocm_smi/src/rocm_smi_device.cc | 60 +++++++---- src/amd_smi/amd_smi.cc | 59 ++++++++++- src/amd_smi/amd_smi_lib_loader.cc | 21 ++-- 8 files changed, 207 insertions(+), 92 deletions(-) diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py index 9aabb2f9c0..79971db3a4 100644 --- a/amdsmi_cli/amdsmi_commands.py +++ b/amdsmi_cli/amdsmi_commands.py @@ -3978,18 +3978,20 @@ class AMDSMICommands(): raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}") if args.memory_partition: + lock = multiprocessing.Lock() + lock.acquire() #################################################################### # Get current and available memory partition modes # # Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar # #################################################################### try: - memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu) # this info likely actually comes from different apis than used here + memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) # this info likely actually comes from different apis than used here except amdsmi_exception.AmdSmiLibraryException as e: memory_partition = "N/A" logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info()) try: mem_caps_str = "N/A" - partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu) + partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu) temp_mem_caps = partition_dict['partition_profile']['memory_caps'] mem_caps = temp_mem_caps.nps_cap_mask if temp_mem_caps.amdsmi_nps_flags_t == None: @@ -4032,48 +4034,92 @@ class AMDSMICommands(): # 2) Requested mode is a valid mode to set # 3) Current is not already the requested mode # otherwise function will return fast + else: + showProgressBar = False + threads = [] - kTimeWait = 40 + k140secs = 140 + string_out = f"Updating memory partition for gpu {gpu_id}" + timesToRetryRestartErr = 1 + self.helpers.increment_set_count() set_count = self.helpers.get_set_count() if set_count == 1: # only show reload warning on 1st set self.helpers.confirm_changing_memory_partition_gpu_reload_warning() - memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] - try: - if set_count == 1 and showProgressBar: # only show reload warning on 1st set - string_out = f"Updating memory partition for gpu {gpu_id}" - t1 = multiprocessing.Process(target=self.helpers.showProgressbar, - args=(string_out, kTimeWait,)) - threads.append(t1) - t1.start() - amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition) - for thread in threads: - thread.terminate() - thread.join() - except amdsmi_exception.AmdSmiLibraryException as e: - f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) - print("\n\n", end='\r', flush=True, file=f) - for thread in threads: - thread.join() - thread.terminate() + while timesToRetryRestartErr >= 0: + timesToRetryRestartErr -= 1 + try: + if showProgressBar: # only show reload warning on 1st set + t1 = multiprocessing.Process(target=self.helpers.showProgressbar, + args=(string_out, k140secs,)) + threads.append(t1) + t1.start() + memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition] + amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition) + for thread in threads: + thread.terminate() + print("") + break # successful case - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: - raise PermissionError('Command requires elevation') from e - if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL: - print(f"[amdsmi_wrapper.AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}") - print(f"Valid Memory partition Modes: {mem_caps_str}\n") - # fall through for value error + except amdsmi_exception.AmdSmiLibraryException as e: + f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) + print("\n\n", end='\r', flush=True, file=f) + for thread in threads: + thread.terminate() - f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) - print("\n\n", end='\r', flush=True, file=f) - raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e - except Exception as e: - for thread in threads: - thread.join() - thread.terminate() - raise ValueError(f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e - self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}") + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM: + raise PermissionError('Command requires elevation') from e + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL: + out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}" + print(f"Valid Memory partition Modes: {mem_caps_str}\n") + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_ouput() + lock.release() + return + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED: + out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition} on {gpu_string}" + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_ouput() + lock.release() + return + if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR: + # Try again on a failure -> work around for not being able to close libdrm + string_out = f"Trying again - Updating memory partition for gpu {gpu_id}" + for thread in threads: + thread.terminate() + thread.join() + if timesToRetryRestartErr < 0: + out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition} on {gpu_string}" + self.logger.store_output(args.gpu, 'memory_partition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_ouput() + return + continue + + f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform) + print("\n\n", end='\r', flush=True, file=f) + out = f"Unable to set memory partition to {args.memory_partition} on {gpu_string}" + print(out) + self.logger.store_output(args.gpu, 'memorypartition', out) + self.logger.print_output() + self.logger.clear_multiple_devices_ouput() + lock.release() + return + except Exception as e: + for thread in threads: + thread.terminate() + out = f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}" + print(out) + lock.release() + raise ValueError(f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e + self.logger.store_output(args.gpu, 'memory_partition', f"Successfully set memory partition to {args.memory_partition}") + self.logger.print_output() + self.logger.clear_multiple_devices_ouput() + lock.release() + return if isinstance(args.power_cap, int): try: power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu) diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py index 6d2a6e4d2c..2fce5f4940 100644 --- a/amdsmi_cli/amdsmi_helpers.py +++ b/amdsmi_cli/amdsmi_helpers.py @@ -755,7 +755,7 @@ class AMDSMIHelpers(): ****** WARNING ******\n Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads. AMD SMI will then attempt to change memory (NPS) partition mode. - Upon a successful set, AMD SMI will then initiate an action to restart amdgpu driver. + Upon a successful set, AMD SMI will then initiate an action to restart AMD GPU driver. This action will change all GPU's in the hive to the requested memory (NPS) partition mode. Please use this utility with caution. diff --git a/include/amd_smi/impl/amd_smi_lib_loader.h b/include/amd_smi/impl/amd_smi_lib_loader.h index 79a489c070..a297d0290d 100644 --- a/include/amd_smi/impl/amd_smi_lib_loader.h +++ b/include/amd_smi/impl/amd_smi_lib_loader.h @@ -49,6 +49,7 @@ class AMDSmiLibraryLoader { private: void* libHandler_; std::mutex library_mutex_; + bool library_loaded_ = false; }; template amdsmi_status_t AMDSmiLibraryLoader::load_symbol( diff --git a/include/amd_smi/impl/amd_smi_system.h b/include/amd_smi/impl/amd_smi_system.h index c1bace0a42..e469c26263 100644 --- a/include/amd_smi/impl/amd_smi_system.h +++ b/include/amd_smi/impl/amd_smi_system.h @@ -57,6 +57,11 @@ class AMDSmiSystem { amdsmi_status_t get_cpu_family(uint32_t *cpu_family); amdsmi_status_t get_cpu_model(uint32_t *cpu_model); + + amdsmi_status_t clean_up_drm() { return drm_.cleanup();} + + amdsmi_status_t init_drm() { return drm_.init();} + private: AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {} diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc index 3f56bab73f..5f36e8d256 100644 --- a/rocm_smi/src/rocm_smi.cc +++ b/rocm_smi/src/rocm_smi.cc @@ -5703,6 +5703,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, LOG_TRACE(ss); REQUIRE_ROOT_ACCESS DEVICE_MUTEX + const int k1000_MS_WAIT = 1000; const uint32_t kMaxBoardLength = 128; bool isCorrectDevice = false; char boardName[kMaxBoardLength]; @@ -5716,32 +5717,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, char current_memory_mode[kMaxCurrentMemoryMode]; current_memory_mode[0] = '\0'; - // rsmi_dev_memory_partition_set is only available for for discrete variant, - // others are required to update through bios settings - rsmi_dev_name_get(dv_ind, boardName, static_cast(kMaxBoardLength)); - std::string myBoardName = boardName; - if (!myBoardName.empty()) { - std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(), - ::tolower); - if (myBoardName.find("mi") != std::string::npos && - myBoardName.find("00x") != std::string::npos) { - isCorrectDevice = true; - } - } - - if (!isCorrectDevice) { - ss << __PRETTY_FUNCTION__ - << " | ======= end ======= " - << " | Fail " - << " | Device #: " << dv_ind - << " | Type: " - << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition) - << " | Cause: device board name does not support this action" - << " | Returning = " - << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false); - LOG_ERROR(ss); - return RSMI_STATUS_NOT_SUPPORTED; - } // Is the current mode already what user requested? switch (memory_partition) { @@ -5877,6 +5852,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Returning = " << getRSMIStatusString(restartRet, false); LOG_TRACE(ss); + if (restartRet != RSMI_STATUS_SUCCESS) { ss << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -5894,10 +5870,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, std::string current_memory_mode_str = "unknown"; rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR; int maxWaitSeconds = 10; - const int k1000_MS_WAIT = 1000; // wait until we can read SYSFS again if (restartRet == RSMI_STATUS_SUCCESS) { - while (current_memory_mode_str != user_requested_memory_partition) { + while ((current_memory_mode_str != user_requested_memory_partition) + && maxWaitSeconds > 0) { maxWaitSeconds -= 1; can_read_sysfs_again = rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode); @@ -5913,6 +5889,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, << " | Data (user requested mode): " << user_requested_memory_partition << " | Current Memory Partition Mode: " << current_memory_mode_str << " | Available Memory Partition Modes: " << memory_capabilities_str + << " | maxWaitSeconds: " << maxWaitSeconds << " | total wait time (sec): " << (10 - maxWaitSeconds) << " | Returning = " << getRSMIStatusString(can_read_sysfs_again, false); diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc index 4aae96cc68..7621d2819c 100644 --- a/rocm_smi/src/rocm_smi_device.cc +++ b/rocm_smi/src/rocm_smi_device.cc @@ -1477,38 +1477,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { bool restartInProgress = true; bool isRestartInProgress = true; bool isAMDGPUModuleLive = false; + bool restartGDM = false; std::string captureRestartErr; + const int kTimeToWaitForDriverMSec = 1000; // sudo systemctl is-active gdm // we do not care about the success of checking if gdm is active - std::tie(success, out) = executeCommand("systemctl is-active gdm"); - (out == "active") ? (restartSuccessful &= success) : - (restartSuccessful = true); + std::tie(success, out) = executeCommand("systemctl is-active gdm", true); + (out == "active") ? (restartGDM = true) : (restartGDM = false); + ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); // if gdm is active -> sudo systemctl stop gdm - // TODO: are are there other display manager's we need to take into account? - // Search GNOME_Display_Manager on wikipedia - if (success && (out == "active")) { + // TODO(AMD_SMI_team): are are there other display manager's we need to take into account? + // see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB + if (success && (out == "active") && (restartGDM)) { wasGdmServiceActive = true; - std::tie(success, out) = executeCommand("systemctl stop gdm&", false); - restartSuccessful &= success; + std::tie(success, out) = executeCommand("systemctl stop gdm&", true); + ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); + } else { + success = true; // ignore failures to restart gdm } + ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = " + << out << "; success = " << (success ? "True" : "False") + << "; restartSuccessful = " << (restartSuccessful ? "True" : "False") + << "; captureRestartErr = " << captureRestartErr; + LOG_INFO(ss); + // sudo modprobe -r amdgpu // sudo modprobe amdgpu - std::tie(success, out) = - executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true); + std::tie(success, out) = executeCommand( + "modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true); restartSuccessful &= success; captureRestartErr = out; - - if (success) { - restartSuccessful = false; - } + ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = " + << out << "; success = " << (success ? "True" : "False") + << "; restartSuccessful = " << (restartSuccessful ? "True" : "False") + << "; captureRestartErr = " << captureRestartErr; + LOG_INFO(ss); // if gdm was active -> sudo systemctl start gdm - if (wasGdmServiceActive) { - std::tie(success, out) = executeCommand("systemctl start gdm&", false); - restartSuccessful &= success; + // We don't care if successful or not, just try to restart as a courtesy + if (wasGdmServiceActive && restartGDM) { + std::tie(success, out) = executeCommand("systemctl start gdm&", true); + ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = " + << out << "; success = " << (success ? "True" : "False"); + LOG_INFO(ss); } // Return early if there was an issue restarting amdgpu @@ -1522,7 +1540,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) { // wait for amdgpu module to come back up rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress, &isAMDGPUModuleLive); - const int kTimeToWaitForDriverMSec = 1000; int maxLoops = 10; // wait a max of 10 sec while (status != RSMI_STATUS_SUCCESS) { maxLoops -= 1; @@ -1553,7 +1570,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, // wait for amdgpu module to come back up std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true); ss << __PRETTY_FUNCTION__ - << " | success = " << success + << " | success = " << (success ? "True" : "False") << " | out = " << out; LOG_DEBUG(ss); if ((success == true) && (!out.empty())) { @@ -1564,6 +1581,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress, } *isRestartInProgress = deviceRestartInProgress; *isAMDGPUModuleLive = isSystemAMDGPUModuleLive; + ss << __PRETTY_FUNCTION__ + << " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False") + << " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False") + << " | out = " << out; + LOG_DEBUG(ss); return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS : RSMI_STATUS_AMDGPU_RESTART_ERR); diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index 7ac6b30e78..2f5cfd3821 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -53,6 +53,8 @@ #include "amd_smi/impl/amd_smi_processor.h" #include "rocm_smi/rocm_smi_logger.h" +// a global instance of std::mutex to protect data passed during threads +std::mutex myMutex; static bool initialized_lib = false; #define SIZE 10 @@ -1426,8 +1428,63 @@ amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle, amdsmi_memory_partition_type_t memory_partition) { AMDSMI_CHECK_INIT(); - return rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, + std::ostringstream ss; + std::lock_guard g(myMutex); + // open libdrm connections prevents the ability to unload driver + amd::smi::AMDSmiSystem::getInstance().clean_up_drm(); + ss << __PRETTY_FUNCTION__ << " | \n" + << "***********************************\n" + << "* Cleaned up - clean_up_drm() *\n" + << "***********************************\n"; + LOG_INFO(ss); + amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle, static_cast(memory_partition)); + if (ret == AMDSMI_STATUS_SUCCESS) { + const uint32_t k256 = 256; + char current_partition[k256]; + std::string current_partition_str = "UNKNOWN"; + std::string req_user_partition; + amdsmi_status_t ret_get = rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle, + current_partition, k256); + if (ret_get == AMDSMI_STATUS_SUCCESS) { + current_partition_str.clear(); + current_partition_str = current_partition; + } + switch (memory_partition) { + case AMDSMI_MEMORY_PARTITION_NPS1: + req_user_partition = "NPS1"; + break; + case AMDSMI_MEMORY_PARTITION_NPS2: + req_user_partition = "NPS2"; + break; + case AMDSMI_MEMORY_PARTITION_NPS4: + req_user_partition = "NPS4"; + break; + case AMDSMI_MEMORY_PARTITION_NPS8: + req_user_partition = "NPS8"; + break; + default: + req_user_partition = "UNKNOWN"; + break; + } + if (req_user_partition == current_partition_str) { + amd::smi::AMDSmiSystem::getInstance().init_drm(); + ss << __PRETTY_FUNCTION__ << " | \n" + << "***********************************\n" + << "* Initialized libdrm - init_drm() *\n" + << "***********************************\n"; + LOG_INFO(ss); + } + } + + // TODO(amdsmi_team): issue completely closing -> reopening libdrm on 1st try (workaround above) + // amd::smi::AMDSmiSystem::getInstance().init_drm(); + // ss << __PRETTY_FUNCTION__ << " | \n" + // << "***********************************\n" + // << "* Initialized libdrm - init_drm() *\n" + // << "***********************************\n"; + // LOG_INFO(ss); + return ret; } amdsmi_status_t diff --git a/src/amd_smi/amd_smi_lib_loader.cc b/src/amd_smi/amd_smi_lib_loader.cc index a1b8aaba73..b45cf5b1bc 100644 --- a/src/amd_smi/amd_smi_lib_loader.cc +++ b/src/amd_smi/amd_smi_lib_loader.cc @@ -33,18 +33,24 @@ amdsmi_status_t AMDSmiLibraryLoader::load(const char* filename) { if (filename == nullptr) { return AMDSMI_STATUS_FAIL_LOAD_MODULE; } - if (libHandler_) { + if (libHandler_ || library_loaded_) { unload(); } std::lock_guard guard(library_mutex_); - libHandler_ = dlopen(filename, RTLD_LAZY); - if (!libHandler_) { - char* error = dlerror(); - std::cerr << "Fail to open " << filename <<": " << error - << std::endl; - return AMDSMI_STATUS_FAIL_LOAD_MODULE; + // check if already loaded, return success if it is + // dlopen(filename, RTLD_NOLOAD) == null only IFF library is not loaded + void* isLibOpen = dlopen(filename, RTLD_NOLOAD); + if (isLibOpen == nullptr) { + libHandler_ = dlopen(filename, RTLD_LAZY); + if (!libHandler_) { + char* error = dlerror(); + std::cerr << "Fail to open " << filename <<": " << error + << std::endl; + return AMDSMI_STATUS_FAIL_LOAD_MODULE; + } } + library_loaded_ = true; return AMDSMI_STATUS_SUCCESS; } @@ -54,6 +60,7 @@ amdsmi_status_t AMDSmiLibraryLoader::unload() { if (libHandler_) { dlclose(libHandler_); libHandler_ = nullptr; + library_loaded_ = false; } return AMDSMI_STATUS_SUCCESS; }