[SWDEV-499029] Fix unable to change memory partition modes

Changes:
  * [API] Removed checking board name, fixes for other MI ASICs
  * [API] Fixed unable to restart AMD GPU, libdrm blocked
    doing this operation
  * [API] Added ability to unload/reload libdrm
    from within AMD SMI APIs
  * [CLI] Increased progress bar to change memory partition modes
    to 140 seconds, since driver reload is variable per system

Change-Id: I52f227f2ab850c4a6332ff3ecdc899903b1080f1
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Этот коммит содержится в:
Charis Poag
2024-11-21 22:35:43 -06:00
родитель 35d8e827b9
Коммит 7d061f9ae4
8 изменённых файлов: 207 добавлений и 92 удалений
+82 -36
Просмотреть файл
@@ -3978,18 +3978,20 @@ class AMDSMICommands():
raise ValueError(f"Unable to set compute partition to {args.compute_partition} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'computepartition', f"Successfully set compute partition to {args.compute_partition}")
if args.memory_partition:
lock = multiprocessing.Lock()
lock.acquire()
####################################################################
# Get current and available memory partition modes #
# Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar #
####################################################################
try:
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(gpu) # this info likely actually comes from different apis than used here
memory_partition = amdsmi_interface.amdsmi_get_gpu_memory_partition(args.gpu) # this info likely actually comes from different apis than used here
except amdsmi_exception.AmdSmiLibraryException as e:
memory_partition = "N/A"
logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info())
try:
mem_caps_str = "N/A"
partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(gpu)
partition_dict = amdsmi_interface.amdsmi_get_gpu_accelerator_partition_profile(args.gpu)
temp_mem_caps = partition_dict['partition_profile']['memory_caps']
mem_caps = temp_mem_caps.nps_cap_mask
if temp_mem_caps.amdsmi_nps_flags_t == None:
@@ -4032,48 +4034,92 @@ class AMDSMICommands():
# 2) Requested mode is a valid mode to set
# 3) Current is not already the requested mode
# otherwise function will return fast
else:
showProgressBar = False
threads = []
kTimeWait = 40
k140secs = 140
string_out = f"Updating memory partition for gpu {gpu_id}"
timesToRetryRestartErr = 1
self.helpers.increment_set_count()
set_count = self.helpers.get_set_count()
if set_count == 1: # only show reload warning on 1st set
self.helpers.confirm_changing_memory_partition_gpu_reload_warning()
memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
try:
if set_count == 1 and showProgressBar: # only show reload warning on 1st set
string_out = f"Updating memory partition for gpu {gpu_id}"
t1 = multiprocessing.Process(target=self.helpers.showProgressbar,
args=(string_out, kTimeWait,))
threads.append(t1)
t1.start()
amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition)
for thread in threads:
thread.terminate()
thread.join()
except amdsmi_exception.AmdSmiLibraryException as e:
f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform)
print("\n\n", end='\r', flush=True, file=f)
for thread in threads:
thread.join()
thread.terminate()
while timesToRetryRestartErr >= 0:
timesToRetryRestartErr -= 1
try:
if showProgressBar: # only show reload warning on 1st set
t1 = multiprocessing.Process(target=self.helpers.showProgressbar,
args=(string_out, k140secs,))
threads.append(t1)
t1.start()
memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition)
for thread in threads:
thread.terminate()
print("")
break # successful case
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL:
print(f"[amdsmi_wrapper.AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}")
print(f"Valid Memory partition Modes: {mem_caps_str}\n")
# fall through for value error
except amdsmi_exception.AmdSmiLibraryException as e:
f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform)
print("\n\n", end='\r', flush=True, file=f)
for thread in threads:
thread.terminate()
f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform)
print("\n\n", end='\r', flush=True, file=f)
raise ValueError(f"Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
except Exception as e:
for thread in threads:
thread.join()
thread.terminate()
raise ValueError(f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'memorypartition', f"Successfully set memory partition to {args.memory_partition}")
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
raise PermissionError('Command requires elevation') from e
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL:
out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition} on {gpu_string}"
print(f"Valid Memory partition Modes: {mem_caps_str}\n")
self.logger.store_output(args.gpu, 'memory_partition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_ouput()
lock.release()
return
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Device does not support setting memory partition to {args.memory_partition} on {gpu_string}"
self.logger.store_output(args.gpu, 'memory_partition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_ouput()
lock.release()
return
if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR:
# Try again on a failure -> work around for not being able to close libdrm
string_out = f"Trying again - Updating memory partition for gpu {gpu_id}"
for thread in threads:
thread.terminate()
thread.join()
if timesToRetryRestartErr < 0:
out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition} on {gpu_string}"
self.logger.store_output(args.gpu, 'memory_partition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_ouput()
return
continue
f = open(os.devnull, 'w') #redirect to /dev/null (crossplatform)
print("\n\n", end='\r', flush=True, file=f)
out = f"Unable to set memory partition to {args.memory_partition} on {gpu_string}"
print(out)
self.logger.store_output(args.gpu, 'memorypartition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_ouput()
lock.release()
return
except Exception as e:
for thread in threads:
thread.terminate()
out = f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}"
print(out)
lock.release()
raise ValueError(f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
self.logger.store_output(args.gpu, 'memory_partition', f"Successfully set memory partition to {args.memory_partition}")
self.logger.print_output()
self.logger.clear_multiple_devices_ouput()
lock.release()
return
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
+1 -1
Просмотреть файл
@@ -755,7 +755,7 @@ class AMDSMIHelpers():
****** WARNING ******\n
Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.
AMD SMI will then attempt to change memory (NPS) partition mode.
Upon a successful set, AMD SMI will then initiate an action to restart amdgpu driver.
Upon a successful set, AMD SMI will then initiate an action to restart AMD GPU driver.
This action will change all GPU's in the hive to the requested memory (NPS) partition mode.
Please use this utility with caution.
+1
Просмотреть файл
@@ -49,6 +49,7 @@ class AMDSmiLibraryLoader {
private:
void* libHandler_;
std::mutex library_mutex_;
bool library_loaded_ = false;
};
template<typename T> amdsmi_status_t AMDSmiLibraryLoader::load_symbol(
+5
Просмотреть файл
@@ -57,6 +57,11 @@ class AMDSmiSystem {
amdsmi_status_t get_cpu_family(uint32_t *cpu_family);
amdsmi_status_t get_cpu_model(uint32_t *cpu_model);
amdsmi_status_t clean_up_drm() { return drm_.cleanup();}
amdsmi_status_t init_drm() { return drm_.init();}
private:
AMDSmiSystem() : init_flag_(AMDSMI_INIT_AMD_GPUS) {}
+5 -28
Просмотреть файл
@@ -5703,6 +5703,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
const int k1000_MS_WAIT = 1000;
const uint32_t kMaxBoardLength = 128;
bool isCorrectDevice = false;
char boardName[kMaxBoardLength];
@@ -5716,32 +5717,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
char current_memory_mode[kMaxCurrentMemoryMode];
current_memory_mode[0] = '\0';
// rsmi_dev_memory_partition_set is only available for for discrete variant,
// others are required to update through bios settings
rsmi_dev_name_get(dv_ind, boardName, static_cast<size_t>(kMaxBoardLength));
std::string myBoardName = boardName;
if (!myBoardName.empty()) {
std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(),
::tolower);
if (myBoardName.find("mi") != std::string::npos &&
myBoardName.find("00x") != std::string::npos) {
isCorrectDevice = true;
}
}
if (!isCorrectDevice) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: device board name does not support this action"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false);
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
// Is the current mode already what user requested?
switch (memory_partition) {
@@ -5877,6 +5852,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Returning = "
<< getRSMIStatusString(restartRet, false);
LOG_TRACE(ss);
if (restartRet != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
@@ -5894,10 +5870,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
std::string current_memory_mode_str = "unknown";
rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR;
int maxWaitSeconds = 10;
const int k1000_MS_WAIT = 1000;
// wait until we can read SYSFS again
if (restartRet == RSMI_STATUS_SUCCESS) {
while (current_memory_mode_str != user_requested_memory_partition) {
while ((current_memory_mode_str != user_requested_memory_partition)
&& maxWaitSeconds > 0) {
maxWaitSeconds -= 1;
can_read_sysfs_again =
rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode);
@@ -5913,6 +5889,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Data (user requested mode): " << user_requested_memory_partition
<< " | Current Memory Partition Mode: " << current_memory_mode_str
<< " | Available Memory Partition Modes: " << memory_capabilities_str
<< " | maxWaitSeconds: " << maxWaitSeconds
<< " | total wait time (sec): " << (10 - maxWaitSeconds)
<< " | Returning = "
<< getRSMIStatusString(can_read_sysfs_again, false);
+41 -19
Просмотреть файл
@@ -1477,38 +1477,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
bool restartInProgress = true;
bool isRestartInProgress = true;
bool isAMDGPUModuleLive = false;
bool restartGDM = false;
std::string captureRestartErr;
const int kTimeToWaitForDriverMSec = 1000;
// sudo systemctl is-active gdm
// we do not care about the success of checking if gdm is active
std::tie(success, out) = executeCommand("systemctl is-active gdm");
(out == "active") ? (restartSuccessful &= success) :
(restartSuccessful = true);
std::tie(success, out) = executeCommand("systemctl is-active gdm", true);
(out == "active") ? (restartGDM = true) : (restartGDM = false);
ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
// if gdm is active -> sudo systemctl stop gdm
// TODO: are are there other display manager's we need to take into account?
// Search GNOME_Display_Manager on wikipedia
if (success && (out == "active")) {
// TODO(AMD_SMI_team): are are there other display manager's we need to take into account?
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
if (success && (out == "active") && (restartGDM)) {
wasGdmServiceActive = true;
std::tie(success, out) = executeCommand("systemctl stop gdm&", false);
restartSuccessful &= success;
std::tie(success, out) = executeCommand("systemctl stop gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
} else {
success = true; // ignore failures to restart gdm
}
ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
// sudo modprobe -r amdgpu
// sudo modprobe amdgpu
std::tie(success, out) =
executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true);
std::tie(success, out) = executeCommand(
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
restartSuccessful &= success;
captureRestartErr = out;
if (success) {
restartSuccessful = false;
}
ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
// if gdm was active -> sudo systemctl start gdm
if (wasGdmServiceActive) {
std::tie(success, out) = executeCommand("systemctl start gdm&", false);
restartSuccessful &= success;
// We don't care if successful or not, just try to restart as a courtesy
if (wasGdmServiceActive && restartGDM) {
std::tie(success, out) = executeCommand("systemctl start gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
}
// Return early if there was an issue restarting amdgpu
@@ -1522,7 +1540,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
// wait for amdgpu module to come back up
rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress,
&isAMDGPUModuleLive);
const int kTimeToWaitForDriverMSec = 1000;
int maxLoops = 10; // wait a max of 10 sec
while (status != RSMI_STATUS_SUCCESS) {
maxLoops -= 1;
@@ -1553,7 +1570,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
// wait for amdgpu module to come back up
std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true);
ss << __PRETTY_FUNCTION__
<< " | success = " << success
<< " | success = " << (success ? "True" : "False")
<< " | out = " << out;
LOG_DEBUG(ss);
if ((success == true) && (!out.empty())) {
@@ -1564,6 +1581,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
}
*isRestartInProgress = deviceRestartInProgress;
*isAMDGPUModuleLive = isSystemAMDGPUModuleLive;
ss << __PRETTY_FUNCTION__
<< " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
<< " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
<< " | out = " << out;
LOG_DEBUG(ss);
return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_AMDGPU_RESTART_ERR);
+58 -1
Просмотреть файл
@@ -53,6 +53,8 @@
#include "amd_smi/impl/amd_smi_processor.h"
#include "rocm_smi/rocm_smi_logger.h"
// a global instance of std::mutex to protect data passed during threads
std::mutex myMutex;
static bool initialized_lib = false;
#define SIZE 10
@@ -1426,8 +1428,63 @@ amdsmi_status_t
amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
amdsmi_memory_partition_type_t memory_partition) {
AMDSMI_CHECK_INIT();
return rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle,
std::ostringstream ss;
std::lock_guard<std::mutex> g(myMutex);
// open libdrm connections prevents the ability to unload driver
amd::smi::AMDSmiSystem::getInstance().clean_up_drm();
ss << __PRETTY_FUNCTION__ << " | \n"
<< "***********************************\n"
<< "* Cleaned up - clean_up_drm() *\n"
<< "***********************************\n";
LOG_INFO(ss);
amdsmi_status_t ret = rsmi_wrapper(rsmi_dev_memory_partition_set, processor_handle,
static_cast<rsmi_memory_partition_type_t>(memory_partition));
if (ret == AMDSMI_STATUS_SUCCESS) {
const uint32_t k256 = 256;
char current_partition[k256];
std::string current_partition_str = "UNKNOWN";
std::string req_user_partition;
amdsmi_status_t ret_get = rsmi_wrapper(rsmi_dev_memory_partition_get, processor_handle,
current_partition, k256);
if (ret_get == AMDSMI_STATUS_SUCCESS) {
current_partition_str.clear();
current_partition_str = current_partition;
}
switch (memory_partition) {
case AMDSMI_MEMORY_PARTITION_NPS1:
req_user_partition = "NPS1";
break;
case AMDSMI_MEMORY_PARTITION_NPS2:
req_user_partition = "NPS2";
break;
case AMDSMI_MEMORY_PARTITION_NPS4:
req_user_partition = "NPS4";
break;
case AMDSMI_MEMORY_PARTITION_NPS8:
req_user_partition = "NPS8";
break;
default:
req_user_partition = "UNKNOWN";
break;
}
if (req_user_partition == current_partition_str) {
amd::smi::AMDSmiSystem::getInstance().init_drm();
ss << __PRETTY_FUNCTION__ << " | \n"
<< "***********************************\n"
<< "* Initialized libdrm - init_drm() *\n"
<< "***********************************\n";
LOG_INFO(ss);
}
}
// TODO(amdsmi_team): issue completely closing -> reopening libdrm on 1st try (workaround above)
// amd::smi::AMDSmiSystem::getInstance().init_drm();
// ss << __PRETTY_FUNCTION__ << " | \n"
// << "***********************************\n"
// << "* Initialized libdrm - init_drm() *\n"
// << "***********************************\n";
// LOG_INFO(ss);
return ret;
}
amdsmi_status_t
+14 -7
Просмотреть файл
@@ -33,18 +33,24 @@ amdsmi_status_t AMDSmiLibraryLoader::load(const char* filename) {
if (filename == nullptr) {
return AMDSMI_STATUS_FAIL_LOAD_MODULE;
}
if (libHandler_) {
if (libHandler_ || library_loaded_) {
unload();
}
std::lock_guard<std::mutex> guard(library_mutex_);
libHandler_ = dlopen(filename, RTLD_LAZY);
if (!libHandler_) {
char* error = dlerror();
std::cerr << "Fail to open " << filename <<": " << error
<< std::endl;
return AMDSMI_STATUS_FAIL_LOAD_MODULE;
// check if already loaded, return success if it is
// dlopen(filename, RTLD_NOLOAD) == null only IFF library is not loaded
void* isLibOpen = dlopen(filename, RTLD_NOLOAD);
if (isLibOpen == nullptr) {
libHandler_ = dlopen(filename, RTLD_LAZY);
if (!libHandler_) {
char* error = dlerror();
std::cerr << "Fail to open " << filename <<": " << error
<< std::endl;
return AMDSMI_STATUS_FAIL_LOAD_MODULE;
}
}
library_loaded_ = true;
return AMDSMI_STATUS_SUCCESS;
}
@@ -54,6 +60,7 @@ amdsmi_status_t AMDSmiLibraryLoader::unload() {
if (libHandler_) {
dlclose(libHandler_);
libHandler_ = nullptr;
library_loaded_ = false;
}
return AMDSMI_STATUS_SUCCESS;
}