[SWDEV-499029] Fix inability to change memory partition modes

Changes:
  * [API] Removed the board-name check, which fixes this call on other MI ASICs
  * [CLI] Increased the progress-bar duration for changing memory partition
    modes to 140 seconds, since driver reload time varies per system

Change-Id: Ifcaf40d28b4adf5eaa800c9e3748d33749dc414a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/rocm_smi_lib commit: d04cec7f1d]
Αυτή η υποβολή περιλαμβάνεται σε:
Charis Poag
2024-11-21 23:28:03 -06:00
γονέας 21d3a831d7
υποβολή 06fead7e41
3 αρχεία άλλαξαν με 46 προσθήκες και 47 διαγραφές
@@ -1867,7 +1867,7 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond):
(', '.join(map(str, memory_partition_type_l))) ))
return (None, None)
kTimeWait = 40
kTimeWait = 140
t1 = multiprocessing.Process(target=showProgressbar,
args=("Updating memory partition",kTimeWait,))
t1.start()
@@ -3889,7 +3889,7 @@ def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond):
******WARNING******\n
Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.
ROCm SMI will then attempt to change memory (NPS) partition mode.
Upon a successful set, ROCm SMI will then initiate an action to restart amdgpu driver.
Upon a successful set, ROCm SMI will then initiate an action to restart AMD GPU driver.
This action will change all GPU's in the hive to the requested memory (NPS) partition mode.
Please use this utility with caution.
@@ -4912,6 +4912,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
const int k1000_MS_WAIT = 1000;
const uint32_t kMaxBoardLength = 128;
bool isCorrectDevice = false;
char boardName[kMaxBoardLength];
@@ -4925,32 +4926,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
char current_memory_mode[kMaxCurrentMemoryMode];
current_memory_mode[0] = '\0';
// rsmi_dev_memory_partition_set is only available for the discrete variant;
// others are required to update through BIOS settings
rsmi_dev_name_get(dv_ind, boardName, static_cast<size_t>(kMaxBoardLength));
std::string myBoardName = boardName;
if (!myBoardName.empty()) {
std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(),
::tolower);
if (myBoardName.find("mi") != std::string::npos &&
myBoardName.find("00x") != std::string::npos) {
isCorrectDevice = true;
}
}
if (!isCorrectDevice) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Fail "
<< " | Device #: " << dv_ind
<< " | Type: "
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: device board name does not support this action"
<< " | Returning = "
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false);
LOG_ERROR(ss);
return RSMI_STATUS_NOT_SUPPORTED;
}
// Is the current mode already what user requested?
switch (memory_partition) {
@@ -5086,6 +5061,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Returning = "
<< getRSMIStatusString(restartRet, false);
LOG_TRACE(ss);
if (restartRet != RSMI_STATUS_SUCCESS) {
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
@@ -5103,10 +5079,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
std::string current_memory_mode_str = "unknown";
rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR;
int maxWaitSeconds = 10;
const int k1000_MS_WAIT = 1000;
// wait until we can read SYSFS again
if (restartRet == RSMI_STATUS_SUCCESS) {
while (current_memory_mode_str != user_requested_memory_partition) {
while ((current_memory_mode_str != user_requested_memory_partition)
&& maxWaitSeconds > 0) {
maxWaitSeconds -= 1;
can_read_sysfs_again =
rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode);
@@ -5122,6 +5098,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< " | Data (user requested mode): " << user_requested_memory_partition
<< " | Current Memory Partition Mode: " << current_memory_mode_str
<< " | Available Memory Partition Modes: " << memory_capabilities_str
<< " | maxWaitSeconds: " << maxWaitSeconds
<< " | total wait time (sec): " << (10 - maxWaitSeconds)
<< " | Returning = "
<< getRSMIStatusString(can_read_sysfs_again, false);
@@ -1391,38 +1391,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
bool restartInProgress = true;
bool isRestartInProgress = true;
bool isAMDGPUModuleLive = false;
bool restartGDM = false;
std::string captureRestartErr;
const int kTimeToWaitForDriverMSec = 1000;
// sudo systemctl is-active gdm
// we do not care about the success of checking if gdm is active
std::tie(success, out) = executeCommand("systemctl is-active gdm");
(out == "active") ? (restartSuccessful &= success) :
(restartSuccessful = true);
std::tie(success, out) = executeCommand("systemctl is-active gdm", true);
(out == "active") ? (restartGDM = true) : (restartGDM = false);
ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
// if gdm is active -> sudo systemctl stop gdm
// TODO(AMD_SMI_team): are there other display managers we need to take into account?
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
if (success && (out == "active")) {
if (success && (out == "active") && (restartGDM)) {
wasGdmServiceActive = true;
std::tie(success, out) = executeCommand("systemctl stop gdm&", false);
restartSuccessful &= success;
std::tie(success, out) = executeCommand("systemctl stop gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
} else {
success = true; // ignore failures to restart gdm
}
ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
// sudo modprobe -r amdgpu
// sudo modprobe amdgpu
std::tie(success, out) =
executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true);
std::tie(success, out) = executeCommand(
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
restartSuccessful &= success;
captureRestartErr = out;
if (success) {
restartSuccessful = false;
}
ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
<< out << "; success = " << (success ? "True" : "False")
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
<< "; captureRestartErr = " << captureRestartErr;
LOG_INFO(ss);
// if gdm was active -> sudo systemctl start gdm
if (wasGdmServiceActive) {
std::tie(success, out) = executeCommand("systemctl start gdm&", false);
restartSuccessful &= success;
// We don't care if successful or not, just try to restart as a courtesy
if (wasGdmServiceActive && restartGDM) {
std::tie(success, out) = executeCommand("systemctl start gdm&", true);
ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
<< out << "; success = " << (success ? "True" : "False");
LOG_INFO(ss);
}
// Return early if there was an issue restarting amdgpu
@@ -1436,7 +1454,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
// wait for amdgpu module to come back up
rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress,
&isAMDGPUModuleLive);
const int kTimeToWaitForDriverMSec = 1000;
int maxLoops = 10; // wait a max of 10 sec
while (status != RSMI_STATUS_SUCCESS) {
maxLoops -= 1;
@@ -1467,7 +1484,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
// wait for amdgpu module to come back up
std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true);
ss << __PRETTY_FUNCTION__
<< " | success = " << success
<< " | success = " << (success ? "True" : "False")
<< " | out = " << out;
LOG_DEBUG(ss);
if ((success == true) && (!out.empty())) {
@@ -1478,6 +1495,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
}
*isRestartInProgress = deviceRestartInProgress;
*isAMDGPUModuleLive = isSystemAMDGPUModuleLive;
ss << __PRETTY_FUNCTION__
<< " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
<< " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
<< " | out = " << out;
LOG_DEBUG(ss);
return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_AMDGPU_RESTART_ERR);