[SWDEV-499029] Fix inability to change memory partition modes
Changes:
* [API] Removed the board name check, which fixes this for other MI ASICs
* [CLI] Increased the progress bar duration for changing memory partition modes
to 140 seconds, since driver reload time varies per system
Change-Id: Ifcaf40d28b4adf5eaa800c9e3748d33749dc414a
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/rocm_smi_lib commit: d04cec7f1d]
Αυτή η υποβολή περιλαμβάνεται σε:
@@ -1867,7 +1867,7 @@ def setMemoryPartition(deviceList, memoryPartition, autoRespond):
|
||||
(', '.join(map(str, memory_partition_type_l))) ))
|
||||
return (None, None)
|
||||
|
||||
kTimeWait = 40
|
||||
kTimeWait = 140
|
||||
t1 = multiprocessing.Process(target=showProgressbar,
|
||||
args=("Updating memory partition",kTimeWait,))
|
||||
t1.start()
|
||||
@@ -3889,7 +3889,7 @@ def confirmChangingMemoryPartitionAndReloadingAMDGPU(autoRespond):
|
||||
******WARNING******\n
|
||||
Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.
|
||||
ROCm SMI will then attempt to change memory (NPS) partition mode.
|
||||
Upon a successful set, ROCm SMI will then initiate an action to restart amdgpu driver.
|
||||
Upon a successful set, ROCm SMI will then initiate an action to restart AMD GPU driver.
|
||||
This action will change all GPU's in the hive to the requested memory (NPS) partition mode.
|
||||
|
||||
Please use this utility with caution.
|
||||
|
||||
@@ -4912,6 +4912,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
LOG_TRACE(ss);
|
||||
REQUIRE_ROOT_ACCESS
|
||||
DEVICE_MUTEX
|
||||
const int k1000_MS_WAIT = 1000;
|
||||
const uint32_t kMaxBoardLength = 128;
|
||||
bool isCorrectDevice = false;
|
||||
char boardName[kMaxBoardLength];
|
||||
@@ -4925,32 +4926,6 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
char current_memory_mode[kMaxCurrentMemoryMode];
|
||||
current_memory_mode[0] = '\0';
|
||||
|
||||
// rsmi_dev_memory_partition_set is only available for for discrete variant,
|
||||
// others are required to update through bios settings
|
||||
rsmi_dev_name_get(dv_ind, boardName, static_cast<size_t>(kMaxBoardLength));
|
||||
std::string myBoardName = boardName;
|
||||
if (!myBoardName.empty()) {
|
||||
std::transform(myBoardName.begin(), myBoardName.end(), myBoardName.begin(),
|
||||
::tolower);
|
||||
if (myBoardName.find("mi") != std::string::npos &&
|
||||
myBoardName.find("00x") != std::string::npos) {
|
||||
isCorrectDevice = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!isCorrectDevice) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: "
|
||||
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
|
||||
<< " | Cause: device board name does not support this action"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED, false);
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
// Is the current mode already what user requested?
|
||||
switch (memory_partition) {
|
||||
@@ -5086,6 +5061,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(restartRet, false);
|
||||
LOG_TRACE(ss);
|
||||
|
||||
if (restartRet != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -5103,10 +5079,10 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
std::string current_memory_mode_str = "unknown";
|
||||
rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR;
|
||||
int maxWaitSeconds = 10;
|
||||
const int k1000_MS_WAIT = 1000;
|
||||
// wait until we can read SYSFS again
|
||||
if (restartRet == RSMI_STATUS_SUCCESS) {
|
||||
while (current_memory_mode_str != user_requested_memory_partition) {
|
||||
while ((current_memory_mode_str != user_requested_memory_partition)
|
||||
&& maxWaitSeconds > 0) {
|
||||
maxWaitSeconds -= 1;
|
||||
can_read_sysfs_again =
|
||||
rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode);
|
||||
@@ -5122,6 +5098,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
<< " | Data (user requested mode): " << user_requested_memory_partition
|
||||
<< " | Current Memory Partition Mode: " << current_memory_mode_str
|
||||
<< " | Available Memory Partition Modes: " << memory_capabilities_str
|
||||
<< " | maxWaitSeconds: " << maxWaitSeconds
|
||||
<< " | total wait time (sec): " << (10 - maxWaitSeconds)
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(can_read_sysfs_again, false);
|
||||
|
||||
@@ -1391,38 +1391,56 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
|
||||
bool restartInProgress = true;
|
||||
bool isRestartInProgress = true;
|
||||
bool isAMDGPUModuleLive = false;
|
||||
bool restartGDM = false;
|
||||
std::string captureRestartErr;
|
||||
const int kTimeToWaitForDriverMSec = 1000;
|
||||
|
||||
// sudo systemctl is-active gdm
|
||||
// we do not care about the success of checking if gdm is active
|
||||
std::tie(success, out) = executeCommand("systemctl is-active gdm");
|
||||
(out == "active") ? (restartSuccessful &= success) :
|
||||
(restartSuccessful = true);
|
||||
std::tie(success, out) = executeCommand("systemctl is-active gdm", true);
|
||||
(out == "active") ? (restartGDM = true) : (restartGDM = false);
|
||||
ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
|
||||
<< out << "; success = " << (success ? "True" : "False");
|
||||
LOG_INFO(ss);
|
||||
|
||||
// if gdm is active -> sudo systemctl stop gdm
|
||||
// TODO(AMD_SMI_team): are there other display managers we need to take into account?
|
||||
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
|
||||
if (success && (out == "active")) {
|
||||
if (success && (out == "active") && (restartGDM)) {
|
||||
wasGdmServiceActive = true;
|
||||
std::tie(success, out) = executeCommand("systemctl stop gdm&", false);
|
||||
restartSuccessful &= success;
|
||||
std::tie(success, out) = executeCommand("systemctl stop gdm&", true);
|
||||
ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
|
||||
<< out << "; success = " << (success ? "True" : "False");
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
success = true; // ignore failures to restart gdm
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
|
||||
<< out << "; success = " << (success ? "True" : "False")
|
||||
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
|
||||
<< "; captureRestartErr = " << captureRestartErr;
|
||||
LOG_INFO(ss);
|
||||
|
||||
// sudo modprobe -r amdgpu
|
||||
// sudo modprobe amdgpu
|
||||
std::tie(success, out) =
|
||||
executeCommand("modprobe -r amdgpu && modprobe amdgpu&", true);
|
||||
std::tie(success, out) = executeCommand(
|
||||
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
|
||||
restartSuccessful &= success;
|
||||
captureRestartErr = out;
|
||||
|
||||
if (success) {
|
||||
restartSuccessful = false;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
|
||||
<< out << "; success = " << (success ? "True" : "False")
|
||||
<< "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
|
||||
<< "; captureRestartErr = " << captureRestartErr;
|
||||
LOG_INFO(ss);
|
||||
|
||||
// if gdm was active -> sudo systemctl start gdm
|
||||
if (wasGdmServiceActive) {
|
||||
std::tie(success, out) = executeCommand("systemctl start gdm&", false);
|
||||
restartSuccessful &= success;
|
||||
// We don't care if successful or not, just try to restart as a courtesy
|
||||
if (wasGdmServiceActive && restartGDM) {
|
||||
std::tie(success, out) = executeCommand("systemctl start gdm&", true);
|
||||
ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
|
||||
<< out << "; success = " << (success ? "True" : "False");
|
||||
LOG_INFO(ss);
|
||||
}
|
||||
|
||||
// Return early if there was an issue restarting amdgpu
|
||||
@@ -1436,7 +1454,6 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
|
||||
// wait for amdgpu module to come back up
|
||||
rsmi_status_t status = Device::isRestartInProgress(&isRestartInProgress,
|
||||
&isAMDGPUModuleLive);
|
||||
const int kTimeToWaitForDriverMSec = 1000;
|
||||
int maxLoops = 10; // wait a max of 10 sec
|
||||
while (status != RSMI_STATUS_SUCCESS) {
|
||||
maxLoops -= 1;
|
||||
@@ -1467,7 +1484,7 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
|
||||
// wait for amdgpu module to come back up
|
||||
std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | success = " << success
|
||||
<< " | success = " << (success ? "True" : "False")
|
||||
<< " | out = " << out;
|
||||
LOG_DEBUG(ss);
|
||||
if ((success == true) && (!out.empty())) {
|
||||
@@ -1478,6 +1495,11 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
|
||||
}
|
||||
*isRestartInProgress = deviceRestartInProgress;
|
||||
*isAMDGPUModuleLive = isSystemAMDGPUModuleLive;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
|
||||
<< " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
|
||||
<< " | out = " << out;
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS :
|
||||
RSMI_STATUS_AMDGPU_RESTART_ERR);
|
||||
|
||||
Αναφορά σε νέο ζήτημα
Block a user