diff --git a/.github/workflows/amdsmi-build.yml b/.github/workflows/amdsmi-build.yml
index 802d68cd07..d659146c97 100644
--- a/.github/workflows/amdsmi-build.yml
+++ b/.github/workflows/amdsmi-build.yml
@@ -23,12 +23,12 @@ jobs:
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
- max-parallel: 10
+ max-parallel: 10 # This can be in parallel, since no tests are running (see comment about driver reloads)
matrix:
os: [Ubuntu20, Ubuntu22, Debian10]
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
- options: --privileged
+ options: --privileged --cap-add=SYS_MODULE -v /lib/modules:/lib/modules
steps:
- uses: actions/checkout@v4
@@ -137,11 +137,12 @@ jobs:
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
+ max-parallel: 1 # Tests need to run one-at-a-time to avoid conflicts with driver reloads
matrix:
os: [Ubuntu20, Ubuntu22, Debian10]
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
- options: --privileged
+ options: --privileged --cap-add=SYS_MODULE -v /lib/modules:/lib/modules
steps:
- uses: actions/checkout@v4
@@ -358,7 +359,7 @@ jobs:
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
- max-parallel: 10
+ max-parallel: 10 # This can be in parallel, since no tests are running (see comment about driver reloads)
matrix:
os:
- SLES
@@ -369,7 +370,7 @@ jobs:
- AlmaLinux8
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
- options: --privileged
+ options: --privileged --cap-add=SYS_MODULE -v /lib/modules:/lib/modules
steps:
- uses: actions/checkout@v4
@@ -554,12 +555,13 @@ jobs:
rpm-test:
name: Tests
- needs: rpm-buildinstall
+ needs: [rpm-buildinstall, debian-test] # debian-test must complete before rpm-test starts (see comment about driver reloads)
runs-on:
- self-hosted
- ${{ vars.RUNNER_TYPE }}
continue-on-error: true
strategy:
+ max-parallel: 1 # Tests need to run one-at-a-time to avoid conflicts with driver reloads
matrix:
os:
- SLES
@@ -570,7 +572,7 @@ jobs:
- AlmaLinux8
container:
image: ${{ vars[format('{0}_DOCKER_IMAGE', matrix.os)] }}
- options: --privileged
+ options: --privileged --cap-add=SYS_MODULE -v /lib/modules:/lib/modules
steps:
- uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d4d7718621..d6f7a33d65 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -31,6 +31,9 @@ Full documentation for amd_smi_lib is available at [https://rocm.docs.amd.com/pr
### Added
+- **Added restarting (reloading) AMD GPU driver to both CLI and API calls**
+ - Refer to [Separated driver reload from `amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()` and CLI (`sudo amd-smi set -M `)](#separate-driver-reload-anchor) section for more details.
+
- **Added the Default command**.
- A default view has been added. The default view provides a snapshot of commonly requested information such as bdf, current partition mode, version information, and more. Users can access that information by simply typing `amd-smi` with no additional commands or arguments. Users may also obtain this information through laternate output formats such as json or csv by using the default command with the respective output format: `amd-smi default --json` or `amd-smi default --csv`.
@@ -111,6 +114,16 @@ $ amd-smi
### Changed
+
+- **Separated driver reload from `amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()` and CLI (`sudo amd-smi set -M `)**
+ - Provides a new API (`amdsmi_gpu_driver_reload()`) and CLI (`sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) to use once the user is ready to reload the driver. We understand
+ the automatic reload could come at an inconvenient time. This is why we now provide this
+ functionality in separate API/CLI commands to use when the time is right.
+ - It is important to understand that a memory (NPS) partition change requires:
+ 1) Memory partition change request (`amdsmi_set_gpu_memory_partition()` / `amdsmi_set_gpu_memory_partition_mode()`) or CLI (`sudo amd-smi set -M `)
+ 2) Driver reload (`amdsmi_gpu_driver_reload()` / `sudo amd-smi reset -r` or `sudo amd-smi reset --reload-driver`) \[\*\]
+ \[\*\] Driver reload requires all GPU activity on all devices to be stopped.
+
- **Modified `amd-smi` CLI `monitor` and `metric` for violations**.
- Disabled `amd-smi monitor --violation` on guests.
- Modified `amd-smi metric -T/--throttle` to alias to `amd-smi metric -v/--violation`.
diff --git a/amdsmi_cli/amdsmi_commands.py b/amdsmi_cli/amdsmi_commands.py
index cddad88098..97fbd7fe39 100644
--- a/amdsmi_cli/amdsmi_commands.py
+++ b/amdsmi_cli/amdsmi_commands.py
@@ -4620,12 +4620,14 @@ class AMDSMICommands():
return
if args.memory_partition:
- lock = multiprocessing.Lock()
- lock.acquire()
####################################################################
# Get current and available memory partition modes #
# Info used if AMDSMI_STATUS_INVAL is caught & to set progress bar #
####################################################################
+ self.helpers.increment_set_count()
+ set_count = self.helpers.get_set_count()
+ if set_count == 1: # only show reload warning on 1st set
+ self.helpers.confirm_changing_memory_partition_gpu_reload_warning()
try:
memory_dict = {'caps': "N/A", 'current': "N/A"}
memory_partition_config = amdsmi_interface.amdsmi_get_gpu_memory_partition_config(args.gpu)
@@ -4633,104 +4635,34 @@ class AMDSMICommands():
memory_dict['current'] = memory_partition_config['mp_mode']
except amdsmi_exception.AmdSmiLibraryException as e:
logging.debug("Failed to get current memory partition for GPU %s | %s", gpu_id, e.get_error_info())
-
- ###############################################################
- # memory partition set starts here #
- ###############################################################
- showProgressBar = False
- if ((str(memory_dict['current']) != "N/A") and (str(args.memory_partition) in memory_dict['caps'])
- and ((str(memory_dict['current']) != str(args.memory_partition)))):
- showProgressBar = True # Only show progress bar if
- # 1) Device can set memory partition modes
- # 2) Requested mode is a valid mode to set
- # 3) Current is not already the requested mode
- # otherwise function will return fast
- else:
- showProgressBar = False
-
- threads = []
- k140secs = 140
- string_out = f"Updating memory partition for GPU: {gpu_id}"
- timesToRetryRestartErr = 1
-
- self.helpers.increment_set_count()
- set_count = self.helpers.get_set_count()
- if set_count == 1: # only show reload warning on 1st set
- self.helpers.confirm_changing_memory_partition_gpu_reload_warning()
-
- while timesToRetryRestartErr >= 0:
- timesToRetryRestartErr -= 1
- try:
- if showProgressBar: # we want to overwrite the previous progress bar
- t1 = multiprocessing.Process(target=self.helpers.showProgressbar,
- args=(string_out, k140secs, True,))
- threads.append(t1)
- t1.start()
- memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
- amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition)
- for thread in threads:
- thread.terminate()
- print("")
- break # successful case
-
- except amdsmi_exception.AmdSmiLibraryException as e:
- f = open(os.devnull, 'w', encoding='utf-8') #redirect to /dev/null (crossplatform)
- print("\n\n", end='\r', flush=True, file=f)
- for thread in threads:
- thread.terminate()
-
- if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
- raise PermissionError('Command requires elevation') from e
- if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL:
- out = f"[AMDSMI_STATUS_INVAL] Unable to set memory partition to {args.memory_partition}"
- print(f"Valid Memory partition Modes: {memory_dict['caps']}\n")
- self.logger.store_output(args.gpu, 'memory_partition', out)
- self.logger.print_output()
- self.logger.clear_multiple_devices_output()
- lock.release()
- return
- if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NOT_SUPPORTED:
- out = f"[AMDSMI_STATUS_NOT_SUPPORTED] Unable to set memory partition to {args.memory_partition}"
- self.logger.store_output(args.gpu, 'memory_partition', out)
- self.logger.print_output()
- self.logger.clear_multiple_devices_output()
- lock.release()
- return
- if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_AMDGPU_RESTART_ERR:
- # Try again on a failure -> work around for not being able to close libdrm
- string_out = f"Trying again - Updating memory partition for GPU: {gpu_id} "
- for thread in threads:
- thread.terminate()
- thread.join()
- if timesToRetryRestartErr < 0:
- out = f"[AMDSMI_STATUS_AMDGPU_RESTART_ERR] Could not successfully restart driver after applying {args.memory_partition}"
- self.logger.store_output(args.gpu, 'memory_partition', out)
- self.logger.print_output()
- self.logger.clear_multiple_devices_output()
- return
- continue
-
- f = open(os.devnull, 'w', encoding='utf-8') #redirect to /dev/null (crossplatform)
- print("\n\n", end='\r', flush=True, file=f)
- out = f"Unable to set memory partition to {args.memory_partition} on {gpu_string}"
- print(out)
- self.logger.store_output(args.gpu, 'memorypartition', out)
+ try:
+ memory_partition = amdsmi_interface.AmdSmiMemoryPartitionType[args.memory_partition]
+ amdsmi_interface.amdsmi_set_gpu_memory_partition(args.gpu, memory_partition)
+ out = f"Successfully set memory partition to {args.memory_partition}, reload driver when ready"
+ except amdsmi_exception.AmdSmiLibraryException as e:
+ out = f"[{e.get_error_info(detailed=False)}] Unable to set memory partition to {args.memory_partition}"
+ if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+ out = f"[AMDSMI_STATUS_NO_PERM] Command requires elevation"
+ self.logger.store_output(args.gpu, 'memory_partition', out)
+ self.logger.print_output()
+ self.logger.clear_multiple_devices_output()
+ raise PermissionError('Command requires elevation') from e
+ elif e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_INVAL:
+ print(f"Valid Memory partition Modes: {memory_dict['caps']}\n")
+ self.logger.store_output(args.gpu, 'memory_partition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_output()
- lock.release()
return
- except Exception as e:
- for thread in threads:
- thread.terminate()
- out = f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}"
- print(out)
- lock.release()
- raise ValueError(f"Generic error found | Unable to set memory partition to {args.memory_partition} on {gpu_string}") from e
- self.logger.store_output(args.gpu, 'memory_partition', f"Successfully set memory partition to {args.memory_partition}")
+ else:
+ self.logger.store_output(args.gpu, 'memory_partition', out)
+ self.logger.print_output()
+ self.logger.clear_multiple_devices_output()
+ return
+ self.logger.store_output(args.gpu, 'memory_partition', out)
self.logger.print_output()
self.logger.clear_multiple_devices_output()
- lock.release()
return
+
if isinstance(args.power_cap, int):
try:
power_cap_info = amdsmi_interface.amdsmi_get_power_cap_info(args.gpu)
@@ -5173,7 +5105,7 @@ class AMDSMICommands():
def reset(self, args, multiple_devices=False, gpu=None, gpureset=None,
clocks=None, fans=None, profile=None, xgmierr=None, perf_determinism=None,
- power_cap=None, clean_local_data=None):
+ power_cap=None, reload_driver=None, clean_local_data=None):
"""Issue reset commands to target gpu(s)
Args:
@@ -5213,6 +5145,8 @@ class AMDSMICommands():
args.perf_determinism = perf_determinism
if power_cap:
args.power_cap = power_cap
+ if reload_driver:
+ args.reload_driver = reload_driver
if clean_local_data:
args.clean_local_data = clean_local_data
@@ -5237,15 +5171,19 @@ class AMDSMICommands():
# Error if no subcommand args are passed
if self.helpers.is_baremetal():
if not any([args.gpureset, args.clocks, args.fans, args.profile, args.xgmierr, \
- args.perf_determinism, \
- args.power_cap, args.clean_local_data]):
+ args.perf_determinism, args.power_cap, args.reload_driver, \
+ args.clean_local_data]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
else:
- if not any([args.clean_local_data]):
+ if not any([args.clean_local_data, args.reload_driver]):
command = " ".join(sys.argv[1:])
raise AmdSmiRequiredCommandException(command, self.logger.format)
+ #######################
+ # BM commands - START #
+ #######################
+
if self.helpers.is_baremetal():
if args.gpureset:
if self.helpers.is_amd_device(args.gpu):
@@ -5414,6 +5352,10 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_output()
return
+ #######################
+ # BM commands - END #
+ #######################
+
if args.clean_local_data:
try:
amdsmi_interface.amdsmi_clean_gpu_local_data(args.gpu)
@@ -5431,6 +5373,94 @@ class AMDSMICommands():
self.logger.clear_multiple_devices_output()
return
+ # Also available on VMs, since they should support the same reload as baremetal
+ if args.reload_driver:
+ # Check permissions BEFORE starting any processes
+ # Required to avoid permission errors when starting the progress bar
+ try:
+ if os.geteuid() != 0:
+ result = "[AMDSMI_STATUS_NO_PERM] Command requires elevation"
+ self.logger.store_output(args.gpu, 'reload_driver', result)
+ self.logger.print_output()
+ self.logger.clear_multiple_devices_output()
+ raise PermissionError('Command requires elevation')
+ except AttributeError:
+ pass # os.geteuid() not available on Windows
+ lock = multiprocessing.Lock()
+ lock.acquire()
+ is_lock_released = False
+ progress_process = None
+ try:
+ self.helpers.increment_set_count()
+ set_count = self.helpers.get_set_count()
+ if set_count == 1:
+ self.helpers.confirm_gpu_driver_reload_warning()
+ # Start progress bar in separate process
+ string_out = f"Reloading driver for all AMD GPUs:"
+ progress_process = multiprocessing.Process(
+ target=self.helpers.showProgressbar,
+ args=(string_out, 140, True)
+ )
+ progress_process.start()
+ # Perform the actual driver reload (this is where permission error occurs)
+ amdsmi_interface.amdsmi_gpu_driver_reload()
+ # If we get here, operation was successful
+ self.helpers.assign_previous_set_success_check(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SUCCESS)
+ result = "Successfully reloaded driver"
+ else:
+ if self.helpers.get_previous_set_success_check() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_SUCCESS:
+ result = "Successfully reloaded driver"
+ elif self.helpers.get_previous_set_success_check() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+ result = "[AMDSMI_STATUS_NO_PERM] Command requires elevation"
+ raise PermissionError('Command requires elevation')
+ else:
+ previous_check = self.helpers.get_previous_set_success_check()
+ temp_exception = amdsmi_exception.AmdSmiLibraryException(previous_check)
+ str_out = temp_exception.get_error_info(detailed=False)
+ result = f"[{str_out}] Unable to successfully restart driver"
+ except amdsmi_exception.AmdSmiLibraryException as e:
+ # Handle permission error FIRST, before any cleanup
+ self.helpers.assign_previous_set_success_check(e.get_error_code())
+ if e.get_error_code() == amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM:
+ self.helpers.assign_previous_set_success_check(amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_NO_PERM)
+ result = f"[{e.get_error_info(detailed=False)}] Command requires elevation"
+ # Clean termination of progress bar
+ if progress_process and progress_process.is_alive():
+ progress_process.terminate()
+ progress_process.join(timeout=0.1) # Wait max 0.1 second
+ if progress_process.is_alive():
+ progress_process.kill() # Force kill if needed
+ print("\n") # Clean up progress bar line
+ # Store result and exit early
+ self.logger.store_output(args.gpu, 'reload_driver', result)
+ self.logger.print_output()
+ self.logger.clear_multiple_devices_output()
+ if not is_lock_released:
+ lock.release()
+ is_lock_released = True
+ raise PermissionError('Command requires elevation') from e
+ else:
+ # Handle other errors
+ self.helpers.assign_previous_set_success_check(e.get_error_code())
+ result = f"[{e.get_error_info(detailed=False)}] Unable to successfully restart driver"
+ finally:
+ # Always clean up progress bar process
+ if progress_process and progress_process.is_alive():
+ progress_process.terminate()
+ progress_process.join(timeout=0.1)
+ if progress_process.is_alive():
+ progress_process.kill()
+ print("\n") # Clean up progress bar line
+ # Always release lock
+ if not is_lock_released:
+ lock.release()
+ is_lock_released = True
+ # Store and print result
+ self.logger.store_output(args.gpu, 'reload_driver', result)
+ self.logger.print_output()
+ self.logger.clear_multiple_devices_output()
+ return
+
def monitor(self, args, multiple_devices=False, watching_output=False, gpu=None,
watch=None, watch_time=None, iterations=None, power_usage=None,
diff --git a/amdsmi_cli/amdsmi_helpers.py b/amdsmi_cli/amdsmi_helpers.py
index af6fc4ad6f..8764009e56 100755
--- a/amdsmi_cli/amdsmi_helpers.py
+++ b/amdsmi_cli/amdsmi_helpers.py
@@ -61,6 +61,7 @@ class AMDSMIHelpers():
# Counts and Tracking variables
self._count_of_sets_called = 0
self._count_of_cper_files = 0
+ self._previous_set_success_check = amdsmi_interface.amdsmi_wrapper.AMDSMI_STATUS_UNKNOWN_ERROR
# Check if the system is a virtual OS
@@ -109,6 +110,17 @@ class AMDSMIHelpers():
def get_set_count(self):
return self._count_of_sets_called
+ def assign_previous_set_success_check(self, status):
+ """Assigns the previous set success check to the status provided.
+ This is used to determine if the last set was successful or not.
+ """
+ self._previous_set_success_check = status
+
+ def get_previous_set_success_check(self):
+ """Returns the previous set success check.
+ This is used to determine if the last set was successful or not.
+ """
+ return self._previous_set_success_check
def increment_cper_count(self):
self._count_of_cper_files += 1
@@ -896,14 +908,59 @@ class AMDSMIHelpers():
def confirm_changing_memory_partition_gpu_reload_warning(self, auto_respond=False):
""" Print the warning for running outside of specification and prompt user to accept the terms.
+ :param autoRespond: Response to automatically provide for all prompts
+ """
+
+ print('''
+ ******WARNING******\n
+ After changing memory (NPS) partition modes, users MUST restart
+ (reload) the AMD GPU driver. This command NO LONGER AUTOMATICALLY
+ reloads the driver, see `amd-smi reset -h` and
+ `sudo amd-smi reset -r` for more information.
+
+ This change is intended to allow users the ability to control when is
+ the best time to restart the AMD GPU driver, as it may not be desired
+ to restart the AMD GPU driver immediately after changing the
+ memory (NPS) partition mode.
+
+ Please use `sudo amd-smi reset -r` AFTER successfully
+ changing the memory (NPS) partition mode. A successful driver reload
+ is REQUIRED in order to complete updating ALL GPUs in the hive to
+ the requested partition mode.
+
+ ******REMINDER******
+ In order to reload the AMD GPU driver, users MUST quit all GPU
+ workloads across all devices.
+ ''')
+
+ if not auto_respond:
+ user_input = input('Do you accept these terms? [Y/N] ')
+ else:
+ user_input = auto_respond
+ if user_input in ['Yes', 'yes', 'y', 'Y', 'YES']:
+ print('')
+ return
+ else:
+ print('Confirmation not given. Exiting without setting value')
+ sys.exit(1)
+
+ def confirm_gpu_driver_reload_warning(self, auto_respond=False):
+ """ Print the warning for running outside of specification and prompt user to accept the terms.
+
:param autoRespond: Response to automatically provide for all prompts
"""
print('''
****** WARNING ******\n
- Setting Dynamic Memory (NPS) partition modes require users to quit all GPU workloads.
- AMD SMI will then attempt to change memory (NPS) partition mode.
- Upon a successful set, AMD SMI will then initiate an action to restart AMD GPU driver.
- This action will change all GPU's in the hive to the requested memory (NPS) partition mode.
+ AMD SMI is about to initiate an AMD GPU driver restart (module reload).
+
+ Reloading the AMD GPU driver REQUIRES users to quit all GPU activity across all
+ devices.
+
+ If user is initiating a driver reload AFTER changing memory (NPS) partition
+ modes (`sudo amd-smi set -M `), a AMD GPU driver reload is REQUIRED
+ to complete updating the partition mode. This change will effect ALL GPUs in
+ the hive. Advise using `amd-smi list -e` and `amd-smi partition -c -m`
+ afterwards to ensure changes were applied as expected.
Please use this utility with caution.
''')
@@ -918,7 +975,6 @@ class AMDSMIHelpers():
print('Confirmation not given. Exiting without setting value')
sys.exit(1)
-
def is_valid_profile(self, profile):
profile_presets = amdsmi_interface.amdsmi_wrapper.amdsmi_power_profile_preset_masks_t__enumvalues
if profile in profile_presets:
diff --git a/amdsmi_cli/amdsmi_parser.py b/amdsmi_cli/amdsmi_parser.py
index df3bac4ab0..2f854c1982 100644
--- a/amdsmi_cli/amdsmi_parser.py
+++ b/amdsmi_cli/amdsmi_parser.py
@@ -1323,6 +1323,7 @@ class AMDSMIParser(argparse.ArgumentParser):
reset_perf_det_help = "Disable performance determinism"
reset_power_cap_help = "Reset power capacity limit to max capable"
reset_gpu_clean_local_data_help = "Clean up local data in LDS/GPRs on a per partition basis"
+ reset_gpu_driver_help = "Reset (reload) AMD GPU driver"
# Create reset subparser
reset_parser = subparsers.add_parser('reset', help=reset_help, description=reset_subcommand_help)
@@ -1342,6 +1343,7 @@ class AMDSMIParser(argparse.ArgumentParser):
reset_exclusive_group.add_argument('-x', '--xgmierr', action='store_true', required=False, help=reset_xgmierr_help)
reset_exclusive_group.add_argument('-d', '--perf-determinism', action='store_true', required=False, help=reset_perf_det_help)
reset_exclusive_group.add_argument('-o', '--power-cap', action='store_true', required=False, help=reset_power_cap_help)
+ reset_exclusive_group.add_argument('-r', '--reload-driver', action='store_true', required=False, help=reset_gpu_driver_help)
# Add Baremetal and Virtual OS reset arguments
reset_exclusive_group.add_argument('-l', '--clean-local-data', action='store_true', required=False, help=reset_gpu_clean_local_data_help)
diff --git a/example/amd_smi_drm_example.cc b/example/amd_smi_drm_example.cc
index 7ed532da23..7850b6a9e0 100644
--- a/example/amd_smi_drm_example.cc
+++ b/example/amd_smi_drm_example.cc
@@ -588,6 +588,8 @@ int main() {
// Since memory partition effects entire GPU hive (and modifies current
// compute/accelerator partition), we'll default to only changing the
// first device for the first socket (GPU #0)
+ // Note: Any device can be requested to change memory partition,
+ // but for simplicity, we will only change GPU #0.
if (gpu_number == 0) {
std::cout << " **Changing memory partition for GPU #"
<< gpu_number << "...**\n";
@@ -613,6 +615,20 @@ int main() {
<< memoryPartitionString(updatePartition) << "): "
<< err_str << "\n\n";
+ // Reload only if the memory partition was set successfully
+ if (ret_set == AMDSMI_STATUS_SUCCESS) {
+ std::cout << "\t**Reloading GPU driver to apply memory "
+ << "partition change, this may take some time... **\n";
+ amdsmi_status_t reload_status = amdsmi_gpu_driver_reload();
+ amdsmi_status_code_to_string(reload_status, &err_str);
+ if (reload_status == AMDSMI_STATUS_SUCCESS) {
+ PRINT_AMDSMI_RET(reload_status)
+ std::cout << "\tamdsmi_gpu_driver_reload(): " << err_str << "\n\n";
+ } else {
+ std::cout << "\tamdsmi_gpu_driver_reload(): " << err_str << "\n\n";
+ }
+ }
+
// Get the current memory partition
char current_memory_partition[AMDSMI_MAX_STRING_LENGTH];
ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index],
@@ -678,6 +694,17 @@ int main() {
std::cout << "\t**Device Index: " << device_index << std::endl;
std::cout << "\t**Device Handle: " << processor_handles[device_index] << std::endl;
std::cout << "\t**GPU Number: " << gpu_number << std::endl;
+ // Since memory partition affects the entire GPU hive (and modifies current
+ // compute/accelerator partition), we'll default to only changing the
+ // first device for the first socket (GPU #0)
+ // Note: Any device can be requested to change memory partition,
+ // but for simplicity, we will only change GPU #0.
+ if (gpu_number != 0) {
+ std::cout << " **Skipping memory partition reset for GPU #"
+ << gpu_number << "...**\n";
+ gpu_number++;
+ continue;
+ }
// Reset to original memory partition settings
amdsmi_memory_partition_type_t orig_partition =
@@ -693,6 +720,19 @@ int main() {
std::cout << "\tamdsmi_set_gpu_memory_partition(" << gpu_number << ", "
<< memoryPartitionString(orig_partition) << "): "
<< err_str << "\n\n";
+ // Reload only if the memory partition was set successfully
+ if (ret_set == AMDSMI_STATUS_SUCCESS) {
+ std::cout << "\t**Reloading GPU driver to apply memory "
+ << "partition change, this may take some time... **\n";
+ amdsmi_status_t reload_status = amdsmi_gpu_driver_reload();
+ amdsmi_status_code_to_string(reload_status, &err_str);
+ if (reload_status == AMDSMI_STATUS_SUCCESS) {
+ PRINT_AMDSMI_RET(reload_status)
+ std::cout << "\tamdsmi_gpu_driver_reload(): " << err_str << "\n\n";
+ } else {
+ std::cout << "\tamdsmi_gpu_driver_reload(): " << err_str << "\n\n";
+ }
+ }
// Get the current memory partition
char current_memory_partition[AMDSMI_MAX_STRING_LENGTH];
ret = amdsmi_get_gpu_memory_partition(processor_handles[device_index],
diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h
index 77075ac9a4..00a8d9f502 100644
--- a/include/amd_smi/amdsmi.h
+++ b/include/amd_smi/amdsmi.h
@@ -5752,6 +5752,9 @@ amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, char *
* device's memory partition setting. This function does not allow any concurrent operations.
* Device must be idle and have no workloads when performing set partition operations.
*
+ * On @platform{gpu_bm_linux} AMDGPU driver restart is REQUIRED to complete updating to
+ * the new memory partition setting. Refer to `amdsmi_gpu_driver_reload()` for more details.
+ *
* @param[in] processor_handle Device which to query
*
* @param[in] memory_partition using enum ::amdsmi_memory_partition_type_t,
@@ -5762,8 +5765,6 @@ amdsmi_get_gpu_memory_partition(amdsmi_processor_handle processor_handle, char *
* @retval ::AMDSMI_STATUS_INVAL the provided arguments are not valid
* @retval ::AMDSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
* support this function
- * @retval ::AMDSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
- * the amdgpu driver
* @return ::amdsmi_status_t
*
*/
@@ -5796,6 +5797,14 @@ amdsmi_get_gpu_memory_partition_config(amdsmi_processor_handle processor_handle,
*
* @platform{gpu_bm_linux} @platform{host}
*
+ * @details Given a processor handle @p processor_handle and a type of memory partition
+ * @p mode, this function will attempt to update the selected
+ * device's memory partition setting. This function does not allow any concurrent operations.
+ * Device must be idle and have no workloads when performing set partition operations.
+ *
+ * On @platform{gpu_bm_linux} AMDGPU driver restart is REQUIRED to complete updating to
+ * the new memory partition setting. Refer to `amdsmi_gpu_driver_reload()` for more details.
+ *
* @param[in] processor_handle A processor handle
*
* @param[in] mode Enum representing memory partitioning mode to set
@@ -6395,6 +6404,61 @@ amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *
/** @} End tagProcessInfo */
+/*****************************************************************************/
+/** @defgroup tagDriverControl Driver control mechanisms
+ * These functions provide control over the driver. Users should use with
+ * caution as they may cause the driver to become unstable.
+ * @{
+ */
+/**
+ * @brief Restart the device driver (kmod module) for all AMD GPUs on the
+ * system.
+ *
+ * @ingroup tagDriverControl
+ *
+ * @platform{gpu_bm_linux} @platform{guest_1vf} @platform{guest_mvf}
+ *
+ * @details This function will reload the AMD GPU driver as described in
+ * the Linux kernel documentation -
+ * https://docs.kernel.org/admin-guide/sysctl/kernel.html#modprobe
+ * with no extra parameters as specified in
+ * https://docs.kernel.org/gpu/amdgpu/module-parameters.html.
+ *
+ * Use this function with caution, as it will unload and reload the AMD GPU
+ * driver: `modprobe -r amdgpu && modprobe amdgpu`.
+ *
+ * Any process or workload using the AMD GPU driver is REQUIRED to be
+ * stopped before calling this function. Otherwise, the function will return
+ * ::AMDSMI_STATUS_AMDGPU_RESTART_ERR (could not successfully restart
+ * the amdgpu driver).
+ *
+ * User is REQUIRED to have root/admin privileges to call this function.
+ * Otherwise, this function will return ::AMDSMI_STATUS_NO_PERM.
+ *
+ * This API will take time to complete, as we are checking the driver's
+ * loading status to confirm it reloaded properly. If
+ * ::AMDSMI_STATUS_AMDGPU_RESTART_ERR is returned, it means the driver
+ * did not reload properly and the user should check dmesg logs.
+ *
+ * This function has been created in order to conveniently reload the
+ * AMD GPU driver once `amdsmi_set_gpu_memory_partition()` or
+ * `amdsmi_set_gpu_memory_partition_mode()` has successfully completed
+ * on Baremetal systems. Now users can control the reload once all GPU
+ * processes/workloads have been stopped on the AMD GPU driver.
+ * An (AMD GPU) driver reload is REQUIRED to complete changing
+ * to the new memory partition configuration, and the preceding
+ * (`amdsmi_set_gpu_memory_partition()`/`amdsmi_set_gpu_memory_partition_mode()`)
+ * operation MUST be successful. This function WILL AFFECT all GPUs in the
+ * hive, which are reconfigured with the specified memory partition configuration.
+ *
+ * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success
+ * @return | ::AMDSMI_STATUS_NO_PERM function requires root access
+ * @return | ::AMDSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
+ * the amdgpu driver.
+ */
+amdsmi_status_t amdsmi_gpu_driver_reload(void);
+/** @} End tagDriverControl */
+
#ifdef ENABLE_ESMI_LIB
/*****************************************************************************/
diff --git a/py-interface/__init__.py b/py-interface/__init__.py
index 9ad7b13d35..57e3b322f6 100644
--- a/py-interface/__init__.py
+++ b/py-interface/__init__.py
@@ -144,6 +144,7 @@ from .amdsmi_interface import amdsmi_set_gpu_od_volt_info
from .amdsmi_interface import amdsmi_set_gpu_perf_level
from .amdsmi_interface import amdsmi_get_gpu_power_profile_presets
from .amdsmi_interface import amdsmi_reset_gpu
+from .amdsmi_interface import amdsmi_gpu_driver_reload
from .amdsmi_interface import amdsmi_set_gpu_perf_determinism_mode
from .amdsmi_interface import amdsmi_set_gpu_fan_speed
from .amdsmi_interface import amdsmi_reset_gpu_fan
diff --git a/py-interface/amdsmi_interface.py b/py-interface/amdsmi_interface.py
index 162c3aa520..49b1f4c0e9 100644
--- a/py-interface/amdsmi_interface.py
+++ b/py-interface/amdsmi_interface.py
@@ -3721,6 +3721,9 @@ def amdsmi_reset_gpu(processor_handle: processor_handle):
_check_res(amdsmi_wrapper.amdsmi_reset_gpu(processor_handle))
+def amdsmi_gpu_driver_reload():
+ _check_res(amdsmi_wrapper.amdsmi_gpu_driver_reload())
+
def amdsmi_set_gpu_fan_speed(
processor_handle: processor_handle, sensor_idx: int, fan_speed: int
diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py
index 8f78defeca..b512ef4265 100644
--- a/py-interface/amdsmi_wrapper.py
+++ b/py-interface/amdsmi_wrapper.py
@@ -2934,6 +2934,9 @@ amdsmi_get_violation_status.argtypes = [amdsmi_processor_handle, ctypes.POINTER(
amdsmi_get_gpu_process_list = _libraries['libamd_smi.so'].amdsmi_get_gpu_process_list
amdsmi_get_gpu_process_list.restype = amdsmi_status_t
amdsmi_get_gpu_process_list.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(struct_amdsmi_proc_info_t)]
+amdsmi_gpu_driver_reload = _libraries['libamd_smi.so'].amdsmi_gpu_driver_reload
+amdsmi_gpu_driver_reload.restype = amdsmi_status_t
+amdsmi_gpu_driver_reload.argtypes = []
amdsmi_get_cpu_core_energy = _libraries['libamd_smi.so'].amdsmi_get_cpu_core_energy
amdsmi_get_cpu_core_energy.restype = amdsmi_status_t
amdsmi_get_cpu_core_energy.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint64)]
@@ -3446,13 +3449,14 @@ __all__ = \
'amdsmi_get_xgmi_plpd', 'amdsmi_gpu_block_t',
'amdsmi_gpu_cache_info_t', 'amdsmi_gpu_control_counter',
'amdsmi_gpu_counter_group_supported', 'amdsmi_gpu_create_counter',
- 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_metrics_t',
- 'amdsmi_gpu_read_counter', 'amdsmi_gpu_validate_ras_eeprom',
- 'amdsmi_gpu_xcp_metrics_t', 'amdsmi_gpu_xgmi_error_status',
- 'amdsmi_hsmp_driver_version_t', 'amdsmi_hsmp_freqlimit_src_names',
- 'amdsmi_hsmp_metrics_table_t', 'amdsmi_init',
- 'amdsmi_init_flags_t', 'amdsmi_init_gpu_event_notification',
- 'amdsmi_io_bw_encoding_t', 'amdsmi_is_P2P_accessible',
+ 'amdsmi_gpu_destroy_counter', 'amdsmi_gpu_driver_reload',
+ 'amdsmi_gpu_metrics_t', 'amdsmi_gpu_read_counter',
+ 'amdsmi_gpu_validate_ras_eeprom', 'amdsmi_gpu_xcp_metrics_t',
+ 'amdsmi_gpu_xgmi_error_status', 'amdsmi_hsmp_driver_version_t',
+ 'amdsmi_hsmp_freqlimit_src_names', 'amdsmi_hsmp_metrics_table_t',
+ 'amdsmi_init', 'amdsmi_init_flags_t',
+ 'amdsmi_init_gpu_event_notification', 'amdsmi_io_bw_encoding_t',
+ 'amdsmi_is_P2P_accessible',
'amdsmi_is_gpu_power_management_enabled', 'amdsmi_kfd_info_t',
'amdsmi_link_id_bw_type_t', 'amdsmi_link_metrics_t',
'amdsmi_link_type_t', 'amdsmi_memory_page_status_t',
diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h
index 9fab284ee1..16f8ecd7f2 100644
--- a/rocm_smi/include/rocm_smi/rocm_smi.h
+++ b/rocm_smi/include/rocm_smi/rocm_smi.h
@@ -5498,6 +5498,56 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind);
/** @} */ // end of DevMetricsHeaderInfoGet
+/*****************************************************************************/
+/** @defgroup DriverControl Driver control mechanisms
+ * These functions provide control over the driver. Users should use with
+ * caution as they may cause the driver to become unstable.
+ * @{
+ */
+/**
+ * @brief Restart the device driver (kmod module) for all AMD GPUs on the
+ * system.
+ *
+ * @details This function will reload the AMD GPU driver as described in
+ * the Linux kernel documentation -
+ * https://docs.kernel.org/admin-guide/sysctl/kernel.html#modprobe
+ * with no extra parameters as specified in
+ * https://docs.kernel.org/gpu/amdgpu/module-parameters.html.
+ *
+ * Use this function with caution, as it will unload and reload the AMD GPU
+ * driver: `modprobe -r amdgpu && modprobe amdgpu`.
+ *
+ * Any process or workload using the AMD GPU driver is REQUIRED to be
+ * stopped before calling this function. Otherwise, this function will
+ * return ::RSMI_STATUS_AMDGPU_RESTART_ERR, because the amdgpu driver could
+ * not be restarted successfully.
+ *
+ * User is REQUIRED to have root/admin privileges to call this function.
+ * Otherwise, this function will return ::RSMI_STATUS_PERMISSION.
+ *
+ * This API will take time to complete, as we are checking the driver's
+ * loading status to confirm it reloaded properly. If
+ * ::RSMI_STATUS_AMDGPU_RESTART_ERR is returned, it means the driver
+ * did not reload properly and the user should check dmesg logs.
+ *
+ * This function has been created in order to conveniently reload the
+ * AMD GPU driver once the memory partition has been successfully changed
+ * with `rsmi_dev_memory_partition_set()` on Baremetal systems.
+ * Users can now trigger the reload once all GPU processes/workloads
+ * using the AMD GPU driver have been stopped. An AMD GPU driver reload
+ * is REQUIRED to complete the change to the new memory partition
+ * configuration, and the `rsmi_dev_memory_partition_set()` operation MUST
+ * have been successful. This function WILL AFFECT all GPUs in the hive,
+ * which are reconfigured with the specified memory partition configuration.
+ *
+ * @retval ::RSMI_STATUS_SUCCESS call was successful
+ * @retval ::RSMI_STATUS_PERMISSION function requires root access
+ * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
+ * the amdgpu driver.
+ */
+rsmi_status_t rsmi_dev_amdgpu_driver_reload(void);
+/** @} */ // end of DriverControl
+
#ifdef __cplusplus
}
#endif // __cplusplus
diff --git a/rocm_smi/src/rocm_smi.cc b/rocm_smi/src/rocm_smi.cc
index ef627c15d7..09ca45fec2 100644
--- a/rocm_smi/src/rocm_smi.cc
+++ b/rocm_smi/src/rocm_smi.cc
@@ -3731,6 +3731,52 @@ rsmi_dev_gpu_reset(uint32_t dv_ind) {
CATCH
}
+// Reloads the amdgpu kernel module by delegating to
+// Device::restartAMDGpuDriver() on device index 0 and returns that call's
+// status unchanged. The outcome is logged when the logger is enabled.
+rsmi_status_t rsmi_dev_amdgpu_driver_reload(void) {
+  TRY
+  std::ostringstream ss;
+  ss << __PRETTY_FUNCTION__ << "| ======= start =======";
+  LOG_TRACE(ss);
+  // The public header documents RSMI_STATUS_PERMISSION for callers without
+  // root. Apply the same guard rsmi_dev_memory_partition_set() uses so a
+  // non-root caller is rejected up front instead of reaching modprobe.
+  // NOTE(review): presumably this macro returns RSMI_STATUS_PERMISSION when
+  // the effective user is not root -- confirm against its definition.
+  REQUIRE_ROOT_ACCESS
+  // TODO(amdsmi_team): technically, we should block for all devices.
+  // As this is a global operation, we can use a mutex to ensure
+  // that only one thread is trying to restart the driver at a time.
+  uint32_t dv_ind = 0;  // Default to first device
+  DEVICE_MUTEX
+  GET_DEV_FROM_INDX
+
+  rsmi_status_t restartRet = dev->restartAMDGpuDriver();
+
+  // Attempting to speed up processing time: only build log text when the
+  // logger is enabled.
+  bool is_logger_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
+  if (restartRet != RSMI_STATUS_SUCCESS) {
+    if (is_logger_enabled) {
+      ss << __PRETTY_FUNCTION__
+         << " | ======= end ======= "
+         << " | Fail - restart AMD GPU detected"
+         << " | Device #: " << dv_ind
+         << " | Type: AMDGPU Driver Reload"
+         << " | Cause: AMDGPU Driver Reload failed "
+         << " | Returning = "
+         << getRSMIStatusString(restartRet, false);
+      LOG_ERROR(ss);
+    }
+    return restartRet;
+  }
+
+  if (is_logger_enabled) {
+    ss << __PRETTY_FUNCTION__
+       << " | ======= end ======= "
+       << " | Success - if restart completed successfully"
+       << " | Device #: " << dv_ind
+       << " | Type: AMDGPU Driver Reload"
+       << " | Returning = "
+       << getRSMIStatusString(restartRet, false);
+    LOG_INFO(ss);
+  }
+  return restartRet;
+
+  CATCH
+}
+
rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
TRY
@@ -6490,17 +6536,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
LOG_TRACE(ss);
REQUIRE_ROOT_ACCESS
DEVICE_MUTEX
- const int k1000_MS_WAIT = 1000;
const uint32_t kMaxMemoryCapabilitiesSize = 30;
char available_memory_capabilities[kMaxMemoryCapabilitiesSize];
available_memory_capabilities[0] = '\0';
- const uint32_t kMaxCurrentMemoryMode = 5;
- char current_memory_mode[kMaxCurrentMemoryMode];
- current_memory_mode[0] = '\0';
-
-
// Is the current mode already what user requested?
switch (memory_partition) {
case RSMI_MEMORY_PARTITION_NPS1:
@@ -6605,11 +6645,11 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
GET_DEV_FROM_INDX
int ret = dev->writeDevInfo(amd::smi::kDevMemoryPartition,
newMemoryPartition);
+ rsmi_status_t status = amd::smi::ErrnoToRsmiStatus(ret);
- if (amd::smi::ErrnoToRsmiStatus(ret) != RSMI_STATUS_SUCCESS) {
- rsmi_status_t err = amd::smi::ErrnoToRsmiStatus(ret);
- if (ret == EACCES) {
- err = RSMI_STATUS_NOT_SUPPORTED; // already verified permissions
+ if (status != RSMI_STATUS_SUCCESS) {
+    // `status` already holds the translated rsmi_status_t; the raw errno from
+    // writeDevInfo() is still in `ret`, so that is what must be compared
+    // against EACCES (as the code did before this refactor).
+    if (ret == EACCES) {
+      status = RSMI_STATUS_NOT_SUPPORTED;  // already verified permissions
}
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
@@ -6619,93 +6659,22 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Cause: issue writing reqested setting of " + newMemoryPartition
<< " | Returning = "
- << getRSMIStatusString(err, false);
+ << getRSMIStatusString(status, false);
LOG_ERROR(ss);
- return err;
+ return status;
}
- rsmi_status_t restartRet = dev->restartAMDGpuDriver();
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
- << " | Success - if restart completed successfully"
+ << " | Success "
<< " | Device #: " << dv_ind
<< " | Type: "
<< amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
<< " | Data: " << newMemoryPartition
<< " | Returning = "
- << getRSMIStatusString(restartRet, false);
- LOG_TRACE(ss);
-
- if (restartRet != RSMI_STATUS_SUCCESS) {
- ss << __PRETTY_FUNCTION__
- << " | ======= end ======= "
- << " | Fail - restart AMD GPU detected"
- << " | Device #: " << dv_ind
- << " | Type: "
- << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
- << " | Cause: issue writing reqested setting of " + newMemoryPartition
- << " | Returning = "
- << getRSMIStatusString(restartRet, false);
- LOG_ERROR(ss);
- return restartRet;
- }
-
- std::string current_memory_mode_str = "unknown";
- rsmi_status_t can_read_sysfs_again = RSMI_STATUS_AMDGPU_RESTART_ERR;
- int maxWaitSeconds = 10;
- // wait until we can read SYSFS again
- if (restartRet == RSMI_STATUS_SUCCESS) {
- while ((current_memory_mode_str != user_requested_memory_partition)
- && maxWaitSeconds > 0) {
- maxWaitSeconds -= 1;
- can_read_sysfs_again =
- rsmi_dev_memory_partition_get(dv_ind, current_memory_mode, kMaxCurrentMemoryMode);
- if (can_read_sysfs_again == RSMI_STATUS_SUCCESS) {
- current_memory_mode_str.clear();
- current_memory_mode_str = current_memory_mode;
- ss << __PRETTY_FUNCTION__
- << " | ======= rsmi_dev_memory_partition_get ======= "
- << " | Success - can read SYSFS"
- << " | Device #: " << dv_ind
- << " | Type: "
- << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
- << " | Data (user requested mode): " << user_requested_memory_partition
- << " | Current Memory Partition Mode: " << current_memory_mode_str
- << " | Available Memory Partition Modes: " << memory_capabilities_str
- << " | maxWaitSeconds: " << maxWaitSeconds
- << " | total wait time (sec): " << (10 - maxWaitSeconds)
- << " | Returning = "
- << getRSMIStatusString(can_read_sysfs_again, false);
- LOG_TRACE(ss);
- if (!current_memory_mode_str.empty()
- && (current_memory_mode_str == user_requested_memory_partition)) {
- break;
- }
- }
- amd::smi::system_wait(k1000_MS_WAIT);
- }
- }
-
- if (current_memory_mode_str == user_requested_memory_partition) {
- restartRet = RSMI_STATUS_SUCCESS;
- } else {
- restartRet = RSMI_STATUS_AMDGPU_RESTART_ERR;
- }
-
- ss << __PRETTY_FUNCTION__
- << " | ======= end ======= "
- << " | Success - completed driver restart and all SYSFS are active"
- << " | Device #: " << dv_ind
- << " | Type: "
- << amd::smi::Device::get_type_string(amd::smi::kDevMemoryPartition)
- << " | Data: " << user_requested_memory_partition
- << " | Current Memory Partition Mode: " << current_memory_mode_str
- << " | Available Memory Partition Modes: " << memory_capabilities_str
- << " | Returning = "
- << getRSMIStatusString(restartRet, false);
- LOG_TRACE(ss);
-
- return restartRet;
+ << getRSMIStatusString(status, false);
+ LOG_INFO(ss);
+ return status;
CATCH
}
diff --git a/rocm_smi/src/rocm_smi_device.cc b/rocm_smi/src/rocm_smi_device.cc
index 2509c837a2..9185cbb581 100755
--- a/rocm_smi/src/rocm_smi_device.cc
+++ b/rocm_smi/src/rocm_smi_device.cc
@@ -1678,34 +1678,44 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
bool isAMDGPUModuleLive = false;
bool restartGDM = false;
std::string captureRestartErr;
+ // 1 sec = 1000 ms = 1000000 us
const int kTimeToWaitForDriverMSec = 1000;
+ // Attempting to speed up processing time
+ bool is_logger_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
// sudo systemctl is-active gdm
// we do not care about the success of checking if gdm is active
- std::tie(success, out) = executeCommand("systemctl is-active gdm", true);
+ std::tie(success, out) = executeCommand("systemctl is-active gdm 2>/dev/null", true);
(out == "active") ? (restartGDM = true) : (restartGDM = false);
- ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
- << out << "; success = " << (success ? "True" : "False");
- LOG_INFO(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | systemctl is-active gdm: out = "
+ << out << "; success = " << (success ? "True" : "False")
+ << "; restartGDM = " << (restartGDM ? "True" : "False");
+ LOG_INFO(ss);
+ }
// if gdm is active -> sudo systemctl stop gdm
// TODO(AMD_SMI_team): are there other display managers we need to take into account?
// see https://help.gnome.org/admin/gdm/stable/overview.html.en_GB
if (success && (out == "active") && (restartGDM)) {
wasGdmServiceActive = true;
- std::tie(success, out) = executeCommand("systemctl stop gdm&", true);
- ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
- << out << "; success = " << (success ? "True" : "False");
- LOG_INFO(ss);
+    // The redirection must precede `&`: after `&` it belongs to a separate,
+    // empty command and would not silence systemctl's stderr.
+    std::tie(success, out) = executeCommand("systemctl stop gdm 2>/dev/null &", true);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | systemctl stop gdm&: out = "
+ << out << "; success = " << (success ? "True" : "False");
+ LOG_INFO(ss);
+ }
} else {
success = true; // ignore failures to restart gdm
}
- ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
- << out << "; success = " << (success ? "True" : "False")
- << "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
- << "; captureRestartErr = " << captureRestartErr;
- LOG_INFO(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | B4 modprobing anything!!! out = "
+ << out << "; success = " << (success ? "True" : "False")
+ << "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
+ << "; captureRestartErr = " << captureRestartErr;
+ LOG_INFO(ss);
+ }
// sudo modprobe -r amdgpu
// sudo modprobe amdgpu
@@ -1713,26 +1723,32 @@ rsmi_status_t Device::restartAMDGpuDriver(void) {
"modprobe -r -v amdgpu >/dev/null 2>&1 && modprobe -v amdgpu >/dev/null 2>&1", true);
restartSuccessful &= success;
captureRestartErr = out;
- ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
- << out << "; success = " << (success ? "True" : "False")
- << "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
- << "; captureRestartErr = " << captureRestartErr;
- LOG_INFO(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | modprobe -r -v amdgpu && modprobe -v amdgpu: out = "
+ << out << "; success = " << (success ? "True" : "False")
+ << "; restartSuccessful = " << (restartSuccessful ? "True" : "False")
+ << "; captureRestartErr = " << captureRestartErr;
+ LOG_INFO(ss);
+ }
// if gdm was active -> sudo systemctl start gdm
// We don't care if successful or not, just try to restart as a courtesy
if (wasGdmServiceActive && restartGDM) {
- std::tie(success, out) = executeCommand("systemctl start gdm&", true);
- ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
- << out << "; success = " << (success ? "True" : "False");
- LOG_INFO(ss);
+    // The redirection must precede `&`: after `&` it belongs to a separate,
+    // empty command and would not silence systemctl's stderr.
+    std::tie(success, out) = executeCommand("systemctl start gdm 2>/dev/null &", true);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | systemctl start gdm&: out = "
+ << out << "; success = " << (success ? "True" : "False");
+ LOG_INFO(ss);
+ }
}
// Return early if there was an issue restarting amdgpu
if (!restartSuccessful) {
- ss << __PRETTY_FUNCTION__ << " | [WARNING] Issue found during amdgpu restart: "
- << captureRestartErr << "; retartSuccessful: " << (restartSuccessful ? "True" : "False");
- LOG_INFO(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | [ERROR] Issue found during amdgpu restart: "
+ << captureRestartErr << "; retartSuccessful: " << (restartSuccessful ? "True" : "False");
+ LOG_ERROR(ss);
+ }
return RSMI_STATUS_AMDGPU_RESTART_ERR;
}
@@ -1764,13 +1780,17 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
bool deviceRestartInProgress = true; // Assume in progress, we intend to disprove
bool isSystemAMDGPUModuleLive = false; // Assume AMD GPU module is not live,
// we intend to disprove
+ // Attempting to speed up processing time
+ bool is_logger_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
// wait for amdgpu module to come back up
std::tie(success, out) = executeCommand("cat /sys/module/amdgpu/initstate", true);
- ss << __PRETTY_FUNCTION__
- << " | success = " << (success ? "True" : "False")
- << " | out = " << out;
- LOG_DEBUG(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__
+ << " | success = " << (success ? "True" : "False")
+ << " | out = " << out;
+ LOG_DEBUG(ss);
+ }
if ((success == true) && (!out.empty())) {
isSystemAMDGPUModuleLive = containsString(out, "live");
}
@@ -1779,11 +1799,13 @@ rsmi_status_t Device::isRestartInProgress(bool *isRestartInProgress,
}
*isRestartInProgress = deviceRestartInProgress;
*isAMDGPUModuleLive = isSystemAMDGPUModuleLive;
- ss << __PRETTY_FUNCTION__
- << " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
- << " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
- << " | out = " << out;
- LOG_DEBUG(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__
+ << " | *isRestartInProgress = " << (*isRestartInProgress ? "True":"False")
+ << " | *isAMDGPUModuleLive = " << (*isAMDGPUModuleLive ? "True":"False")
+ << " | out = " << out;
+ LOG_DEBUG(ss);
+ }
return ((*isAMDGPUModuleLive && !*isRestartInProgress) ? RSMI_STATUS_SUCCESS :
RSMI_STATUS_AMDGPU_RESTART_ERR);
diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc
index 4863e59c6d..8332cb2574 100644
--- a/rocm_smi/src/rocm_smi_utils.cc
+++ b/rocm_smi/src/rocm_smi_utils.cc
@@ -1283,18 +1283,25 @@ void system_wait(int milli_seconds) {
auto start = std::chrono::high_resolution_clock::now();
// 1 ms = 1000 us
int waitTime = milli_seconds * 1000;
- ss << __PRETTY_FUNCTION__ << " | "
- << "** Waiting for " << std::dec << waitTime
- << " us (" << waitTime/1000 << " milli-seconds) **";
- LOG_DEBUG(ss);
+ // Attempting to speed up processing time
+ bool is_logger_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | "
+ << "** Waiting for " << std::dec << waitTime
+ << " us (" << waitTime/1000 << " milli-seconds) **";
+ LOG_DEBUG(ss);
+ }
+
usleep(waitTime);
auto stop = std::chrono::high_resolution_clock::now();
auto duration =
    // Microseconds: the value is divided by 1000 below and logged as milliseconds.
    std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
- ss << __PRETTY_FUNCTION__ << " | "
- << "** Waiting took " << duration.count() / 1000
- << " milli-seconds **";
- LOG_DEBUG(ss);
+ if (is_logger_enabled) {
+ ss << __PRETTY_FUNCTION__ << " | "
+ << "** Waiting took " << duration.count() / 1000
+ << " milli-seconds **";
+ LOG_DEBUG(ss);
+ }
}
int countDigit(uint64_t n) {
diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc
index 2c3b37b190..ca0cc53d45 100644
--- a/src/amd_smi/amd_smi.cc
+++ b/src/amd_smi/amd_smi.cc
@@ -2346,7 +2346,8 @@ amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
ss << __PRETTY_FUNCTION__
<< " | After attepting to set memory partition to " << req_user_partition << "\n"
<< " | Current memory partition is " << current_partition_str << "\n"
- << " | Returning: " << smi_amdgpu_get_status_string(ret, false);
+ << " | Returning: " << smi_amdgpu_get_status_string(ret, false)
+ << " | User will need to reload driver in order to see a NPS mode change";
LOG_INFO(ss);
return ret;
}
@@ -2983,6 +2984,8 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h
}
} else {
profile->profile_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
+ current_partition_str.clear();
+ current_partition_str = "N/A";
}
amdsmi_gpu_metrics_t metric_info = {};
@@ -3656,6 +3659,26 @@ amdsmi_status_t amdsmi_reset_gpu(amdsmi_processor_handle processor_handle) {
return ret;
}
+// Reloads the AMD GPU driver via rsmi_dev_amdgpu_driver_reload() and returns
+// the result converted with amd::smi::rsmi_to_amdsmi_status(). Start and
+// result are logged at INFO level when the logger is enabled.
+amdsmi_status_t amdsmi_gpu_driver_reload(void) {
+  std::ostringstream ss;
+  AMDSMI_CHECK_INIT();
+
+  // Query the logger once so log text is only assembled when it will be used.
+  bool log_enabled = ROCmLogging::Logger::getInstance()->isLoggerEnabled();
+  if (log_enabled) {
+    ss << __PRETTY_FUNCTION__ << " | ======= start =======";
+    LOG_INFO(ss);
+  }
+
+  // Delegate to ROCm SMI, then translate its status for AMD SMI callers.
+  rsmi_status_t rsmi_result = rsmi_dev_amdgpu_driver_reload();
+  amdsmi_status_t reload_status = amd::smi::rsmi_to_amdsmi_status(rsmi_result);
+  if (!log_enabled) {
+    return reload_status;
+  }
+
+  ss << __PRETTY_FUNCTION__
+     << " | Returning: " << smi_amdgpu_get_status_string(reload_status, false);
+  LOG_INFO(ss);
+  return reload_status;
+}
+
amdsmi_status_t amdsmi_get_gpu_busy_percent(amdsmi_processor_handle processor_handle,
uint32_t *gpu_busy_percent) {
return rsmi_wrapper(rsmi_dev_busy_percent_get, processor_handle, 0, gpu_busy_percent);
diff --git a/tests/amd_smi_test/functional/memorypartition_read_write.cc b/tests/amd_smi_test/functional/memorypartition_read_write.cc
index 60d568a790..546476b4ec 100755
--- a/tests/amd_smi_test/functional/memorypartition_read_write.cc
+++ b/tests/amd_smi_test/functional/memorypartition_read_write.cc
@@ -42,6 +42,50 @@ const uint32_t MAX_DPX_PARTITIONS = 2;
const uint32_t MAX_TPX_PARTITIONS = 3;
const uint32_t MAX_QPX_PARTITIONS = 4;
+// Calls amdsmi_gpu_driver_reload(), times the call, and stores the returned
+// status in *reload_status so the caller can inspect it afterwards.
+// When isVerbose is set, prints preReloadMessage before the call, the elapsed
+// time after it, and then one of successMessage / restartErrorMessage /
+// errorMessage depending on the status.
+// Finishes with ASSERT_EQ so the test records a failure unless the reload
+// returned AMDSMI_STATUS_SUCCESS.
+void ReloadDriverWithMessages(bool isVerbose,
+                              const std::string& preReloadMessage,
+                              const std::string& successMessage,
+                              const std::string& errorMessage,
+                              const std::string& restartErrorMessage,
+                              amdsmi_status_t *reload_status) {
+  if (isVerbose) {
+    std::cout << "\t**" << preReloadMessage << std::endl;
+  }
+
+  auto start_time = std::chrono::steady_clock::now();
+  auto driver_reload_status = amdsmi_gpu_driver_reload();
+  auto end_time = std::chrono::steady_clock::now();
+  // Explicit target durations: duration_cast cannot deduce its destination.
+  auto elapsed_time = std::chrono::duration_cast<std::chrono::milliseconds>(
+      end_time - start_time);
+  auto elapsed_seconds =
+      std::chrono::duration_cast<std::chrono::seconds>(end_time - start_time);
+  // Publish the status before any assertion can leave this helper early.
+  *reload_status = driver_reload_status;
+
+  if (isVerbose) {
+    std::cout << "\t**"
+              << "amdsmi_gpu_driver_reload() took "
+              << elapsed_time.count() << " milliseconds ("
+              << elapsed_seconds.count() << " seconds)" << std::endl;
+  }
+
+  if (driver_reload_status == AMDSMI_STATUS_SUCCESS) {
+    if (isVerbose) {
+      std::cout << "\t**" << successMessage << std::endl;
+    }
+  } else if (driver_reload_status == AMDSMI_STATUS_AMDGPU_RESTART_ERR) {
+    // No assertion here: the final ASSERT_EQ below reports this failure.
+    if (isVerbose) {
+      std::cout << "\t**" << restartErrorMessage << std::endl;
+    }
+  } else {
+    if (isVerbose) {
+      std::cout << "\t**" << errorMessage << ": "
+                << smi_amdgpu_get_status_string(driver_reload_status, false) << std::endl;
+    }
+  }
+  // Test should fail if the driver reload fails
+  ASSERT_EQ(driver_reload_status, AMDSMI_STATUS_SUCCESS);
+}
+
TestMemoryPartitionReadWrite::TestMemoryPartitionReadWrite() : TestBase() {
set_title("AMDSMI Memory Partition Read Test");
set_description("The memory partition tests verifies that the memory "
@@ -335,6 +379,26 @@ void TestMemoryPartitionReadWrite::Run(void) {
}
}
+ // Basic check we can reload the driver, regardless of if changing memory partition
+ // is supported or not
+ // FYI Need to place after saving current compute partitions, since reloading driver will reset
+ // all back to SPX/DPX/etc (whatever is default for that NPS mode; see
+ // `sudo amd-smi partition -a`).
+ IF_VERB(STANDARD) {
+ std::cout << "\t**"
+ << "======== TEST AMDSMI_GPU_DRIVER_RELOAD() BEFORE"
+ << " MEMORY PARTITION CHECKS ===============" << std::endl;
+ }
+ amdsmi_status_t driver_reload_status = AMDSMI_STATUS_NOT_SUPPORTED;
+ std::string preload_message =
+ "\t Reloading the AMD GPU driver before memory partition checks."
+ " This may take some time, please wait...";
+ ReloadDriverWithMessages(isVerbose, preload_message,
+ "amdsmi_gpu_driver_reload() successful.",
+ "amdsmi_gpu_driver_reload() failed",
+ "amdsmi_gpu_driver_reload() failed with AMDGPU_RESTART_ERR",
+ &driver_reload_status);
+
// Run memory partition tests
IF_VERB(STANDARD) {
std::cout << "\t**=========================================================\n";
@@ -348,7 +412,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
std::cout << "\t**Total Num Devices: " << current_num_devices << std::endl;
}
// Leaving for debug purposes - uncomment to test a specific number of devices
- // uint32_t num_devices_to_test = promptNumDevicesToTest(current_num_devices);
+ // uint32_t num_devices_to_test = 1;
uint32_t num_devices_to_test = current_num_devices;
for (uint32_t dv_ind = 0; dv_ind < num_devices_to_test; ++dv_ind) {
bool wasSetSuccess = false;
@@ -587,8 +651,34 @@ void TestMemoryPartitionReadWrite::Run(void) {
|| (ret_set == AMDSMI_STATUS_NOT_SUPPORTED));
}
+ amdsmi_status_t driver_reload_status = AMDSMI_STATUS_NOT_SUPPORTED;
if (ret_set == AMDSMI_STATUS_SUCCESS) { // do not continue trying to reset
- wasSetSuccess = true;
+ // Now we require a separate call to reload the driver, since this operation
+ // has been removed from the amdsmi_set_gpu_memory_partition_mode and
+ // amdsmi_set_gpu_memory_partition().
+ // This is to allow the user to select the appropriate time to reload the driver
+ // since there can be errors if any device has a workload/process running on it.
+ std::string reload_message =
+ "\t Reloading the AMD GPU driver after setting memory partition to "
+ + memoryPartitionString(new_memory_partition)
+ + ". This may take some time, please wait...";
+ std::string driver_reload_success_message =
+ "amdsmi_gpu_driver_reload() successful after setting memory partition to "
+ + memoryPartitionString(new_memory_partition);
+ std::string failure_message =
+ "amdsmi_gpu_driver_reload() failed after setting memory partition to "
+ + memoryPartitionString(new_memory_partition);
+ std::string restart_error_message =
+ "amdsmi_gpu_driver_reload() failed with AMDGPU_RESTART_ERR after "
+ "setting memory partition to " + memoryPartitionString(new_memory_partition);
+ ReloadDriverWithMessages(isVerbose, reload_message,
+ driver_reload_success_message,
+ failure_message,
+ restart_error_message,
+ &driver_reload_status);
+ if (driver_reload_status == AMDSMI_STATUS_SUCCESS) {
+ wasSetSuccess = true;
+ }
}
ret = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind],
@@ -664,6 +754,32 @@ void TestMemoryPartitionReadWrite::Run(void) {
<< smi_amdgpu_get_status_string(ret, false) << std::endl;
}
CHK_ERR_ASRT(ret)
+ if (ret == AMDSMI_STATUS_SUCCESS) {
+ // Now we require a separate call to reload the driver, since this operation
+ // has been removed from the amdsmi_set_gpu_memory_partition_mode and
+ // amdsmi_set_gpu_memory_partition().
+ // This is to allow the user to select the appropriate time to reload the driver
+ // since there can be errors if any device has a workload/process running on it.
+ driver_reload_status = AMDSMI_STATUS_NOT_SUPPORTED;
+ std::string reload_message =
+ "\t Reloading the AMD GPU driver after resetting memory partition to "
+ + std::string(orig_memory_partition)
+ + ". This may take some time, please wait...";
+ std::string driver_reload_success_message =
+ "amdsmi_gpu_driver_reload() successful after resetting memory partition to "
+ + std::string(orig_memory_partition);
+ std::string failure_message =
+ "amdsmi_gpu_driver_reload() failed after resetting memory partition to "
+ + std::string(orig_memory_partition);
+ std::string restart_error_message =
+ "amdsmi_gpu_driver_reload() failed with AMDGPU_RESTART_ERR after "
+ "resetting memory partition to " + std::string(orig_memory_partition);
+ ReloadDriverWithMessages(isVerbose, reload_message,
+ driver_reload_success_message,
+ failure_message,
+ restart_error_message,
+ &driver_reload_status);
+ }
ret = amdsmi_get_gpu_memory_partition(processor_handles_[dv_ind],
current_memory_partition, k255Len);
CHK_ERR_ASRT(ret)