2
0

[SWDEV-574637] Avoid redundant hive gpu resets (#2657)

Mode-1 GPU reset affects entire XGMI hive. Added
xgmi_hive_id check to reset only once for same-hive
GPUs while preserving separate resets for different
hives or no hives.
 - Example:
   `sudo amd-smi reset -G` or `sudo amd-smi reset -G -g 0`
   on MI300 will reset all GPU's only once.

Signed-off-by: Bindhiya Kanangot Balakrishnan <Bindhiya.KanangotBalakrishnan@amd.com>
Este cometimento está contido em:
Bindhiya Kanangot Balakrishnan
2026-01-28 22:59:17 -06:00
cometido por GitHub
ascendente e9c72b06b0
cometimento fa6f071751
+37
Ver ficheiro
@@ -5566,6 +5566,43 @@ class AMDSMICommands():
self.helpers.check_required_groups(check_render=True, check_video=False)
self.group_check_printed = True
# Mode-1 gpureset is hive-wide.
# Group GPUs by hive and reset each hive only once.
gpus_to_reset = []
if args.gpureset and isinstance(args.gpu, list) and len(args.gpu) > 1:
# Group GPUs by their XGMI hive ID.
# If GPU not in a hive or no hive info, reset individually.
hive_to_gpus = {}
gpus_without_hive = []
for gpu in args.gpu:
try:
xgmi_info = amdsmi_interface.amdsmi_get_xgmi_info(gpu)
if isinstance(xgmi_info, dict):
hive_id = xgmi_info.get('xgmi_hive_id', None)
if hive_id is not None and hive_id != 0:
if hive_id not in hive_to_gpus:
hive_to_gpus[hive_id] = []
hive_to_gpus[hive_id].append(gpu)
else:
gpus_without_hive.append(gpu)
else:
gpus_without_hive.append(gpu)
except:
gpus_without_hive.append(gpu)
# For each hive, reset using the first GPU (resets entire hive)
for hive_id, gpu_list in hive_to_gpus.items():
gpus_to_reset.append(gpu_list[0])
# Add all non-hive GPUs to reset individually
gpus_to_reset.extend(gpus_without_hive)
# Update args.gpu to only the GPUs to reset
if gpus_to_reset:
args.gpu = gpus_to_reset
# Handle multiple GPUs
handled_multiple_gpus, device_handle = self.helpers.handle_gpus(args, self.logger, self.reset)
if handled_multiple_gpus: