[SWDEV-518325/SWDEV-518320/SWDEV-443309] Fix Partition Enumeration

* Changes:
  - Updates to DRM renderD* / card* pathing for partition devices
  - Now use KFD to discover AMD devices and populate accordingly.
    Each device MUST have an accessible KFD node (via cgroups).
  - Updated several ROCm SMI CLI outputs to handle SYSFS files
    which are not accessible on partition nodes
  - Added a new method to help get card/DRM info
    (rsmi_dev_device_identifiers_get) from ROCm SMI

Change-Id: If844f27ffc595942272abe9c8167ed90a0b0e225
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
This commit is contained in:
Charis Poag
2025-04-13 22:38:31 -05:00
zatwierdzone przez Arif, Maisam
rodzic 2630bf0a8c
commit a0df877fdf
9 zmienionych plików z 554 dodań i 368 usunięć
+6 -6
Wyświetl plik
@@ -321,7 +321,7 @@ def getDRMDeviceId(device, silent=False):
dv_id = c_short()
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
device_id_ret = "N/A"
if rsmi_ret_ok(ret, device, 'get_device_id', silent):
if rsmi_ret_ok(ret, device, 'get_device_id', silent=True):
device_id_ret = hex(dv_id.value)
return device_id_ret
@@ -336,7 +336,7 @@ def getRev(device, silent=False):
dv_rev = c_short()
ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
revision_ret = "N/A"
if rsmi_ret_ok(ret, device, 'get_device_rev', silent=silent):
if rsmi_ret_ok(ret, device, 'get_device_rev', silent=True):
revision_ret = padHexValue(hex(dv_rev.value), 2)
return revision_ret
@@ -350,7 +350,7 @@ def getSubsystemId(device, silent=False):
model = c_short()
ret = rocmsmi.rsmi_dev_subsystem_id_get(device, byref(model))
device_model = "N/A"
if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent=silent):
if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent=True):
device_model = model.value
# padHexValue is used for applications that expect 4-digit card models
device_model = padHexValue(hex(device_model), 4)
@@ -1986,7 +1986,7 @@ def showAllConcise(deviceList):
(retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent)
fan = str(fanSpeed) + '%'
if getPerfLevel(device, silent) != -1:
perf = getPerfLevel(device, silent)
perf = str(getPerfLevel(device, silent)).lower()
else:
perf = 'N/A'
if getMaxPower(device, silent) != -1:
@@ -2007,7 +2007,7 @@ def showAllConcise(deviceList):
str(getGUID(device)),
temp_val, powerVal,
combined_partition_data,
sclk, mclk, fan, str(perf).lower(),
sclk, mclk, fan, perf,
str(pwrCap),
allocated_mem_percent['combined'],
str(gpu_busy)]
@@ -2514,7 +2514,7 @@ def showMemUse(deviceList):
printLog(device, 'GPU Memory Allocated (VRAM%)',
int(allocated_mem_percent['value']))
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
if rsmi_ret_ok(ret, device, '% memory use'):
if rsmi_ret_ok(ret, device, '% memory use', silent=True):
printLog(device, 'GPU Memory Read/Write Activity (%)', memoryUse.value)
util_counters = getCoarseGrainUtil(device, "Memory Activity")
if util_counters != -1: