From f9fd6b0a963cb2327d421192cec202677dfcf1be Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 22 Aug 2023 17:15:18 -0500 Subject: [PATCH 1/5] rocm_smi_lib/rocm_smi.py: Fix rocm-smi --showfan shows 'unable to detect fan' Properly handles 'Unable to detect' vs 'Not supported' fan cases where: * sysfs file (pwm#) exists, and readings report zero (0), "Unable to detect fan speed" * sysfs file (pwm#) does not exist, then "Not supported" Change-Id: If4b0312c872b76647a3e54427ba2a3f3e8e6dab1 Signed-off-by: Oliveira, Daniel --- python_smi_tools/rocm_smi.py | 58 +++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 003efadc66..8483d804b8 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -205,16 +205,31 @@ def getFanSpeed(device): fl = 0 fm = 0 + """ If ret = 2; (No such file or directory) + /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX + """ ret = rocmsmi.rsmi_dev_fan_speed_get(device, sensor_ind, byref(fanLevel)) if rsmi_ret_ok(ret, device, 'get_fan_speed', True): fl = fanLevel.value + last_ret = ret + + """ If ret = 2; (No such file or directory) + /sys/class/drm/cardX/device/hwmon/hwmonX/pwmX + """ ret = rocmsmi.rsmi_dev_fan_speed_max_get(device, sensor_ind, byref(fanMax)) if rsmi_ret_ok(ret, device, 'get_fan_max_speed', True): fm = fanMax.value - if fl == 0 or fm == 0: - return (fl, 0) # to prevent division by zero crash - return (fl, round((float(fl) / float(fm)) * 100, 2)) + """ In case we had an error before, we don't overwrite it with a + possible success now. Otherwise, we get the next error. 
+ """ + if (last_ret == rsmi_status_t.RSMI_STATUS_SUCCESS): + last_ret = ret + + if fl == 0 or fm == 0: + return (last_ret, fl, 0) # to prevent division by zero crash + + return (last_ret, fl, round((float(fl) / float(fm)) * 100, 2)) def getGpuUse(device): @@ -1608,7 +1623,7 @@ def showAllConcise(deviceList): concise = True sclk = showCurrentClocks([device], 'sclk', concise) mclk = showCurrentClocks([device], 'mclk', concise) - (fanLevel, fanSpeed) = getFanSpeed(device) + (retCode, fanLevel, fanSpeed) = getFanSpeed(device) fan = str(fanSpeed) + '%' if getPerfLevel(device) != -1: perf = getPerfLevel(device) @@ -1816,22 +1831,25 @@ def showCurrentFans(deviceList): sensor_ind = c_uint32(0) for device in deviceList: - (fanLevel, fanSpeed) = getFanSpeed(device) - fanSpeed = round(fanSpeed) - if fanLevel == 0 or fanSpeed == 0: - printLog(device, 'Unable to detect fan speed for GPU %d' % (device), None) - logging.debug('Current fan speed is: %d\n' % (fanSpeed) + \ - ' Current fan level is: %d\n' % (fanLevel) + \ - ' (GPU might be cooled with a non-PWM fan)') - continue - if PRINT_JSON: - printLog(device, 'Fan speed (level)', str(fanLevel)) - printLog(device, 'Fan speed (%)', str(fanSpeed)) + (retCode, fanLevel, fanSpeed) = getFanSpeed(device) + if (retCode == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED): + printLog(device, 'Not supported', None) else: - printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed))) - ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed)) - if rsmi_ret_ok(ret, device, 'get_fan_rpms'): - printLog(device, 'Fan RPM', rpmSpeed.value) + fanSpeed = round(fanSpeed) + if fanLevel == 0 or fanSpeed == 0: + printLog(device, 'Unable to detect fan speed for GPU %d' % (device), None) + logging.debug('Current fan speed is: %d\n' % (fanSpeed) + \ + ' Current fan level is: %d\n' % (fanLevel) + \ + ' (GPU might be cooled with a non-PWM fan)') + continue + if PRINT_JSON: + printLog(device, 'Fan speed (level)', str(fanLevel)) + 
printLog(device, 'Fan speed (%)', str(fanSpeed)) + else: + printLog(device, 'Fan Level', str(fanLevel) + ' (%s%%)' % (str(fanSpeed))) + ret = rocmsmi.rsmi_dev_fan_rpms_get(device, sensor_ind, byref(rpmSpeed)) + if rsmi_ret_ok(ret, device, 'get_fan_rpms'): + printLog(device, 'Fan RPM', rpmSpeed.value) printLogSpacer() @@ -3337,7 +3355,7 @@ def save(deviceList, savefilepath): clocks[device][clk_type] = str(freq.current) else: clocks[device][clk_type] = '0' - fanSpeeds[device] = getFanSpeed(device)[0] + fanSpeeds[device] = getFanSpeed(device)[1] od = c_uint32() ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(od)) if rsmi_ret_ok(ret, device, 'get_overdrive_level'): From 654f65118b15dea36625e4b72a291556f96a7a1a Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Wed, 23 Aug 2023 23:44:26 -0500 Subject: [PATCH 2/5] rocm_smi_lib/rocm_smi.py: Fix rocm-smi --resetfans shows 'permission denied' Properly handles 'Not supported' fan cases where: * sysfs file (pwm#_enable) exists * sysfs file (pwm#_enable) does not exist Change-Id: Ifa3c290e5ee1d27a550e94d86cd25ad8dcef3f59 Signed-off-by: Oliveira, Daniel --- python_smi_tools/rocm_smi.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 8483d804b8..24e13e882e 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -785,8 +785,12 @@ def resetFans(deviceList): for device in deviceList: sensor_ind = c_uint32(0) ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) - if rsmi_ret_ok(ret, device, 'reset_fan'): - printLog(device, 'Successfully reset fan speed to driver control', None) + if (ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED) or (ret == rsmi_status_t.RSMI_STATUS_PERMISSION): + if not rsmi_ret_ok(rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED, device, 'reset_fan'): + continue + else: + if rsmi_ret_ok(ret, device, 'reset_fan'): + printLog(device, 'Successfully reset fan speed to driver control', None) 
printLogSpacer() From 36024471096ec88725354efee9ed24a8ada5dc50 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 8 Aug 2023 16:47:16 -0500 Subject: [PATCH 3/5] rocm_smi_lib/rocm_smi.py: Fix Add 'GPU name' in rocm-smi output Code changes related to the following: * rocm_smi.py Change-Id: I600e776bf479f972b8d639ce5a658a24916aed3c Signed-off-by: Oliveira, Daniel --- python_smi_tools/rocm_smi.py | 86 ++++++++++++++++++++++++++++++++++-- 1 file changed, 82 insertions(+), 4 deletions(-) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 24e13e882e..08f2652665 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -1586,18 +1586,24 @@ def showAllConcise(deviceList): @param deviceList: List of DRM devices (can be a single-item list) """ - global PRINT_JSON + global PRINT_JSON, appWidth if PRINT_JSON: print('ERROR: Cannot print JSON/CSV output for concise output') sys.exit(1) + + """ Place holder for the actual max size """ + MAX_ALL_CONCISE_WIDTH = 100 + appWidth_temp = appWidth + appWidth = MAX_ALL_CONCISE_WIDTH + printLogSpacer(' Concise Info ') deviceList.sort() (temp_type, _) = findFirstAvailableTemp(deviceList[0]) available_temp_type = temp_type.lower() available_temp_type = available_temp_type.replace('(', '') available_temp_type = available_temp_type.replace(')', '') - header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] - subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', ''] + header = ['GPU', '[Model : Revision]', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + subheader = ['', 'Name (20 chars)', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', ''] # add additional spaces to match header for idx, item in enumerate(subheader): header_size = len(header[idx]) @@ -1614,6 +1620,8 @@ def showAllConcise(deviceList): values = {} degree_sign = u'\N{DEGREE SIGN}' for device in 
deviceList: + gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info_names = list(gpu_dev_product_info[device]) temp_val = str(getTemp(device, available_temp_type)) if temp_val != 'N/A': temp_val += degree_sign + 'C' @@ -1647,10 +1655,19 @@ def showAllConcise(deviceList): mem_use_pct='Unsupported' if vram_used != None and vram_total != None and float(vram_total) != 0: mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total))) - values['card%s' % (str(device))] = [device, temp_val, avgPwr, + + gpu_dev_product_info_top_name = gpu_dev_product_info_names[0] + if (len(gpu_dev_product_info_names) > 1): + values['card%s_Info' % (str(device))] = ['', gpu_dev_product_info_names[0], '', '', '', + '', '', '', + '', '', '', ''] + gpu_dev_product_info_top_name = gpu_dev_product_info_names[1] + + values['card%s' % (str(device))] = [device, gpu_dev_product_info_top_name, temp_val, avgPwr, combined_partition, sclk, mclk, fan, str(perf).lower(), pwrCap, mem_use_pct, gpu_busy] + val_widths = {} for device in deviceList: val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]] @@ -1662,10 +1679,19 @@ def showAllConcise(deviceList): printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)), None, useItalics=True) printLogSpacer(fill='=') + for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) + gpu_dev_product_info = getDevProductInfo(device) + gpu_dev_product_info_names = list(gpu_dev_product_info[device]) + if (len(gpu_dev_product_info_names) > 1): + printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in + zip(range(len(max_widths)), values['card%s_Info' % (str(device))])), None) + printLogSpacer() + """ Restore original max size """ + appWidth = appWidth_temp def showAllConciseHw(deviceList): @@ -2378,6 +2404,58 @@ def 
def getDevProductInfo(device):
    """ Return the fixed-width product description strings for a device

    Two strings are produced, each truncated and space-padded to exactly
    MAX_DESC_SIZE characters: the card series name and '[model : revision]'.
    Any piece that is skipped or cannot be read is reported as 'N/A', so the
    returned dictionary always holds an entry for the requested device and
    callers can index it with the device directly.

    @param device: Device we want to get the info for
    @return: Dictionary of the form {device: [series_str, model_str]}
    """
    MAX_BUFF_SIZE = 256
    MAX_DESC_SIZE = 20
    NOT_AVAILABLE = 'N/A'

    def fitDescription(text):
        # Truncate first, then pad, so every column entry has the same width
        return str(text)[:MAX_DESC_SIZE].ljust(MAX_DESC_SIZE, ' ')

    # Defaults cover every path on which a lookup below is skipped or fails.
    # Without them the formatting code would read names that were never bound.
    device_series = NOT_AVAILABLE
    device_model = NOT_AVAILABLE
    gpu_revision = NOT_AVAILABLE

    # Retrieve card vendor
    vendor = create_string_buffer(MAX_BUFF_SIZE)
    ret = rocmsmi.rsmi_dev_vendor_name_get(device, vendor, MAX_BUFF_SIZE)
    # Only continue if GPU vendor is AMD
    if rsmi_ret_ok(ret, device, 'get_vendor_name') and isAmdDevice(device):
        # Retrieve the device series
        series = create_string_buffer(MAX_BUFF_SIZE)
        ret = rocmsmi.rsmi_dev_name_get(device, series, MAX_BUFF_SIZE)
        if rsmi_ret_ok(ret, device, 'get_name'):
            try:
                device_series = series.value.decode()
            except UnicodeDecodeError:
                printErrLog(device, "Unable to read card series")

        # Retrieve the device model
        model = create_string_buffer(MAX_BUFF_SIZE)
        ret = rocmsmi.rsmi_dev_subsystem_name_get(device, model, MAX_BUFF_SIZE)
        if rsmi_ret_ok(ret, device, 'get_subsystem_name'):
            try:
                device_model = padHexValue(model.value.decode(), 4)
            except UnicodeDecodeError:
                printErrLog(device, "Unable to read device model")

        # Reading the revision is best effort: report the failure and keep 'N/A'
        try:
            gpu_revision = padHexValue(getRev(device), 2)
        except Exception as exc:
            printErrLog(device, "Unable to read card revision %s" % (exc))

    device_model_str = '[%s : %s]' % (device_model, gpu_revision)
    return {device: [fitDescription(device_series), fitDescription(device_model_str)]}
From 471fbfddc1dd6786542c4e4d1753346c5225840e Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 24 Aug 2023 13:08:47 -0500 Subject: [PATCH 4/5] Numa affinity shows large number Change the affinity from unsigned int to integer to represent -1. Change-Id: I82dc6f476b45fa4ec03a3c686fe8e6e2b7761b56 --- include/rocm_smi/rocm_smi.h | 2 +- python_smi_tools/rocm_smi.py | 2 +- src/rocm_smi.cc | 7 ++++--- src/rocm_smi_device.cc | 2 +- tests/rocm_smi_test/functional/sys_info_read.cc | 7 ++++--- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 053447b501..9fa7de0b35 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -1534,7 +1534,7 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid); * support this function with the given arguments * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid */ -rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node); +rsmi_status_t rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node); /** * @brief Get PCIe traffic information diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 08f2652665..4771a29f8f 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -3077,7 +3077,7 @@ def showNumaTopology(deviceList): @param deviceList: List of DRM devices (can be a single-item list) """ printLogSpacer(' Numa Nodes ') - numa_numbers = c_uint32() + numa_numbers = c_int32() for device in deviceList: ret = rocmsmi.rsmi_topo_get_numa_node_number(device, byref(numa_numbers)) if rsmi_ret_ok(ret, device, 'get_numa_node_number'): diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index b3347a155d..5c2e3f8fd5 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -756,7 +756,7 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { } rsmi_status_t -rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node) { 
+rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) { TRY rsmi_status_t ret; uint64_t val = 0; @@ -764,9 +764,10 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, uint32_t *numa_node) { CHK_SUPPORT_NAME_ONLY(numa_node) DEVICE_MUTEX - ret = get_dev_value_int(amd::smi::kDevNumaNode, dv_ind, &val); + std::string str_val; + ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val); + *numa_node = std::stol(str_val, 0); - *numa_node = static_cast(val); return ret; CATCH } diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index ddaf41a44a..f859a9812e 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -928,7 +928,6 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevDFCountersAvailable: case kDevMemBusyPercent: case kDevXGMIError: - case kDevNumaNode: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); if (tempStr == "") { @@ -1046,6 +1045,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevAvailableComputePartition: case kDevComputePartition: case kDevMemoryPartition: + case kDevNumaNode: return readDevInfoStr(type, val); break; diff --git a/tests/rocm_smi_test/functional/sys_info_read.cc b/tests/rocm_smi_test/functional/sys_info_read.cc index dcff82aa84..1a2d9ff4df 100755 --- a/tests/rocm_smi_test/functional/sys_info_read.cc +++ b/tests/rocm_smi_test/functional/sys_info_read.cc @@ -90,6 +90,7 @@ void TestSysInfoRead::Run(void) { rsmi_status_t err; uint64_t val_ui64; uint32_t val_ui32; + int32_t val_i32; char buffer[80]; rsmi_version_t ver = {0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, nullptr}; @@ -136,11 +137,11 @@ void TestSysInfoRead::Run(void) { err = rsmi_dev_pci_id_get(i, nullptr); ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); - err = rsmi_topo_numa_affinity_get(i, &val_ui32); + err = rsmi_topo_numa_affinity_get(i, &val_i32); CHK_ERR_ASRT(err) IF_VERB(STANDARD) { - std::cout << "\t**NUMA NODE: 0x" << std::hex << val_ui32; - std::cout << " (" << std::dec << val_ui32 << ")" << 
std::endl; + std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32; + std::cout << " (" << std::dec << val_i32 << ")" << std::endl; } // Verify api support checking functionality is working err = rsmi_topo_numa_affinity_get(i, nullptr); From 84e90e55d5a2dad3ae4d91662f94e33349110b7a Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Fri, 25 Aug 2023 14:01:53 -0500 Subject: [PATCH 5/5] TESTS - Add 90402 and simplify description Change-Id: Ie6ab12d4201841fcb832d6827a5ec0ae5bb65114 Signed-off-by: Galantsev, Dmitrii --- tests/rocm_smi_test/rsmitst.exclude | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tests/rocm_smi_test/rsmitst.exclude b/tests/rocm_smi_test/rsmitst.exclude index 7d229b68f4..43a738ab22 100644 --- a/tests/rocm_smi_test/rsmitst.exclude +++ b/tests/rocm_smi_test/rsmitst.exclude @@ -57,18 +57,19 @@ $BLACKLIST_ALL_ASICS\ "rsmitstReadWrite.TestPerfLevelReadWrite" # SWDEV-391407 +# aqua_vanjaram and later systems show 'ip discovery' in +# /sys/class/kfd/kfd/topology/nodes/*/name +# +# For those systems gfx_target_version must be used. It can be found in +# /sys/class/kfd/kfd/topology/nodes/*/properties FILTER[90400]=\ $BLACKLIST_ALL_ASICS\ "rsmitstReadOnly.TestVoltCurvRead:"\ "rsmitstReadOnly.TestFrequenciesRead:"\ "rsmitstReadWrite.TestFrequenciesReadWrite:"\ "rsmitstReadWrite.TestPowerReadWrite" -FILTER[90401]=\ -$BLACKLIST_ALL_ASICS\ -"rsmitstReadOnly.TestVoltCurvRead:"\ -"rsmitstReadOnly.TestFrequenciesRead:"\ -"rsmitstReadWrite.TestFrequenciesReadWrite:"\ -"rsmitstReadWrite.TestPowerReadWrite" +FILTER[90401]=${FILTER[90400]} +FILTER[90402]=${FILTER[90400]} # SWDEV-321166 FILTER[virtualization]=\