diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 60e7f3ccf6..83e4df78ec 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -23,6 +23,9 @@ set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}") set(ROCM_SMI_TARGET "${ROCM_SMI}64") set(ROCM_SMI_LIB_NAME "lib${ROCM_SMI_TARGET}") +# Expose project info to IDEs +option(CMAKE_EXPORT_COMPILE_COMMANDS "Export compile commands for linters and autocompleters" ON) + # provide git to utilities find_program (GIT NAMES git) @@ -55,9 +58,6 @@ project(${AMD_SMI_LIBS_TARGET}) include(GNUInstallDirs) set(COMMON_PROJ_ROOT ${PROJECT_SOURCE_DIR}) -## Verbose output. -set(CMAKE_VERBOSE_MAKEFILE on) - if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.4.0) message("Compiler version is " ${CMAKE_CXX_COMPILER_VERSION}) diff --git a/projects/rocm-smi-lib/DEBIAN/postinst.in b/projects/rocm-smi-lib/DEBIAN/postinst.in index c3176bb6c6..98e9365825 100755 --- a/projects/rocm-smi-lib/DEBIAN/postinst.in +++ b/projects/rocm-smi-lib/DEBIAN/postinst.in @@ -35,9 +35,14 @@ do_configureLogrotate() { size 1M copytruncate dateext - dateformat .%Y-%m-%d_%H:%M:%S + dateformat .%%Y-%%m-%%d_%H:%%M:%%S } EOF + # Fix for %S argument not found (now we escape with %%) + # issue was RPM build thought we were using macros + # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 + # https://rpm-software-management.github.io/rpm/manual/spec.html + sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document diff --git a/projects/rocm-smi-lib/RPM/post.in b/projects/rocm-smi-lib/RPM/post.in index d1992de5ba..bfb0613388 100755 --- a/projects/rocm-smi-lib/RPM/post.in +++ b/projects/rocm-smi-lib/RPM/post.in @@ -34,9 +34,14 @@ do_configureLogrotate() { size 1M copytruncate dateext - dateformat .%Y-%m-%d_%H:%M:%S + dateformat .%%Y-%%m-%%d_%H:%%M:%%S } EOF + # Fix for %S argument not found (now we escape with %%) + # issue was RPM build thought we were using macros + # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 + # https://rpm-software-management.github.io/rpm/manual/spec.html + sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 502a0537d9..5b304ee770 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -326,7 +326,7 @@ def getPerfLevel(device): ret = rocmsmi.rsmi_dev_perf_level_get(device, byref(perf)) if rsmi_ret_ok(ret, device, 'get_perf_level'): return perf_level_string(perf.value) - return -1 + return 'N/A' def getPid(name): @@ -540,8 +540,7 @@ def printEventList(device, delay, eventList): print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1], data.message.decode('utf8') + '\r']]) - -def printLog(device, metricName, value, extraSpace=False): +def printLog(device, metricName, value=None, extraSpace=False): """ Print out to the SMI log @param device: DRM device identifier @@ -900,7 +899,7 @@ def setClockRange(deviceList, clkType, minvalue, maxvalue, autoRespond): try: int(minvalue) & int(maxvalue) except ValueError: - printErrLog(device, 'Unable to set %s range' % (clkType)) + printErrLog(None, 'Unable to set %s range' % (clkType)) logging.error('%s or %s is not an integer', minvalue, maxvalue) RETCODE = 1 return @@ -1076,7 +1075,7 @@ def setClocks(deviceList, clktype, clk): int(check_value) except ValueError: printLog(None, 'Unable to set clock level', None) - logging.error('Non-integer characters are present in value %s', value) + logging.error('Non-integer characters are present in value %s', check_value) RETCODE = 1 return # Generate a frequency bitmask from user input value @@ -1155,7 +1154,7 @@ def setPerfDeterminism(deviceList, clkvalue): try: int(clkvalue) except ValueError: - printErrLog(device, 'Unable to set Performance Determinism') + printErrLog(None, 'Unable to set Performance Determinism') logging.error('%s is not an integer', clkvalue) RETCODE = 1 return @@ -1225,19 +1224,16 @@ def setRas(deviceList, rasAction, rasBlock, rasType): printLog(None, "This is experimental feature, use 'amdgpuras' tool for ras error manipulations for newer vbios") if rasAction not in validRasActions: - printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType), - None) + printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType)) logging.debug('Action %s is not a valid RAS command' % rasAction) return if rasBlock not in validRasBlocks: - printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType), - None) + printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType)) printLog(None, 'Block %s is not a valid RAS block' % rasBlock) return if rasType not in validRasTypes: - printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType), - None) + printLog(None, 'Unable to perform RAS command %s on block %s for type %s' % (rasAction, rasBlock, rasType)) printLog(None, 'Memory error type %s is not a valid RAS memory type' % rasAction) return @@ -1277,7 +1273,6 @@ def setFanSpeed(deviceList, fan): for device in deviceList: if str(fan): fanLevel = c_int64() - sensor_ind = c_uint32(0) last_char = str(fan)[-1] if last_char == '%': fanLevel = int(str(fan)[:-1]) / 100 * 255 @@ -1675,7 +1670,6 @@ def showClocks(deviceList): printLogSpacer(' Supported clock frequencies ') for device in deviceList: for clk_type in sorted(rsmi_clk_names_dict): - freq_list = [] if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1: ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq)) if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True): @@ -1694,7 +1688,6 @@ def showClocks(deviceList): ret = rocmsmi.rsmi_dev_pci_bandwidth_get(device, byref(bw)) if rsmi_ret_ok(ret, device, 'get_PCIe_bandwidth', True): printLog(device, 'Supported %s frequencies on GPU%s' % ('PCIe', str(device)), None) - freq_list = [] for x in range(bw.transfer_rate.num_supported): fr = '{:>.1f}GT/s x{}'.format(bw.transfer_rate.frequency[x] / 1000000000, bw.lanes[x]) if x == bw.transfer_rate.current: @@ -1718,8 +1711,6 @@ def showCurrentClocks(deviceList, clk_defined=None, concise=False): global PRINT_JSON freq = rsmi_frequencies_t() bw = rsmi_pcie_bandwidth_t() - currentString = '' - sortedClocksArray = [] if not concise: printLogSpacer(' Current clock frequencies ') for device in deviceList: @@ -1832,7 +1823,6 @@ def showFwInfo(deviceList, fwType): firmware_blocks = fwType printLogSpacer(' Firmware Information ') for device in deviceList: - fw_ver_list = [] fw_ver = c_uint64() for fw_name in firmware_blocks: fw_name = fw_name.upper() @@ -2054,10 +2044,13 @@ def showMemVendor(deviceList): printLogSpacer(' Memory Vendor ') for device in deviceList: ret = rocmsmi.rsmi_dev_vram_vendor_get(device, vendor, 256) - if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode(): - printLog(device, 'GPU memory vendor', vendor.value.decode()) - else: - logging.debug('GPU memory vendor missing or not supported') + try: + if rsmi_ret_ok(ret, device, 'get_vram_vendor') and vendor.value.decode(): + printLog(device, 'GPU memory vendor', vendor.value.decode()) + else: + logging.debug('GPU memory vendor missing or not supported') + except UnicodeDecodeError: + printErrLog(device, 'Unable to read GPU memory vendor') printLogSpacer() @@ -2070,6 +2063,8 @@ def showOverDrive(deviceList, odtype): rsmi_od = c_uint32() printLogSpacer(' OverDrive Level ') for device in deviceList: + odStr = '' + od = '' if odtype == 'sclk': odStr = 'GPU' ret = rocmsmi.rsmi_dev_overdrive_level_get(device, byref(rsmi_od)) @@ -2085,7 +2080,6 @@ def showOverDrive(deviceList, odtype): else: printErrLog(device, 'Unable to retrieve OverDrive') logging.error('Unsupported clock type %s', odtype) - RETCODE = 1 printLog(device, odStr + ' OverDrive value (%)', od) printLogSpacer() @@ -2380,7 +2374,7 @@ def showRasInfo(deviceList, rasType): for name in rasType: if name.upper() not in rsmi_gpu_block_d: rasType.remove(name) - printErrLog(device, '%s is not a RAS block' % (name)) + printErrLog(None, '%s is not a RAS block' % (name)) rasBlocks = [block.upper() for block in rasType] @@ -2522,15 +2516,15 @@ def showEvents(deviceList, eventTypes): printErrLog(None, 'Ignoring unrecognized event type %s' % (event.replace(',', ''))) if len(eventTypeList) == 0: eventTypeList = notification_type_names - try: print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']]) # Create a seperate thread for each GPU for device in deviceList: - _thread.start_new_thread(printEventList, (device, 1000, eventTypeList)) - time.sleep(0.25) - except Exception as e: - printErrLog(device, 'Unable to start new thread. %s' % (e)) - return + try: + _thread.start_new_thread(printEventList, (device, 1000, eventTypeList)) + time.sleep(0.25) + except Exception as e: + printErrLog(device, 'Unable to start new thread. %s' % (e)) + return while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' getch = _Getch() user_input = getch() @@ -2555,16 +2549,19 @@ def printTempGraph(deviceList, delay): printEmptyLine() originalTerminalWidth = os.get_terminal_size()[0] while 1: # Exit condition from user keyboard input of 'q' or 'ctrl + c' - printString = '' + terminalWidth = os.get_terminal_size()[0] + printStrings = list() for device in deviceList: temp = getTemp(device, 'edge') - percentage = temp + if temp == 'N/A': + percentage = 0 + else: + percentage = temp if percentage >= 100: percentage = 100 if percentage < 0: percentage = 0 # Get available space based on terminal width - terminalWidth = os.get_terminal_size()[0] availableSpace = 0 if terminalWidth >= 20: availableSpace = terminalWidth - 20 @@ -2587,13 +2584,17 @@ def printTempGraph(deviceList, delay): return # Two spare Spaces tempString = (str(int(temp)) + '°C').ljust(5) - printString += '\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s\r\n' % (device, tempString, color, paddingSpace[1:], remainderSpace) + printStrings.append('\033[2;30;47mGPU[%d] Temp %s|%s%s\x1b[0m%s' % (device, tempString, color, paddingSpace[1:], remainderSpace)) originalTerminalWidth = terminalWidth time.sleep((delay / 1000)) + if terminalWidth >= 20: - for i in range(devices): - printString = '\033[A' + printString - print(printString, end = '\r') + # go up and prepare to rewrite the lines + for i in printStrings: + print('\033[A', end='\r') + # print all strings + for i in printStrings: + print(i, end='\r\n') def getGraphColor(percentage): @@ -2966,7 +2967,7 @@ def showNodesBw(deviceList): else: gpu_links_type[srcdevice][destdevice] = "N/A" if PRINT_JSON: - formatMatrixToJSON(deviceList, "{}-{}".format(minBW.value, maxBW.value), " min-max bandwidth between DRM devices {} and {}".format(srcdevice, destdevice)) + # TODO return printTableRow(None, ' ') for row in deviceList: @@ -3219,7 +3220,10 @@ def rsmi_ret_ok(my_ret, device=None, metric=None, silent=False): returnString += '%s GPU[%s]:' % (my_ret, device) if metric is not None: returnString += ' %s: ' % (metric) - returnString += '%s\t' % (err_str.value.decode()) + else: + metric = '' + if err_str.value is not None: + returnString += '%s\t' % (err_str.value.decode()) if not PRINT_JSON: logging.debug('%s', returnString) if not silent: @@ -3279,7 +3283,7 @@ def save(deviceList, savefilepath): 'fan': fanSpeeds[device], 'overdrivesclk': overDriveGpu[device], 'overdrivemclk': overDriveGpuMem[device], 'profile': profiles[device], 'perflevel': perfLevels[device]} - printLog(device, 'Current settings successfully saved to', savefilepath) + printLog(None, 'Current settings successfully saved to', savefilepath) with open(savefilepath, 'w') as savefile: json.dump(jsonData, savefile, ensure_ascii=True) printLogSpacer() diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 0eea621c88..6b5fdfedb3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -2237,16 +2237,79 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) { rsmi_status_t rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { + rsmi_status_t ret; TRY std::ostringstream ss; ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); - CHK_SUPPORT_NAME_ONLY(b) + GET_DEV_AND_KFDNODE_FROM_INDX + CHK_API_SUPPORT_ONLY((b), RSMI_DEFAULT_VARIANT, RSMI_DEFAULT_VARIANT) DEVICE_MUTEX - - return get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind, + ret = get_frequencies(amd::smi::kDevPCIEClk, RSMI_CLK_TYPE_PCIE, dv_ind, &b->transfer_rate, b->lanes); + if (ret == RSMI_STATUS_SUCCESS) { + return ret; + } + + // Only fallback to gpu_metric if connecting via PCIe + if (kfd_node->numa_node_type() != amd::smi::IOLINK_TYPE_PCIEXPRESS) { + return ret; + } + + rsmi_gpu_metrics_t gpu_metrics; + ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); + if (ret != RSMI_STATUS_SUCCESS) { + return ret; + } + + // Hardcode based on PCIe specification: https://en.wikipedia.org/wiki/PCI_Express + const uint32_t link_width[] = {1, 2, 4, 8, 12, 16}; + const uint32_t link_speed[] = {25, 50, 80, 160}; // 0.1 Ghz + const uint32_t WIDTH_DATA_LENGTH = sizeof(link_width)/sizeof(uint32_t); + const uint32_t SPEED_DATA_LENGTH = sizeof(link_speed)/sizeof(uint32_t); + + // Calculate the index + int width_index = -1; + int speed_index = -1; + uint32_t cur_index = 0; + for (cur_index = 0; cur_index < WIDTH_DATA_LENGTH; cur_index++) { + if (link_width[cur_index] == gpu_metrics.pcie_link_width) { + width_index = cur_index; + break; + } + } + for (cur_index = 0; + cur_index < SPEED_DATA_LENGTH; cur_index++) { + if (link_speed[cur_index] == gpu_metrics.pcie_link_speed) { + speed_index = cur_index; + break; + } + } + if (width_index == -1 || speed_index == -1) { + return RSMI_STATUS_NOT_SUPPORTED; + } + // Set possible lanes and frequencies + b->transfer_rate.num_supported = WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; + b->transfer_rate.current = speed_index*WIDTH_DATA_LENGTH + width_index; + for (cur_index = 0; + cur_index < WIDTH_DATA_LENGTH * SPEED_DATA_LENGTH; cur_index++) { + b->transfer_rate.frequency[cur_index] + = link_speed[cur_index/WIDTH_DATA_LENGTH] * 100 * 1000000L; + b->lanes[cur_index] = link_width[cur_index % WIDTH_DATA_LENGTH]; + } + /* + frequency = {2500, 2500, 2500, 2500, 2500, 2500, + 5000, 5000, 5000, 5000, 5000, 5000, + 8000, 8000, 8000, 8000, 8000, 8000, + 16000, 16000, 16000, 16000, 16000, 16000}; // Mhz + lanes = {1, 2, 4, 8, 12, 16, + 1, 2, 4, 8, 12, 16, + 1, 2, 4, 8, 12, 16, + 1, 2, 4, 8, 12, 16 }; // For each frequency + */ + + return RSMI_STATUS_SUCCESS; CATCH } diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 70425d78c3..2cbb936454 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -50,6 +50,7 @@ #include #include +#include #include #include #include