From c5d1e3f8c015f157819b8f8e37aa55080f714e4b Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 13 Jul 2023 08:28:53 -0500 Subject: [PATCH 01/15] rocm-smi --showevents shows wrong gpuID Use the gpuid returned from the event data instead. Change-Id: I7f286cc105f7ea12985223e603504f0ef3d9724e [ROCm/amdsmi commit: 0aeb6025bdc08444c0c297824e3d77e39ab69203] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 5c12624142..2fe68ab518 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -537,7 +537,7 @@ def printEventList(device, delay, eventList): data = rsmi_evt_notification_data_t(1) rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data)) if len(data.message) > 0: - print2DArray([['\rGPU[%d]:\t' % (device), ctime().split()[3], notification_type_names[data.event.value - 1], + print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1], data.message.decode('utf8') + '\r']]) def printLog(device, metricName, value=None, extraSpace=False): From fa34ddea561650f2de6dcc5fe4fe58ef351e9611 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Fri, 14 Jul 2023 02:22:45 -0500 Subject: [PATCH 02/15] Add .cache to gitignore Change-Id: Ida03bf1f50704bea44827d7578cd74c1896d4368 Signed-off-by: Galantsev, Dmitrii [ROCm/amdsmi commit: b0fe2fbd0762db27c263fdbae46b58dfbfbbb3e4] --- projects/amdsmi/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/amdsmi/.gitignore b/projects/amdsmi/.gitignore index 4abf0e47d9..55abee102c 100644 --- a/projects/amdsmi/.gitignore +++ b/projects/amdsmi/.gitignore @@ -12,3 +12,6 @@ python_smi_tools/rsmiBindings.py # Build directory build/ + +# CMake cache +.cache/ From 77d8364211f3aec2d89dac64b2f8dd3e8f9e48d3 Mon Sep 17 00:00:00 2001 
From: "Galantsev, Dmitrii" Date: Fri, 14 Jul 2023 02:22:50 -0500 Subject: [PATCH 03/15] Fix sys and id tests The following read tests were failing: *.TestIdInfoRead *.TestSysInfoRead 1. *.TestIdInfoRead failed because rsmi_dev_brand_get did not specify dependency on vbios_version. 2. *.TestSysInfoRead failed because the test didn't expect vbios_version to be missing. Which is a new behavior in Aqua Vanjaram. Change-Id: I9ee88a12fcf6cff2032049e2ecdfb2957efb03ab Signed-off-by: Galantsev, Dmitrii [ROCm/amdsmi commit: 8fe848d10e485af839fb43501027d2442994021f] --- projects/amdsmi/src/rocm_smi_device.cc | 3 ++- projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc | 2 +- .../amdsmi/tests/rocm_smi_test/functional/sys_info_read.cc | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 18c96b7f13..554b8c0eb8 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -379,7 +379,8 @@ static const std::map kDevFuncDependsMap = { {"rsmi_dev_name_get", {{kDevVendorIDFName, kDevDevIDFName}, {}}}, {"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}}, - {"rsmi_dev_brand_get", {{kDevVendorIDFName}, {}}}, + {"rsmi_dev_brand_get", {{kDevVendorIDFName, + kDevVBiosVerFName}, {}}}, {"rsmi_dev_vendor_name_get", {{kDevVendorIDFName}, {}}}, {"rsmi_dev_serial_number_get", {{kDevSerialNumberFName}, {}}}, {"rsmi_dev_subsystem_id_get", {{kDevSubSysDevIDFName}, {}}}, diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc index 5c7eea1256..11828feb85 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc @@ -162,7 +162,7 @@ void TestIdInfoRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**Device Vram Vendor name: " << buffer << std::endl; } - err = rsmi_dev_brand_get(i, 
nullptr, kBufferLen); + err = rsmi_dev_vram_vendor_get(i, nullptr, kBufferLen); ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); } err = rsmi_dev_vendor_id_get(i, &id); diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/sys_info_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/sys_info_read.cc index 14d1977676..dcff82aa84 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/sys_info_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/sys_info_read.cc @@ -105,7 +105,7 @@ void TestSysInfoRead::Run(void) { err = rsmi_dev_vbios_version_get(i, buffer, 80); if (err != RSMI_STATUS_SUCCESS) { - if (err == RSMI_STATUS_FILE_ERROR) { + if ((err == RSMI_STATUS_FILE_ERROR) || (err == RSMI_STATUS_NOT_SUPPORTED)) { IF_VERB(STANDARD) { std::cout << "\t**VBIOS read: Not supported on this machine" << std::endl; From bec2ebc89374c64d1f8d2bc50d54a4dfbe995046 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Mon, 17 Jul 2023 22:39:08 -0500 Subject: [PATCH 04/15] Add revision to --showhw Code changes related to the following: * Added 'rsmi_dev_revision_get()' related code * Test code * Functional tests Change-Id: I8c2097c65384a028c8c8437b717d05d52fe45250 Signed-off-by: Oliveira, Daniel [ROCm/amdsmi commit: 573620f586309fe070a65753bcb4ca057dbc8f49] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 15 ++++++++++++++ .../amdsmi/include/rocm_smi/rocm_smi_device.h | 1 + projects/amdsmi/python_smi_tools/rocm_smi.py | 20 +++++++++++++++++-- .../python_smi_tools/rsmiBindings.py.in | 16 +++++++++++---- .../rocm_smi/example/rocm_smi_example.cc | 3 +++ projects/amdsmi/src/rocm_smi.cc | 17 +++++++++++++++- projects/amdsmi/src/rocm_smi_device.cc | 6 +++++- projects/amdsmi/src/rocm_smi_main.cc | 1 + .../rocm_smi_test/functional/id_info_read.cc | 18 +++++++++++++++++ .../amdsmi/tests/rocm_smi_test/test_base.cc | 6 ++++++ .../amdsmi/tests/rocm_smi_test/test_common.cc | 1 + 11 files changed, 96 insertions(+), 8 deletions(-) diff --git 
a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index f0a531f154..6c0e1b9d60 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -1088,6 +1088,21 @@ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices); */ rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id); +/** + * @brief Get the device revision associated with the device + * + * @details Given a device index @p dv_ind and a pointer to a uint16_t + * @p revision, this function will write the revision id to that uint16_t + * + * @param[in] dv_ind a device index + * + * @param[inout] revision a pointer to uint16_t to which the device revision + * will be written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + * + */ +rsmi_status_t rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision); /** * @brief Get the SKU for a desired device associated with the device with diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index c975baae55..3dcf7e1345 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -100,6 +100,7 @@ enum DevInfoTypes { kDevOverDriveLevel, kDevMemOverDriveLevel, kDevDevID, + kDevDevRevID, kDevDevProdName, kDevDevProdNum, kDevVendorID, diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 2fe68ab518..f8755c1954 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -249,6 +249,17 @@ def getId(device): return hex(dv_id.value) +def getRev(device): + """ Return the hexadecimal value of a device's Revision + + @param device: DRM device identifier + """ + dv_rev = c_short() + ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev)) + if rsmi_ret_ok(ret, device, 'get_device_rev'): + return hex(dv_rev.value) + + def getMaxPower(device): """ Return the 
maximum power cap of a given device @@ -1601,19 +1612,23 @@ def showAllConciseHw(deviceList): print('ERROR: Cannot print JSON/CSV output for concise hardware output') sys.exit(1) printLogSpacer(' Concise Hardware Info ') - header = ['GPU', 'DID', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] + header = ['GPU', 'DID', 'DREV', 'GFX RAS', 'SDMA RAS', 'UMC RAS', 'VBIOS', 'BUS'] head_widths = [len(head) + 2 for head in header] values = {} for device in deviceList: gpuid = getId(device) if str(gpuid).startswith('0x'): gpuid = str(gpuid)[2:] + gpurev = getRev(device) + if str(gpurev).startswith('0x'): + gpurev = str(gpurev)[2:] + gfxRas = getRasEnablement(device, 'GFX') sdmaRas = getRasEnablement(device, 'SDMA') umcRas = getRasEnablement(device, 'UMC') vbios = getVbiosVersion(device) bus = getBus(device) - values['card%s' % (str(device))] = [device, gpuid, gfxRas, sdmaRas, umcRas, vbios, bus] + values['card%s' % (str(device))] = [device, gpuid, gpurev, gfxRas, sdmaRas, umcRas, vbios, bus] val_widths = {} for device in deviceList: val_widths[device] = [len(str(val)) + 2 for val in values['card%s' % (str(device))]] @@ -1952,6 +1967,7 @@ def showId(deviceList): printLogSpacer(' ID ') for device in deviceList: printLog(device, 'GPU ID', getId(device)) + printLog(device, 'GPU Rev', getRev(device)) printLogSpacer() diff --git a/projects/amdsmi/python_smi_tools/rsmiBindings.py.in b/projects/amdsmi/python_smi_tools/rsmiBindings.py.in index b6e7f2474d..9ffcac138d 100644 --- a/projects/amdsmi/python_smi_tools/rsmiBindings.py.in +++ b/projects/amdsmi/python_smi_tools/rsmiBindings.py.in @@ -11,8 +11,16 @@ import os # Use ROCm installation path if running from standard installation # With File Reorg rsmiBindings.py will be installed in /opt/rocm/libexec/rocm_smi. -# relative path changed accordingly -path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' +# relative path changed accordingly. 
+# if ROCM_SMI_LIB_PATH is set, we can load 'librocm_smi64.so' from that location +# +path_librocm = str() +rocm_smi_lib_path = os.getenv('ROCM_SMI_LIB_PATH') +if (rocm_smi_lib_path != None): + path_librocm = rocm_smi_lib_path +else: + path_librocm = os.path.dirname(os.path.realpath(__file__)) + '/../../@CMAKE_INSTALL_LIBDIR@/librocm_smi64.so.@VERSION_MAJOR@' + if not os.path.isfile(path_librocm): print('Unable to find %s . Trying /opt/rocm*' % path_librocm) for root, dirs, files in os.walk('/opt', followlinks=True): @@ -22,9 +30,10 @@ if not os.path.isfile(path_librocm): print('Using lib from %s' % path_librocm) else: print('Unable to find librocm_smi64.so.@VERSION_MAJOR@') +else: + print('Library loaded from: %s ' % path_librocm) # ----------> TODO: Support static libs as well as SO - try: cdll.LoadLibrary(path_librocm) rocmsmi = CDLL(path_librocm) @@ -36,7 +45,6 @@ except OSError: .format('\33[33m', '\033[0m')) exit() - # Device ID dv_id = c_uint64() # GPU ID diff --git a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc index bb456f7a0e..9e9019e2b8 100755 --- a/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc +++ b/projects/amdsmi/rocm_smi/example/rocm_smi_example.cc @@ -718,6 +718,9 @@ int main() { ret = rsmi_dev_id_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; + ret = rsmi_dev_revision_get(i, &val_ui16); + CHK_RSMI_RET_I(ret) + std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl; char current_compute_partition[256]; current_compute_partition[0] = '\0'; diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 4cd359ce3a..4851e8e398 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -632,7 +632,7 @@ rsmi_status_t rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_error_count_t *ec) { std::vector val_vec; - rsmi_status_t ret; + 
rsmi_status_t ret(RSMI_STATUS_NOT_SUPPORTED); std::ostringstream ss; TRY @@ -820,6 +820,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { return ret; } +rsmi_status_t +rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) { + std::ostringstream outss; + rsmi_status_t ret; + outss << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(outss); + CHK_SUPPORT_NAME_ONLY(revision) + + ret = get_id(dv_ind, amd::smi::kDevDevRevID, revision); + outss << __PRETTY_FUNCTION__ << " | ======= end =======" + << ", reporting " << amd::smi::getRSMIStatusString(ret); + LOG_TRACE(outss); + return ret; +} + rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *id) { TRY diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 554b8c0eb8..bdc5984ce4 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -85,6 +85,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level"; static const char *kDevDevProdNameFName = "product_name"; static const char *kDevDevProdNumFName = "product_number"; static const char *kDevDevIDFName = "device"; +static const char *kDevDevRevIDFName = "revision"; static const char *kDevVendorIDFName = "vendor"; static const char *kDevSubSysDevIDFName = "subsystem_device"; static const char *kDevSubSysVendorIDFName = "subsystem_vendor"; @@ -238,6 +239,7 @@ static const std::map kDevAttribNameMap = { {kDevDevProdName, kDevDevProdNameFName}, {kDevDevProdNum, kDevDevProdNumFName}, {kDevDevID, kDevDevIDFName}, + {kDevDevRevID, kDevDevRevIDFName}, {kDevVendorID, kDevVendorIDFName}, {kDevSubSysDevID, kDevSubSysDevIDFName}, {kDevSubSysVendorID, kDevSubSysVendorIDFName}, @@ -374,8 +376,8 @@ static const std::map kDevFuncDependsMap = { // Functions with only mandatory dependencies {"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}}, {"rsmi_dev_id_get", {{kDevDevIDFName}, {}}}, + {"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}}, 
{"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}}, - {"rsmi_dev_name_get", {{kDevVendorIDFName, kDevDevIDFName}, {}}}, {"rsmi_dev_sku_get", {{kDevDevProdNumFName}, {}}}, @@ -889,6 +891,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { switch (type) { case kDevDevID: + case kDevDevRevID: case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: @@ -1026,6 +1029,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevDevProdName: case kDevDevProdNum: case kDevDevID: + case kDevDevRevID: case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: diff --git a/projects/amdsmi/src/rocm_smi_main.cc b/projects/amdsmi/src/rocm_smi_main.cc index 3a5565dbe9..92ffe5af4f 100755 --- a/projects/amdsmi/src/rocm_smi_main.cc +++ b/projects/amdsmi/src/rocm_smi_main.cc @@ -84,6 +84,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"}, {amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"}, {amd::smi::kDevDevID, amdSMI + "kDevDevID"}, + {amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"}, {amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"}, {amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"}, {amd::smi::kDevVendorID, amdSMI + "kDevVendorID"}, diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc index 11828feb85..1988d951a1 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/id_info_read.cc @@ -121,6 +121,24 @@ void TestIdInfoRead::Run(void) { err = rsmi_dev_id_get(i, nullptr); ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); } + // Get device Revision + err = rsmi_dev_revision_get(i, &id); + if (err == RSMI_STATUS_NOT_SUPPORTED) { + rsmi_status_t ret; + // Verify api support checking functionality is working + ret = rsmi_dev_revision_get(i, nullptr); + ASSERT_EQ(ret, RSMI_STATUS_NOT_SUPPORTED); + } else 
{ + CHK_ERR_ASRT(err) + + IF_VERB(STANDARD) { + std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << id << std::endl; + } + // Verify api support checking functionality is working + err = rsmi_dev_revision_get(i, nullptr); + ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS); + } + err = rsmi_dev_name_get(i, buffer, kBufferLen); if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout << "\t**Device Marketing name not found on this system." << diff --git a/projects/amdsmi/tests/rocm_smi_test/test_base.cc b/projects/amdsmi/tests/rocm_smi_test/test_base.cc index 6984736e75..a406868c63 100755 --- a/projects/amdsmi/tests/rocm_smi_test/test_base.cc +++ b/projects/amdsmi/tests/rocm_smi_test/test_base.cc @@ -132,6 +132,12 @@ void TestBase::PrintDeviceHeader(uint32_t dv_ind) { IF_VERB(STANDARD) { std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl; } + err = rsmi_dev_revision_get(dv_ind, &val_ui16); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl; + } + char name[128]; err = rsmi_dev_name_get(dv_ind, name, 128); CHK_ERR_ASRT(err) diff --git a/projects/amdsmi/tests/rocm_smi_test/test_common.cc b/projects/amdsmi/tests/rocm_smi_test/test_common.cc index eabc6125b5..d7a8a34d86 100755 --- a/projects/amdsmi/tests/rocm_smi_test/test_common.cc +++ b/projects/amdsmi/tests/rocm_smi_test/test_common.cc @@ -278,6 +278,7 @@ void DumpMonitorInfo(const TestBase *test) { }; print_val_str(amd::smi::kDevDevID, "Device ID: "); + print_val_str(amd::smi::kDevDevRevID, "Dev.Rev.ID: "); print_val_str(amd::smi::kDevPerfLevel, "Performance Level: "); print_val_str(amd::smi::kDevOverDriveLevel, "OverDrive Level: "); print_vector(amd::smi::kDevGPUMClk, From 43075e2886502c504f02aa4fac5dccd5492a5e81 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 20 Jul 2023 15:20:36 -0500 Subject: [PATCH 05/15] Update logging and README for other project usage Updates: * [rocm-smi] Logging now can update files on per-project-basis for 
install/remove * [rocm-smi] README now has latest build instructions, including test builds * [rocm-smi] Updated README to include revision dates Change-Id: Ifb19a6f32ccf6938f47225db53fef88021909264 Signed-off-by: Charis Poag [ROCm/amdsmi commit: 4613e8dec384c651620b309348190d09bf8d76f7] --- projects/amdsmi/.gitignore | 3 ++ projects/amdsmi/DEBIAN/postinst.in | 43 +++++++++++++------------ projects/amdsmi/DEBIAN/prerm.in | 14 ++++++--- projects/amdsmi/README.md | 15 +++++---- projects/amdsmi/RPM/post.in | 50 +++++++++++++++++++----------- projects/amdsmi/RPM/preun.in | 12 +++++-- 6 files changed, 85 insertions(+), 52 deletions(-) diff --git a/projects/amdsmi/.gitignore b/projects/amdsmi/.gitignore index 55abee102c..91cbeef563 100644 --- a/projects/amdsmi/.gitignore +++ b/projects/amdsmi/.gitignore @@ -15,3 +15,6 @@ build/ # CMake cache .cache/ + +# Simulated SYSFS - for early development or debug +device/ \ No newline at end of file diff --git a/projects/amdsmi/DEBIAN/postinst.in b/projects/amdsmi/DEBIAN/postinst.in index eaccaa938b..a62d7e9eea 100755 --- a/projects/amdsmi/DEBIAN/postinst.in +++ b/projects/amdsmi/DEBIAN/postinst.in @@ -1,32 +1,37 @@ #!/bin/bash - #set -x +packageName="rocm-smi-lib" +logPath=/var/log/rocm_smi_lib +logName=ROCm-SMI-lib.log +logFile="${logPath}/${logName}" +logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + do_addLogFolder() { - sudo mkdir -p /var/log/rocm_smi_lib - sudo touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log - sudo chmod -R a+rw /var/log/rocm_smi_lib - sudo chmod a+rw /var/log/rocm_smi_lib/ROCm-SMI-lib.log + sudo mkdir -p "${logPath}" + sudo touch "${logFile}" + sudo chmod -R a+rw "${logPath}" + sudo chmod a+rw "${logFile}" } do_configureLogrotate() { logrotate --version &>/dev/null if [ $? -ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ - "ROCm-smi logs (when turned on) will not rotate properly." + "$packageName logs (when turned on) will not rotate properly." return fi - if [ ! 
-f /etc/logrotate.d/rocm_smi.conf ]; then - sudo touch /etc/logrotate.d/rocm_smi.conf - sudo chmod 644 /etc/logrotate.d/rocm_smi.conf # root r/w, all others read + if [ ! -f $logrotateConfFile ]; then + sudo touch "${logrotateConfFile}" + sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read # ROCm SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. ROCm-SMI-lib.log.2023-05-09_16:51:42 - cat <<'EOF' | sudo tee /etc/logrotate.d/rocm_smi.conf >/dev/null -/var/log/rocm_smi_lib/ROCm-SMI-lib.log { + cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null +${logFile} { su root root hourly missingok @@ -42,12 +47,12 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf + sudo sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g /etc/logrotate.d/rocm_smi.conf + sudo sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults @@ -60,16 +65,16 @@ EOF if [ -f /etc/cron.hourly/logrotate ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else - echo "[WARNING] Could find and configure hourly cron for ROCm-smi's"\ - " logrotate. ROCm-smi logs (when turned on) will not rotate properly." + echo "[WARNING] Could find and configure hourly cron for $packageName's"\ + " logrotate. $packageName logs (when turned on) will not rotate properly." 
return fi else # confirm that it's already been moved to hourly sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate."\ - "ROCm-smi logs (when turned on) may not rotate properly." + echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ + "$packageName logs (when turned on) may not rotate properly." fi fi else @@ -94,8 +99,8 @@ WantedBy=timers.target EOF sudo systemctl reenable --now logrotate.timer else - echo "[WARNING] Could not configure systemd timer for ROCm's logrotate."\ - "ROCm-smi logs (when turned on) will not rotate properly." + echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ + "$packageName logs (when turned on) will not rotate properly." fi fi } diff --git a/projects/amdsmi/DEBIAN/prerm.in b/projects/amdsmi/DEBIAN/prerm.in index 7c23e8e990..9b4efa9899 100755 --- a/projects/amdsmi/DEBIAN/prerm.in +++ b/projects/amdsmi/DEBIAN/prerm.in @@ -1,14 +1,19 @@ #!/bin/bash - set -e +packageName="rocm-smi-lib" +logPath=/var/log/rocm_smi_lib +logName=ROCm-SMI-lib.log +logFile="${logPath}/${logName}" +logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + rm_logFolder() { - sudo rm -rf /var/log/rocm_smi_lib + sudo rm -rf "$logPath" } return_logrotateToOrigConfig() { - if [ -f /etc/logrotate.d/rocm_smi.conf ]; then - sudo rm -rf /etc/logrotate.d/rocm_smi.conf + if [ -f $logrotateConfFile ]; then + sudo rm -rf "${logrotateConfFile}" fi if [ -f /etc/cron.hourly/logrotate ]; then sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate @@ -33,6 +38,7 @@ rm_pyc() { rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBEXECDIR@/rocm_smi/__pycache__ } + case "$1" in ( remove | upgrade) rm_ldconfig diff --git a/projects/amdsmi/README.md b/projects/amdsmi/README.md index c36f81f705..948ef14988 100755 --- a/projects/amdsmi/README.md +++ b/projects/amdsmi/README.md @@ -8,7 +8,7 @@ The ROCm System 
Management Interface Library, or ROCm SMI library, is part of th The information contained herein is for informational purposes only, and is subject to change without notice. In addition, any stated support is planned and is also subject to change. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. -© 2022 Advanced Micro Devices, Inc. All Rights Reserved. +© 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. # Building ROCm SMI @@ -28,8 +28,8 @@ After the ROCm SMI library git repository has been cloned to a local Linux machi ```shell mkdir -p build cd build -cmake -make +cmake .. +make -j $(nproc) # Install library file and header; default location is /opt/rocm $ make install ``` @@ -57,11 +57,10 @@ In order to verify the build and capability of ROCm SMI on your system and to se ```shell # Set environment variables used in CMakeLists.txt file -ROCM_DIR= -mkdir -cd -cmake -DROCM_DIR=$ROCM_DIR /tests/rocm_smi_test -make +mkdir build +cd build +cmake -DBUILD_TESTS=ON .. +make -j $(nproc) ``` To run the test, execute the program `rsmitst` that is built from the steps above. 
diff --git a/projects/amdsmi/RPM/post.in b/projects/amdsmi/RPM/post.in index 29787d660f..379dcd7152 100755 --- a/projects/amdsmi/RPM/post.in +++ b/projects/amdsmi/RPM/post.in @@ -1,31 +1,37 @@ #!/bin/bash #set -x +packageName="rocm-smi-lib" +logPath=/var/log/rocm_smi_lib +logName=ROCm-SMI-lib.log +logFile="${logPath}/${logName}" +logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + do_addLogFolder() { - sudo mkdir -p /var/log/rocm_smi_lib - sudo touch /var/log/rocm_smi_lib/ROCm-SMI-lib.log - sudo chmod -R a+rw /var/log/rocm_smi_lib - sudo chmod a+rw /var/log/rocm_smi_lib/ROCm-SMI-lib.log + sudo mkdir -p "${logPath}" + sudo touch "${logFile}" + sudo chmod -R a+rw "${logPath}" + sudo chmod a+rw "${logFile}" } do_configureLogrotate() { logrotate --version &>/dev/null if [ $? -ne 0 ]; then echo "[WARNING] Detected logrotate is not installed."\ - "ROCm-smi logs (when turned on) will not rotate properly." + "$packageName logs (when turned on) will not rotate properly." return fi - if [ ! -f /etc/logrotate.d/rocm_smi.conf ]; then - sudo touch /etc/logrotate.d/rocm_smi.conf - sudo chmod 644 /etc/logrotate.d/rocm_smi.conf # root r/w, all others read + if [ ! -f $logrotateConfFile ]; then + sudo touch "${logrotateConfFile}" + sudo chmod 644 "${logrotateConfFile}" # root r/w, all others read # ROCm SMI logging rotation, rotates files using root user/group # Hourly logrotation check # Only rotates if size grew larger than 1MB # Max of 4 rotation files, oldest will be removed # Rotated files use date extention of ex. 
ROCm-SMI-lib.log.2023-05-09_16:51:42 - cat <<'EOF' | sudo tee /etc/logrotate.d/rocm_smi.conf >/dev/null -/var/log/rocm_smi_lib/ROCm-SMI-lib.log { + cat << EOF | sudo tee "${logrotateConfFile}" >/dev/null +${logFile} { su root root hourly missingok @@ -41,12 +47,12 @@ EOF # issue was RPM build thought we were using macros # https://gitlab.kitware.com/cmake/cmake/-/issues/22965 # https://rpm-software-management.github.io/rpm/manual/spec.html - sudo sed -i s/%%/%/g /etc/logrotate.d/rocm_smi.conf + sudo sed -i s/%%/%/g "${logrotateConfFile}" # workaround: remove extra 'OURCE' text # from rocm_smi.conf. Unsure if CMAKE, # bash, or here document # issue (only seen on RHEL 8.7) - sudo sed -i s/OURCE//g /etc/logrotate.d/rocm_smi.conf + sudo sed -i s/OURCE//g "${logrotateConfFile}" fi # check if logrotate uses system timers, Ubuntu/modern OS's do # Several older OS's like RHEL 8.7, do not. Instead defaults @@ -59,16 +65,16 @@ EOF if [ -f /etc/cron.hourly/logrotate ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else - echo "[WARNING] Could find and configure hourly cron for ROCm-smi's"\ - " logrotate. ROCm-smi logs (when turned on) will not rotate properly." + echo "[WARNING] Could find and configure hourly cron for $packageName's"\ + " logrotate. $packageName logs (when turned on) will not rotate properly." return fi else # confirm that it's already been moved to hourly sudo find /etc/cron.* -iname logrotate -print -quit |grep -iq hourly if [ $? -ne 0 ]; then - echo "[WARNING] Could not configure an hourly cron for ROCm-smi's logrotate."\ - "ROCm-smi logs (when turned on) may not rotate properly." + echo "[WARNING] Could not configure an hourly cron for $packageName's logrotate."\ + "$packageName logs (when turned on) may not rotate properly." 
fi fi else @@ -93,12 +99,20 @@ WantedBy=timers.target EOF sudo systemctl reenable --now logrotate.timer else - echo "[WARNING] Could not configure systemd timer for ROCm's logrotate."\ - "ROCm-smi logs (when turned on) will not rotate properly." + echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ + "$packageName logs (when turned on) will not rotate properly." fi fi } +do_ldconfig() { + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build + if [ "@ENABLE_LDCONFIG@" == "ON" ]; then + echo @CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@ > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf + ldconfig + fi +} + # left-hand term originates from ENABLE_LDCONFIG = ON/OFF at package build if [ "@ENABLE_LDCONFIG@" == "ON" ]; then echo -e "@CPACK_PACKAGING_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@" > /etc/ld.so.conf.d/x86_64-librocm_smi_lib.conf diff --git a/projects/amdsmi/RPM/preun.in b/projects/amdsmi/RPM/preun.in index 612504c3f2..b9c2eda6b3 100755 --- a/projects/amdsmi/RPM/preun.in +++ b/projects/amdsmi/RPM/preun.in @@ -1,13 +1,19 @@ #!/bin/bash #set -x +packageName="rocm-smi-lib" +logPath=/var/log/rocm_smi_lib +logName=ROCm-SMI-lib.log +logFile="${logPath}/${logName}" +logrotateConfFile=/etc/logrotate.d/rocm_smi.conf + rm_logFolder() { - sudo rm -rf /var/log/rocm_smi_lib + sudo rm -rf "$logPath" } return_logrotateToOrigConfig() { - if [ -f /etc/logrotate.d/rocm_smi.conf ]; then - sudo rm -rf /etc/logrotate.d/rocm_smi.conf + if [ -f $logrotateConfFile ]; then + sudo rm -rf "${logrotateConfFile}" fi if [ -f /etc/cron.hourly/logrotate ]; then sudo mv /etc/cron.hourly/logrotate /etc/cron.daily/logrotate From 8c2266573f3340eca41823627a7ff7afca7f0069 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 14 Jun 2023 10:52:46 -0500 Subject: [PATCH 06/15] SWDEV-394316 - Handle not applicable vbios Change-Id: I3390078a63c9a5eff67024b84a3be1369c4b1460 Signed-off-by: Maisam Arif [ROCm/amdsmi commit: 
c78ec4667142345835e691d82a74aa5c33ac2d72] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index f8755c1954..e374bd3e20 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -410,7 +410,9 @@ def getVbiosVersion(device): """ vbios = create_string_buffer(256) ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) - if rsmi_ret_ok(ret, device): + if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + return "Unsupported" + elif rsmi_ret_ok(ret, device): return vbios.value.decode() @@ -2288,8 +2290,12 @@ def showProductName(deviceList): # if rsmi_ret_ok(ret, device) and sku.value.decode(): # device_sku = sku.value.decode() # Retrieve the device SKU as a substring from VBIOS + device_sku = "" ret = rocmsmi.rsmi_dev_vbios_version_get(device, vbios, 256) - if rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode(): + if ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: + device_sku = "Unsupported" + printLog(device, 'Card SKU', '\t\t' + device_sku) + elif rsmi_ret_ok(ret, device, 'get_vbios_version') and vbios.value.decode(): # Device SKU is just the characters in between the two '-' in vbios_version if vbios.value.decode().count('-') == 2 and len(str(vbios.value.decode().split('-')[1])) > 1: device_sku = vbios.value.decode().split('-')[1] From 1f2d0cefb3ca015ff5d2ee73db40697210c4fc82 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 26 Jul 2023 15:28:18 -0500 Subject: [PATCH 07/15] Handle csv output when the command is not based on the device Fix the error only one csv line can be printed out when output is not based on device. 
Change-Id: Idacc5d98acc223e932fb3d46c888bfa04778b73c [ROCm/amdsmi commit: 80d650b95a1db443d76c7fb644861f99110ca996] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index e374bd3e20..95f7e2510f 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -112,19 +112,10 @@ def formatCsv(deviceList): if outputType == 'system': jsonobj = json.loads(jsondata) keylist = header - for record in jsonobj: - my_string += str(record) - for key in keylist: - if key == 'system': - tempstr = str(jsonobj[record]) - tempstr = tempstr[tempstr.find('\'')+1:] - tempstr = tempstr[:tempstr.find('\'')] - # Force output device type to 'system' - my_string += ',%s\nsystem,%s' % (tempstr, jsonobj[record][tempstr]) - my_string += '\n' - # Force output device type to 'system' - if my_string.startswith('system'): - my_string = 'device' + my_string[6:] + for record in jsonobj['system']: + my_string += "\"%s\", \"%s\"\n" % (record, jsonobj['system'][record]) + # add header + my_string = "name, value\n" + my_string return my_string headerkeys = [] # Separate device-specific information from system-level information From 0d0e10d677cf10c67493bb69d939bca18b206522 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 27 Jul 2023 15:18:28 -0500 Subject: [PATCH 08/15] Change reset power error message to logging Since the reset will continue if the reset power and current power is the same, error may confuse the user. 
Change-Id: I35b9ef17afd47b5af5bd2b8882a44f63991fe509 [ROCm/amdsmi commit: aeb6c61f54b7167cc4034360ac2cf29b3c07b49d] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 95f7e2510f..27302cc50a 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -1357,7 +1357,7 @@ def setPowerOverDrive(deviceList, value, autoRespond): RETCODE = 1 continue if new_power_cap.value == current_power_cap.value: - printErrLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000)) + printLog(device,'Max power was already at: {}W'.format(new_power_cap.value / 1000000)) if current_power_cap.value < default_power_cap.value: current_power_cap.value = default_power_cap.value From 14752945c0f2214cdecc4a1d65fa19e2adce6ced Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 31 Jul 2023 08:35:24 -0500 Subject: [PATCH 09/15] Crash when ecc count sysfile cannot be read Replace assert with error handling code. 
Change-Id: I6500ae4d38a8caea87828aa7d76373d20c8354c7 [ROCm/amdsmi commit: 0522439ac2083f8ee5bee66437548cec12cf4643] --- projects/amdsmi/src/rocm_smi.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 4851e8e398..f06ecfac0b 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -682,6 +682,7 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, DEVICE_MUTEX ret = GetDevValueVec(type, dv_ind, &val_vec); + if (val_vec.size() != 2 ) ret = RSMI_STATUS_FILE_ERROR; if (ret == RSMI_STATUS_FILE_ERROR) { ss << __PRETTY_FUNCTION__ << " | ======= end =======" @@ -698,8 +699,6 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, return ret; } - assert(val_vec.size() == 2); - std::string junk; std::istringstream fs1(val_vec[0]); From 65c425c77f8271f0076b146f3d8fb538501e6329 Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Tue, 1 Aug 2023 21:46:19 -0500 Subject: [PATCH 10/15] [lib] Enhance Logger: gpu_metrics + enable console out * Updates: - Env variable RSMI_LOGGING=0 or any other value -> all logging off - Env variable RSMI_LOGGING=1 -> logs only - Env variable RSMI_LOGGING=2 -> console only - Env variable RSMI_LOGGING=3 -> both logs + console - Metrics output includes hexdump of current file and decoded metrics (functions: logHexDump and log_gpu_metrics) - System info gathered, now includes the system's perceived endianness - little or big endian, helpful for viewing decoded hexdump or any binary translation - Added templates for printing unsigned hex (print_int_as_hex), unsigned integers (print_unsigned_int), and printing both unsigned hex and int with an optional header (print_unsigned_hex_and_int) - Fixed some build compile warnings/errors - ex.
doing strncpys for sku or board names this operation is expected and needed and for temp file writes if unsuccessful we now properly send RSMI_STATUS_FILE_ERROR - Fixed on RHEL 8.8/9.x logrotate does not properly initialize Change-Id: Ifa0f0218c9cafd0a8cd6aa8e7f94d61e9107200f Signed-off-by: Charis Poag [ROCm/amdsmi commit: 9c7eed7edcb773acf62a9316a8f0eb2a95d67d57] --- projects/amdsmi/DEBIAN/postinst.in | 5 +- projects/amdsmi/RPM/post.in | 5 +- .../amdsmi/include/rocm_smi/rocm_smi_logger.h | 1 + .../amdsmi/include/rocm_smi/rocm_smi_main.h | 1 + .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 49 +++- projects/amdsmi/oam/src/amd_oam.cc | 3 + projects/amdsmi/src/rocm_smi.cc | 4 +- projects/amdsmi/src/rocm_smi_device.cc | 7 +- projects/amdsmi/src/rocm_smi_gpu_metrics.cc | 218 ++++++++++++++++++ projects/amdsmi/src/rocm_smi_logger.cc | 45 +++- projects/amdsmi/src/rocm_smi_main.cc | 29 ++- projects/amdsmi/src/rocm_smi_utils.cc | 127 +++++++++- 12 files changed, 470 insertions(+), 24 deletions(-) diff --git a/projects/amdsmi/DEBIAN/postinst.in b/projects/amdsmi/DEBIAN/postinst.in index a62d7e9eea..ab2f640553 100755 --- a/projects/amdsmi/DEBIAN/postinst.in +++ b/projects/amdsmi/DEBIAN/postinst.in @@ -62,12 +62,11 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ " logrotate. $packageName logs (when turned on) will not rotate properly." - return fi else # confirm that it's already been moved to hourly @@ -77,6 +76,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." 
fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +102,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." fi + return #done configuring for systemd timers fi } diff --git a/projects/amdsmi/RPM/post.in b/projects/amdsmi/RPM/post.in index 379dcd7152..b04e31c9f9 100755 --- a/projects/amdsmi/RPM/post.in +++ b/projects/amdsmi/RPM/post.in @@ -62,12 +62,11 @@ EOF # confirm logrotate file exists in daily if [ -f /etc/cron.daily/logrotate ]; then # move logrotate daily to hourly - if [ -f /etc/cron.hourly/logrotate ]; then + if [ -d /etc/cron.hourly ]; then sudo mv /etc/cron.daily/logrotate /etc/cron.hourly/logrotate else echo "[WARNING] Could find and configure hourly cron for $packageName's"\ " logrotate. $packageName logs (when turned on) will not rotate properly." - return fi else # confirm that it's already been moved to hourly @@ -77,6 +76,7 @@ EOF "$packageName logs (when turned on) may not rotate properly." fi fi + return #done configuring for non-systemd timers else # Configure systemd timers - the typical setup for modern Linux logrotation setups if [ -f /lib/systemd/system/logrotate.timer ]; then @@ -102,6 +102,7 @@ EOF echo "[WARNING] Could not configure systemd timer for $packageName's logrotate."\ "$packageName logs (when turned on) will not rotate properly." 
fi + return #done configuring for systemd timers fi } diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_logger.h b/projects/amdsmi/include/rocm_smi/rocm_smi_logger.h index 3ff1070418..bd2608db58 100644 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_logger.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_logger.h @@ -100,6 +100,7 @@ typedef enum LOG_TYPE { NO_LOG = 1, CONSOLE = 2, FILE_LOG = 3, + BOTH_FILE_AND_CONSOLE = 4 } LogType; class Logger { diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_main.h b/projects/amdsmi/include/rocm_smi/rocm_smi_main.h index a64adddcc5..f276bd85bb 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_main.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_main.h @@ -115,6 +115,7 @@ class RocmSMI { const RocmSMI_env_vars& getEnv(void); void printEnvVarInfo(void); bool isLoggingOn(void); + uint32_t getLogSetting(void); static const std::map devInfoTypesStrings; private: diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index 087ca5faa9..a655b5b136 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -48,6 +48,9 @@ #include #include #include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -94,8 +97,52 @@ GetDevBinaryBlob(amd::smi::DevInfoTypes type, rsmi_status_t ErrnoToRsmiStatus(int err); std::string getRSMIStatusString(rsmi_status_t ret); std::tuple getSystemDetails(void); + std::string, std::string, std::string, std::string> + getSystemDetails(void); void logSystemDetails(void); +void logHexDump(const char *desc, const void *addr, const size_t len, + size_t perLine); +bool isSystemBigEndian(); +template +std::string print_int_as_hex(T i, bool showHexNotation=true) { + std::stringstream ss; + if (showHexNotation) { + ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; + } else { + ss << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; 
+ } + + if (std::is_same::value) { + ss << static_cast(i|0); + } else if (std::is_same::value) { + ss << static_cast(static_cast(i|0)); + } else if (std::is_signed::value) { + ss << static_cast(i | 0); + } else { + ss << static_cast(i | 0); + } + ss << std::dec; + return ss.str(); +}; + +template +std::string print_unsigned_int(T i) { + std::stringstream ss; + ss << static_cast(i | 0); + return ss.str(); +} + +template +std::string print_unsigned_hex_and_int(T i, std::string heading="") { + std::stringstream ss; + if (heading.empty() == false) { + ss << "\n" << heading << " = "; + } + ss << "Hex (MSB): " << print_int_as_hex(i) << ", " + << "Unsigned int: " << print_unsigned_int(i) << ", " + << "Byte Size: " << sizeof(T); + return ss.str(); +} struct pthread_wrap { public: diff --git a/projects/amdsmi/oam/src/amd_oam.cc b/projects/amdsmi/oam/src/amd_oam.cc index 8d63d94f40..62d4b28287 100755 --- a/projects/amdsmi/oam/src/amd_oam.cc +++ b/projects/amdsmi/oam/src/amd_oam.cc @@ -166,8 +166,11 @@ TRY rsmi_dev_name_get(dev_inx, dev->device_name, DEVICE_NAME_LEN); rsmi_dev_vbios_version_get(dev_inx, buf, buf_size); if (std::strlen(buf) > 0) { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" std::strncpy(dev->sku_name, &buf[4], 6); std::strncpy(dev->board_name, buf, 12); +#pragma GCC diagnostic pop } rsmi_dev_serial_number_get(dev_inx, dev->board_serial_number, BOARD_SERIAL_NUM_LEN); diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index f06ecfac0b..61128c03eb 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -673,8 +673,8 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, default: ss << __PRETTY_FUNCTION__ << " | ======= end =======" - << ", default case -> reporting RSMI_STATUS_NOT_SUPPORTED" - << amd::smi::getRSMIStatusString(ret); + << ", default case -> reporting " + << amd::smi::getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED); LOG_ERROR(ss); return 
RSMI_STATUS_NOT_SUPPORTED; } diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index bdc5984ce4..87077195ce 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -826,7 +826,12 @@ int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size, } ss << "Successfully read DevInfoBinary for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ") - SYSFS (" - << sysfs_path << "), returning binaryData = " << p_binary_data; + << sysfs_path << "), returning binaryData = " << p_binary_data + << "; byte_size = " << std::dec << static_cast(b_size); + + std::string metricDescription = "AMD SMI GPU METRICS (16-byte width), " + + sysfs_path; + logHexDump(metricDescription.c_str(), p_binary_data, b_size, 16); LOG_INFO(ss); return 0; } diff --git a/projects/amdsmi/src/rocm_smi_gpu_metrics.cc b/projects/amdsmi/src/rocm_smi_gpu_metrics.cc index 648b18a0e2..885c36d7f6 100755 --- a/projects/amdsmi/src/rocm_smi_gpu_metrics.cc +++ b/projects/amdsmi/src/rocm_smi_gpu_metrics.cc @@ -60,6 +60,10 @@ #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_logger.h" + +using namespace ROCmLogging; +using namespace amd::smi; #define TRY try { #define CATCH } catch (...) 
{return amd::smi::handleException();} @@ -139,6 +143,196 @@ typedef struct { } rsmi_gpu_metrics_v_1_3; + +// log current gpu_metrics file content read +// any metrics value can be a nullptr +void log_gpu_metrics(const metrics_table_header_t *gpu_metrics_table_header, + const rsmi_gpu_metrics_v_1_2 *rsmi_gpu_metrics_v_1_2, + const rsmi_gpu_metrics_v_1_3 *gpu_metrics_v_1_3, + const rsmi_gpu_metrics_t *rsmi_gpu_metrics) { + if (RocmSMI::getInstance().isLoggingOn() == false) { + return; + } + std::ostringstream ss; + if (gpu_metrics_table_header != nullptr) { + ss + /* Common Header */ + << print_unsigned_hex_and_int( + gpu_metrics_table_header->structure_size, + "gpu_metrics_table_header->structure_size") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->format_revision, + "gpu_metrics_table_header->format_revision") + << print_unsigned_hex_and_int( + gpu_metrics_table_header->content_revision, + "gpu_metrics_table_header->content_revision"); + LOG_DEBUG(ss); + } + if (rsmi_gpu_metrics == nullptr) { + return; + } else { + // do nothing - continue + } + ss + /* Common Header */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.structure_size, + "rsmi_gpu_metrics->common_header.structure_size") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.format_revision, + "rsmi_gpu_metrics->common_header.format_revision") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->common_header.content_revision, + "rsmi_gpu_metrics->common_header.content_revision") + /* Temperature */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_edge, + "rsmi_gpu_metrics->temperature_edge") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hotspot, + "rsmi_gpu_metrics->temperature_hotspot") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_mem, + "rsmi_gpu_metrics->temperature_mem") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrgfx, + "rsmi_gpu_metrics->temperature_vrgfx") + << 
print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrsoc, + "rsmi_gpu_metrics->temperature_vrsoc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_vrmem, + "rsmi_gpu_metrics->temperature_vrmem") + /* Utilization */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfx_activity, + "rsmi_gpu_metrics->average_gfx_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_umc_activity, + "rsmi_gpu_metrics->average_umc_activity") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_mm_activity, + "rsmi_gpu_metrics->average_mm_activity") + /* Power/Energy */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socket_power, + "rsmi_gpu_metrics->average_socket_power") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->energy_accumulator, + "rsmi_gpu_metrics->energy_accumulator") + /* Driver attached timestamp (in ns) */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->system_clock_counter, + "rsmi_gpu_metrics->system_clock_counter") + /* Average clocks */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_gfxclk_frequency, + "rsmi_gpu_metrics->average_gfxclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_socclk_frequency, + "rsmi_gpu_metrics->average_socclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_uclk_frequency, + "rsmi_gpu_metrics->average_uclk_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk0_frequency, + "rsmi_gpu_metrics->average_vclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk0_frequency, + "rsmi_gpu_metrics->average_dclk0_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_vclk1_frequency, + "rsmi_gpu_metrics->average_vclk1_frequency") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->average_dclk1_frequency, + "rsmi_gpu_metrics->average_dclk1_frequency") + /* Current clocks */ + << print_unsigned_hex_and_int( + 
rsmi_gpu_metrics->current_gfxclk, + "rsmi_gpu_metrics->current_gfxclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_socclk, + "rsmi_gpu_metrics->current_socclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_uclk, + "rsmi_gpu_metrics->current_uclk") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk0, + "rsmi_gpu_metrics->current_vclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk0, + "rsmi_gpu_metrics->current_dclk0") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_vclk1, + "rsmi_gpu_metrics->current_vclk1") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_dclk1, + "rsmi_gpu_metrics->current_dclk1") + /* Throttle status */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->throttle_status, + "rsmi_gpu_metrics->throttle_status") + /* Fans */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->current_fan_speed, + "rsmi_gpu_metrics->current_fan_speed") + /* Link width/speed */ + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_width, + "rsmi_gpu_metrics->pcie_link_width") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->pcie_link_speed, + "rsmi_gpu_metrics->pcie_link_speed") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->padding, + "rsmi_gpu_metrics->padding") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->gfx_activity_acc, + "rsmi_gpu_metrics->gfx_activity_acc") + << print_unsigned_hex_and_int( + rsmi_gpu_metrics->mem_actvity_acc, + "rsmi_gpu_metrics->mem_actvity_acc"); + for (int i=0; i < RSMI_NUM_HBM_INSTANCES; i++) { + ss << print_unsigned_hex_and_int( + rsmi_gpu_metrics->temperature_hbm[i], + "rsmi_gpu_metrics->temperature_hbm[" + std::to_string(i) + "]"); + } + + if (rsmi_gpu_metrics_v_1_2 != nullptr) { + /* PMFW attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + rsmi_gpu_metrics_v_1_2->firmware_timestamp, + "rsmi_gpu_metrics_v_1_2->firmware_timestamp"); + } + + if (gpu_metrics_v_1_3 != nullptr) { + /* PMFW 
attached timestamp (10ns resolution) */ + ss + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->firmware_timestamp, + "gpu_metrics_v_1_3->firmware_timestamp") + /* Voltage (mV) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_soc, + "gpu_metrics_v_1_3->voltage_soc") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_gfx, + "gpu_metrics_v_1_3->voltage_gfx") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->voltage_mem, + "gpu_metrics_v_1_3->voltage_mem") + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->padding1, + "gpu_metrics_v_1_3->padding1") + /* Throttle status (ASIC independent) */ + << print_unsigned_hex_and_int( + gpu_metrics_v_1_3->indep_throttle_status, + "gpu_metrics_v_1_3->indep_throttle_status"); + } + LOG_DEBUG(ss); +} + static rsmi_status_t GetGPUMetricsFormat1(uint32_t dv_ind, rsmi_gpu_metrics_t *data, uint8_t content_v) { assert(content_v != RSMI_GPU_METRICS_API_CONTENT_VER_1 && @@ -268,16 +462,28 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { rsmi_gpu_metrics_v_1_3 smu_v_1_3; rsmi_status_t ret; + std::ostringstream ss; if (!dev->gpu_metrics_ver().structure_size) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(struct metrics_table_header_t), &dev->gpu_metrics_ver()); + log_gpu_metrics(&dev->gpu_metrics_ver(), nullptr, nullptr, nullptr); if (ret != RSMI_STATUS_SUCCESS) { + ss << "Returning = " << getRSMIStatusString(ret) + << ",\ndev->gpu_metrics_ver().structure_size = " + << print_unsigned_int(dev->gpu_metrics_ver().structure_size) + << ", could not read common header"; + LOG_ERROR(ss); return ret; } } // only supports gpu_metrics_v1_x version if (dev->gpu_metrics_ver().format_revision != 1) { + ss << "Returning = " << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) + << ",\ndev->gpu_metrics_ver().format_revision = " + << print_unsigned_int(dev->gpu_metrics_ver().format_revision) + << " was not equal to 1"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } @@ 
-289,19 +495,31 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { RSMI_GPU_METRICS_API_CONTENT_VER_1) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_t), smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_2) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_2), &smu_v_1_2); map_gpu_metrics_1_2_to_rsmi_gpu_metrics_t(&smu_v_1_2, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_2"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, &smu_v_1_2, nullptr, smu); } else if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_3) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, sizeof(rsmi_gpu_metrics_v_1_3), &smu_v_1_3); map_gpu_metrics_1_3_to_rsmi_gpu_metrics_t(&smu_v_1_3, smu); + ss << __PRETTY_FUNCTION__ << " | RSMI_GPU_METRICS_API_CONTENT_VER_3"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, &smu_v_1_3, smu); } else { ret = GetGPUMetricsFormat1(dv_ind, smu, dev->gpu_metrics_ver().content_revision); + ss << __PRETTY_FUNCTION__ << " | GetGPUMetricsFormat1"; + LOG_DEBUG(ss); + log_gpu_metrics(nullptr, nullptr, nullptr, smu); } if (ret != RSMI_STATUS_SUCCESS) { diff --git a/projects/amdsmi/src/rocm_smi_logger.cc b/projects/amdsmi/src/rocm_smi_logger.cc index c900c613c1..0600654ef3 100644 --- a/projects/amdsmi/src/rocm_smi_logger.cc +++ b/projects/amdsmi/src/rocm_smi_logger.cc @@ -177,6 +177,9 @@ void Logger::error(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -208,6 +211,9 @@ void Logger::alarm(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { 
logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -239,6 +245,9 @@ void Logger::always(const char* text) throw() { logIntoFile(data); } else if (m_LogType == CONSOLE) { logOnConsole(data); + } else if (m_LogType == BOTH_FILE_AND_CONSOLE) { + logOnConsole(data); + logIntoFile(data); } } @@ -303,6 +312,10 @@ void Logger::info(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_INFO)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_INFO)) { + logOnConsole(data); + logIntoFile(data); } } @@ -333,6 +346,10 @@ void Logger::trace(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_TRACE)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_TRACE)) { + logOnConsole(data); + logIntoFile(data); } } @@ -363,6 +380,10 @@ void Logger::debug(const char* text) throw() { logIntoFile(data); } else if ((m_LogType == CONSOLE) && (m_LogLevel >= LOG_LEVEL_DEBUG)) { logOnConsole(data); + } else if ((m_LogType == BOTH_FILE_AND_CONSOLE) + && (m_LogLevel >= LOG_LEVEL_DEBUG)) { + logOnConsole(data); + logIntoFile(data); } } @@ -424,6 +445,9 @@ std::string Logger::getLogSettings() { case CONSOLE: logSettings += "LogType = CONSOLE"; break; + case BOTH_FILE_AND_CONSOLE: + logSettings += "LogType = BOTH_FILE_AND_CONSOLE"; + break; default: logSettings += "LogType = "; } @@ -471,7 +495,26 @@ void Logger::initialize_resources() { } m_File.open(logFileName.c_str(), std::ios::out | std::ios::app); m_LogLevel = LOG_LEVEL_TRACE; - m_LogType = FILE_LOG; + // RSMI_LOGGING = 1, output to logs only + // RSMI_LOGGING = 2, output to console only + // RSMI_LOGGING = 3, output to logs and console + switch (amd::smi::RocmSMI::getInstance().getLogSetting()) { + case 0: + m_LogType = NO_LOG; + break; + case 1: + m_LogType = 
FILE_LOG; + break; + case 2: + m_LogType = CONSOLE; + break; + case 3: + m_LogType = BOTH_FILE_AND_CONSOLE; + break; + default: + m_LogType = NO_LOG; + break; + } if (!m_File.is_open()) { std::cout << "WARNING: Issue opening log file (" << logFileName << ") to write." << std::endl; diff --git a/projects/amdsmi/src/rocm_smi_main.cc b/projects/amdsmi/src/rocm_smi_main.cc index 92ffe5af4f..0ba6d7c50e 100755 --- a/projects/amdsmi/src/rocm_smi_main.cc +++ b/projects/amdsmi/src/rocm_smi_main.cc @@ -458,17 +458,21 @@ static uint32_t GetEnvVarUInteger(const char *ev_str) { // provides a way to get env variable detail in both debug & release // helps enable full logging -static bool getRSMIEnvVar_LoggingEnabled(const char *ev_str) { - bool isLoggingEnabled = false; +// RSMI_LOGGING = 1, output to logs only +// RSMI_LOGGING = 2, output to console only +// RSMI_LOGGING = 3, output to logs and console +static uint32_t getRSMIEnvVar_LoggingEnabled(const char *ev_str) { + uint32_t ret = 0; ev_str = getenv(ev_str); - if (ev_str != nullptr) { - isLoggingEnabled = true; + int ev_ret = atoi(ev_str); + ret = static_cast(ev_ret); } - return isLoggingEnabled; + return ret; } -static std::unordered_set GetEnvVarUIntegerSets(const char *ev_str) { +static inline std::unordered_set GetEnvVarUIntegerSets( + const char *ev_str) { std::unordered_set returnSet; #ifndef DEBUG (void)ev_str; @@ -519,7 +523,16 @@ const RocmSMI_env_vars& RocmSMI::getEnv(void) { } bool RocmSMI::isLoggingOn(void) { + bool isLoggingOn = false; GetEnvVariables(); + if (this->env_vars_.logging_on > 0 + && this->env_vars_.logging_on <= 3) { + isLoggingOn = true; + } + return isLoggingOn; +} + +uint32_t RocmSMI::getLogSetting() { return this->env_vars_.logging_on; } @@ -544,7 +557,9 @@ void RocmSMI::printEnvVarInfo(void) { << ((env_vars_.debug_inf_loop == 0) ? "" : std::to_string(env_vars_.debug_inf_loop)) << std::endl; - bool isLoggingOn = (env_vars_.logging_on) ? 
true : false; + std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " + << getLogSetting() << std::endl; + bool isLoggingOn = RocmSMI::isLoggingOn() ? true : false; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.logging_on = " << (isLoggingOn ? "true" : "false") << std::endl; std::cout << __PRETTY_FUNCTION__ << " | env_vars_.enum_overrides = {"; diff --git a/projects/amdsmi/src/rocm_smi_utils.cc b/projects/amdsmi/src/rocm_smi_utils.cc index 2cbb936454..3c997ccf9d 100755 --- a/projects/amdsmi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/src/rocm_smi_utils.cc @@ -57,6 +57,8 @@ #include #include #include +#include +#include #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_utils.h" @@ -103,7 +105,7 @@ bool FileExists(char const *filename) { return (stat(filename, &buf) == 0); } -static void debugFilesDiscovered(std::vector files) { +static inline void debugFilesDiscovered(std::vector files) { std::ostringstream ss; int numberOfFilesFound = static_cast(files.size()); ss << "fileName.size() = " << numberOfFilesFound @@ -435,9 +437,13 @@ rsmi_status_t storeTmpFile(uint32_t dv_ind, std::string parameterName, } chmod(fileName, S_IRUSR|S_IRGRP|S_IROTH); - write(fd, storageData.c_str(), storageData.size()); + ssize_t rc_write = write(fd, storageData.c_str(), storageData.size()); close(fd); - return RSMI_STATUS_SUCCESS; + if (rc_write == -1) { + return RSMI_STATUS_FILE_ERROR; + } else { + return RSMI_STATUS_SUCCESS; + } } std::vector getListOfAppTmpFiles() { @@ -573,14 +579,20 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // string domainName = domain name of the the system's node on the network // string os_distribution = pretty name of os distribution // (typically found in /etc/*-release file) +// string endianness = system's endianness. +// Expressed as big endian or little endian. 
+// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) +// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) std::tuple getSystemDetails(void) { + std::string, std::string, std::string, std::string> + getSystemDetails(void) { struct utsname buf; bool errorDetected = false; std::string temp_data; std::string sysname, nodename, release, version, machine; std::string domainName = ""; std::string os_distribution = ""; + std::string endianness = ""; if (uname(&buf) < 0) { errorDetected = true; @@ -608,8 +620,16 @@ std::tuple 64) bytesPerLine = 16; + + size_t i; + unsigned char buff[bytesPerLine + 1]; + const unsigned char *pc // ptr to data (char, 1 byte sized data) + = (const unsigned char *) addr; + + // Output description if given. + // if (desc != NULL) printf("%s:\n", desc); + if (desc != NULL) ss << "\n" << desc << "\n"; + + // Length checks. + if (len == 0) { + // printf(" ZERO LENGTH\n"); + ss << " ZERO LENGTH\n"; + LOG_ERROR(ss); + return; + } + std::string endianness = ""; + if (isSystemBigEndian()) { + endianness = "** System is Big Endian, multi-bit symbols encoded as" + " big endian (MSB first) **"; + } else { + endianness = "** System is Little Endian, multi-bit symbols encoded as" + " little endian (LSB first) **"; + } + ss << "\t" << endianness << "\n"; + + // Process every byte in the data. + for (i = 0; i < len; i++) { + // Multiple of bytesPerLine means new or first line (with line offset). + if ((i % bytesPerLine) == 0) { + // Only print previous-line ASCII buffer for lines beyond first. + // if (i != 0) printf(" %s\n", buff); + if (i != 0) ss << " " << buff << "\n"; + // Output the offset of current line. + // printf(" %08lx ", i); + ss << " " << std::setw(8) << std::setfill('0') << std::hex << i << " "; + } + + // Now the hex code for the specific character. 
+ // printf(" %02x", pc[i]); + + ss << " " << std::setw(2) << std::setfill('0') << std::hex + << static_cast(pc[i]); + + // And buffer a printable ASCII character for later. + // x20 = 32 || x7e = 126 (ascii table range) + if ((pc[i] < 0x20) || (pc[i] > 0x7e)) { // isprint() may be better. + buff[i % bytesPerLine] = '.'; + } else { + buff[i % bytesPerLine] = pc[i]; + } + buff[(i % bytesPerLine) + 1] = '\0'; + } + + // Pad out last line if not exactly bytesPerLine characters. + while ((i % bytesPerLine) != 0) { + // printf(" "); + ss << " "; + i++; + } + + // And print the final ASCII buffer. + // printf(" %s\n", buff); + ss << " " << buff << "\n"; + LOG_DEBUG(ss); +} + +bool isSystemBigEndian() { + int n = 1; + bool isBigEndian = true; + if (*(char *)&n == 1) { + isBigEndian = false; + } + return isBigEndian; +} + } // namespace smi } // namespace amd From 46cdabed5e080e4d8014974e13e134c14d2dc217 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Mon, 7 Aug 2023 09:19:57 -0700 Subject: [PATCH 11/15] SWDEV-366827 - Disable file reorg backward compatibility support by default Change-Id: I1de06d0d6a30c8c862d768b58460ef1b49d15e29 [ROCm/amdsmi commit: 9406cdd83264d76c26cdc69f028f8deebce9038e] --- projects/amdsmi/CMakeLists.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/projects/amdsmi/CMakeLists.txt b/projects/amdsmi/CMakeLists.txt index 4199ef556f..87117c9fa6 100755 --- a/projects/amdsmi/CMakeLists.txt +++ b/projects/amdsmi/CMakeLists.txt @@ -186,10 +186,7 @@ set (CPACK_RPM_TESTS_PACKAGE_REQUIRES "python3, rocm-core") add_subdirectory("rocm_smi") add_subdirectory("oam") -# Disable file reorg backward compatibility for ASAN packaging -if(NOT ENABLE_ASAN_PACKAGING) - option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" ON) -endif() +option(FILE_REORG_BACKWARD_COMPATIBILITY "Enable File Reorg with backward compatibility" OFF) # Add tests if(BUILD_TESTS) From 8b4e4ac01eb46a063a9b2873e54815be503731ff Mon 
Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 11 Jul 2023 06:58:00 -0500 Subject: [PATCH 12/15] Fix rsmitstReadWrite.TestPowerReadWrite test failure Code changes related to the following: * All reinforcement work moved to their own files * Self contained changes only to support them * New files added to CMakeLists.txt Change-Id: I761e91f54392824df9145eaed8b9805986861285 Signed-off-by: Oliveira, Daniel [ROCm/amdsmi commit: cc5ab079dfcdf0106f0a9482ae1e88c08d900957] --- projects/amdsmi/CMakeLists.txt | 2 + .../amdsmi/include/rocm_smi/rocm_smi_device.h | 9 +- .../include/rocm_smi/rocm_smi_properties.h | 160 +++++ .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 1 + projects/amdsmi/src/rocm_smi.cc | 2 + projects/amdsmi/src/rocm_smi_device.cc | 2 + projects/amdsmi/src/rocm_smi_properties.cc | 560 ++++++++++++++++++ projects/amdsmi/src/rocm_smi_utils.cc | 23 + 8 files changed, 758 insertions(+), 1 deletion(-) create mode 100644 projects/amdsmi/include/rocm_smi/rocm_smi_properties.h create mode 100644 projects/amdsmi/src/rocm_smi_properties.cc diff --git a/projects/amdsmi/CMakeLists.txt b/projects/amdsmi/CMakeLists.txt index 87117c9fa6..dd4c5d53f2 100755 --- a/projects/amdsmi/CMakeLists.txt +++ b/projects/amdsmi/CMakeLists.txt @@ -133,6 +133,7 @@ set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_io_link.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_gpu_metrics.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_logger.cc") +set(CMN_SRC_LIST ${CMN_SRC_LIST} "${COMMON_SRC_DIR}/rocm_smi_properties.cc") set(CMN_SRC_LIST ${CMN_SRC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.cc") set(CMN_INC_LIST "${COMMON_INC_DIR}/rocm_smi_device.h") @@ -147,6 +148,7 @@ set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_kfd.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_io_link.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi.h") 
set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_logger.h") +set(CMN_INC_LIST ${CMN_INC_LIST} "${COMMON_INC_DIR}/rocm_smi_properties.h") set(CMN_INC_LIST ${CMN_INC_LIST} "${SHR_MUTEX_DIR}/shared_mutex.h") ## set components diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 3dcf7e1345..a1b2809457 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -52,12 +52,14 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_counters.h" +#include "rocm_smi/rocm_smi_properties.h" #include "shared_mutex.h" //NOLINT namespace amd { @@ -173,6 +175,7 @@ typedef struct { std::vector variants; } dev_depends_t; + class Device { public: explicit Device(std::string path, RocmSMI_env_vars const *e); @@ -213,7 +216,7 @@ class Device { void set_evt_notif_anon_fd(uint32_t fd) { evt_notif_anon_fd_ = static_cast(fd);} int evt_notif_anon_fd(void) const {return evt_notif_anon_fd_;} - metrics_table_header_t & gpu_metrics_ver(void) {return gpu_metrics_ver_;} + metrics_table_header_t &gpu_metrics_ver(void) {return gpu_metrics_ver_;} void fillSupportedFuncs(void); void DumpSupportedFunctions(void); bool DeviceAPISupported(std::string name, uint64_t variant, @@ -221,6 +224,8 @@ class Device { rsmi_status_t restartAMDGpuDriver(void); rsmi_status_t storeDevicePartitions(uint32_t dv_ind); template std::string readBootPartitionState(uint32_t dv_ind); + rsmi_status_t check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type); + private: std::shared_ptr monitor_; @@ -241,6 +246,7 @@ class Device { int readDevInfoBinary(DevInfoTypes type, std::size_t b_size, void *p_binary_data); int writeDevInfoStr(DevInfoTypes type, std::string valStr); + rsmi_status_t 
run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); uint64_t bdfid_; uint64_t kfd_gpu_id_; std::unordered_set, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ +#define INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ + +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" + +#include +#include + + +namespace amd { +namespace smi { + +// +// Property reinforcement check list +// +using AMDGpuPropertyId_t = uint32_t; +using AMDGpuDevIdx_t = uint32_t; +using AMDGpuVerbId_t = uint32_t; +using AMDGpuAsicId_t = uint16_t; +using AMDGpuAsicRevId_t = uint16_t; +using AMDGpuOpModeType_t = uint8_t; + +enum class AMDGpuVerbTypes_t : AMDGpuVerbId_t +{ + kNone = 0, + kSetGpuPciBandwidth, + kSetPowerCap, + kSetGpuPowerProfile, + kSetGpuClkRange, + kSetGpuOdClkInfo, + kSetGpuOdVoltInfo, + kSetGpuPerfLevelV1, + kSetGpuPerfLevel, + kGetGpuPowerProfilePresets, + kResetGpu, + kSetGpuPerfDeterminismMode, + kSetGpuFanSpeed, + kResetGpuFan, + kSetClkFreq, + kSetGpuOverdriveLevelV1, + kSetGpuOverdriveLevel, + kGetGpuFanRpms, + kGetGpuFanSpeed, + kGetGpuFanSpeedMax, + kGetGpuVoltMetric, + kGetGpuOverDriveLevel, + kGetGpuOdVoltInfo, + kGetGpuOdVoltCurveRegions, +}; +using AMDGpuVerbList_t = std::map; + + +enum class AMDGpuPropertyTypesOffset_t : AMDGpuPropertyId_t +{ + kNone = 0, + 
kDevInfoTypes = (0x1000 << 0), + kMonitorTypes = (0x1000 << 1), + kPerfTypes = (0x1000 << 2), + kClkTypes = (0x1000 << 3), + kVoltMetricTypes = (0x1000 << 4), +}; + +using AMDGpuPropertyOffsetType = std::underlying_type::type; +using AMDGpuPropertyTypesOffsetList_t = std::map; +AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs); + + +enum class AMDGpuPropertyOpModeTypes_t : AMDGpuOpModeType_t +{ + kBareMetal = (0x1 << 0), + kSrIov = (0x1 << 1), + kBoth = (0x1 << 2), +}; + +using AMDGpuPropertyOpModeType = std::underlying_type::type; +using AMDGpuOpModeList_t = std::map; +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs); + + +struct AMDGpuProperties_t +{ + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; + AMDGpuPropertyOpModeTypes_t m_opmode; + bool m_should_be_available; +}; +using AMDGpuPropertyList_t = std::multimap; + +struct AMDGpuPropertyQuery_t +{ + AMDGpuAsicId_t m_asic_id; + AMDGpuAsicRevId_t m_pci_rev_id; + AMDGpuDevIdx_t m_dev_idx; + AMDGpuPropertyId_t m_property; + AMDGpuVerbTypes_t m_verb_id; +}; + + +// +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id); +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id); + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, + AMDGpuVerbTypes_t dev_info_type, + rsmi_status_t actual_error_code); + +void dump_amdgpu_property_reinforcement_list(); + + +} // namespace smi +} // namespace amd + +#endif // INCLUDE_ROCM_SMI_ROCM_SMI_PROPERTIES_H_ diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h
b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index a655b5b136..c574c97508 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -100,6 +100,7 @@ std::tuple getSystemDetails(void); void logSystemDetails(void); +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); void logHexDump(const char *desc, const void *addr, const size_t len, size_t perLine); bool isSystemBigEndian(); diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 61128c03eb..9d5ea6a367 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -2957,6 +2957,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t reserved, DEVICE_MUTEX rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr); + return ret; CATCH } @@ -2973,6 +2974,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t dummy, (void)dummy; DEVICE_MUTEX rsmi_status_t ret = set_power_profile(dv_ind, profile); + return ret; CATCH } diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 87077195ce..ddaf41a44a 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -59,6 +59,7 @@ #include #include #include +#include #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" @@ -1385,6 +1386,7 @@ std::string Device::readBootPartitionState( return boot_state; } + #undef RET_IF_NONZERO } // namespace smi } // namespace amd diff --git a/projects/amdsmi/src/rocm_smi_properties.cc b/projects/amdsmi/src/rocm_smi_properties.cc new file mode 100644 index 0000000000..0e606e6874 --- /dev/null +++ b/projects/amdsmi/src/rocm_smi_properties.cc @@ -0,0 +1,560 @@ +/* + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2017-2023, Advanced Micro Devices, Inc. 
+ * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. + * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of <Name of Development Group, Name of Institution>, + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE.
+ * + */ + +#include "rocm_smi/rocm_smi_properties.h" +#include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_device.h" +#include "rocm_smi/rocm_smi_logger.h" + +#include +#include +#include + + +// +// Property reinforcement check list +// +// NOTE: This is a *temporary solution* until we get a better approach, likely +// a driver API that can give us the capabilities of a GPU in question. +// +namespace amd { +namespace smi { + +const AMDGpuOpModeList_t amdgpu_opmode_check_list { + {AMDGpuPropertyOpModeTypes_t::kBareMetal, "Bare Metal"}, + {AMDGpuPropertyOpModeTypes_t::kSrIov, "SR-IOV"}, + {AMDGpuPropertyOpModeTypes_t::kBoth, "Both"}, +}; + +const AMDGpuPropertyTypesOffsetList_t amdgpu_typeoffset_check_list { + {AMDGpuPropertyTypesOffset_t::kNone, "None"}, + {AMDGpuPropertyTypesOffset_t::kDevInfoTypes, "Device Info Type"}, + {AMDGpuPropertyTypesOffset_t::kMonitorTypes, "Monitor Type"}, + {AMDGpuPropertyTypesOffset_t::kPerfTypes, "Performance Type"}, + {AMDGpuPropertyTypesOffset_t::kClkTypes, "Clock Type"}, + {AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, "Volt Metric Type"}, +}; + + +AMDGpuPropertyId_t make_unique_property_id(AMDGpuPropertyTypesOffset_t type_offset, AMDGpuPropertyId_t property_id) { + return (static_cast(type_offset) | (property_id)); +} + +AMDGpuPropertyId_t unmake_unique_property_id(AMDGpuPropertyId_t property_id) { + const auto property_type_offset_mask = + static_cast(AMDGpuPropertyTypesOffset_t::kDevInfoTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kMonitorTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kPerfTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kClkTypes) | + static_cast(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes); + + auto property_type_offset = (static_cast(property_type_offset_mask) & (property_id)); + auto property_type_id = (static_cast(property_id) & ~(property_type_offset_mask)); + + return property_type_id; +} + 
+AMDGpuPropertyTypesOffset_t operator| (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyTypesOffset_t operator& (AMDGpuPropertyTypesOffset_t lhs, AMDGpuPropertyTypesOffset_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyTypesOffset_t(static_cast(lhs) & static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator| (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) | static_cast(rhs)); +} + +AMDGpuPropertyOpModeTypes_t operator& (AMDGpuPropertyOpModeTypes_t lhs, AMDGpuPropertyOpModeTypes_t rhs) +{ + if (lhs == rhs) { + return lhs; + } + + return AMDGpuPropertyOpModeTypes_t(static_cast(lhs) & static_cast(rhs)); +} + + +// +// Note: Due to the fact that we have different enum elements with the same +// number, keying a hash by the number is not an option; ie: +// - DevInfoTypes::kDevVendorID = 7 +// - MonitorTypes::kMonPowerCapDefault = 7 +// So, we are keying it by a unique key, based on their info types +// +const AMDGpuVerbList_t amdgpu_verb_check_list { + { AMDGpuVerbTypes_t::kNone, "None" }, + { AMDGpuVerbTypes_t::kSetGpuPciBandwidth, "amdsmi_set_gpu_pci_bandwidth" }, + { AMDGpuVerbTypes_t::kSetPowerCap, "amdsmi_set_power_cap" }, + { AMDGpuVerbTypes_t::kSetGpuPowerProfile, "amdsmi_set_gpu_power_profile" }, + { AMDGpuVerbTypes_t::kSetGpuClkRange, "amdsmi_set_gpu_clk_range" }, + { AMDGpuVerbTypes_t::kSetGpuOdClkInfo, "amdsmi_set_gpu_od_clk_info" }, + { AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, "amdsmi_set_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, "amdsmi_set_gpu_perf_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuPerfLevel, "amdsmi_set_gpu_perf_level" }, + { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, "amdsmi_get_gpu_power_profile_presets" }, + { 
AMDGpuVerbTypes_t::kResetGpu, "amdsmi_reset_gpu" }, + { AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, "amdsmi_set_gpu_perf_determinism_mode" }, + { AMDGpuVerbTypes_t::kSetGpuFanSpeed, "amdsmi_set_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kResetGpuFan, "amdsmi_reset_gpu_fan" }, + { AMDGpuVerbTypes_t::kSetClkFreq, "amdsmi_set_clk_freq" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, "amdsmi_set_gpu_overdrive_level_v1" }, + { AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, "amdsmi_set_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuFanRpms, "amdsmi_get_gpu_fan_rpms" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeed, "amdsmi_get_gpu_fan_speed" }, + { AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, "amdsmi_get_gpu_fan_speed_max" }, + { AMDGpuVerbTypes_t::kGetGpuVoltMetric, "amdsmi_get_temp_metric" }, + { AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, "amdsmi_get_gpu_overdrive_level" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, "amdsmi_get_gpu_od_volt_info" }, + { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } +}; + +const uint16_t kDevRevIDAll(0xFFFF); +const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { + // + // {"Asic ID", {"Asic Rev. 
ID", "Unique Property ID", "Verb ID", "Property Op.Mode", "Availability Flag"}} + // DevInfoTypes::kDevPCIEClk = rsmi_dev_pci_bandwidth_get; rsmi_dev_pci_bandwidth_set + // MonitorTypes::kMonPowerCapDefault = rsmi_dev_power_cap_default_get; + // DevInfoTypes::kDevPowerProfileMode = + // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; + // + + // AMD Instinct MI210 + {0x740F, {0x02, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + + // AMD MIxxx + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPCIEClk), + AMDGpuVerbTypes_t::kSetGpuPciBandwidth, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonPowerCapDefault), + AMDGpuVerbTypes_t::kSetPowerCap, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kSetGpuPowerProfile, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuClkRange, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdClkInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + },
+ {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_AUTO), + AMDGpuVerbTypes_t::kSetGpuPerfLevelV1, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL), + AMDGpuVerbTypes_t::kSetGpuPerfLevel, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerProfileMode), + AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevGpuReset), + AMDGpuVerbTypes_t::kResetGpu, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, + rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), + AMDGpuVerbTypes_t::kSetGpuPerfDeterminismMode, + AMDGpuPropertyOpModeTypes_t::kSrIov, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kSetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kClkTypes, + rsmi_clk_type::RSMI_CLK_TYPE_FIRST), + AMDGpuVerbTypes_t::kSetClkFreq, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevel, + 
AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kSetGpuOverdriveLevelV1, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanRPMs), + AMDGpuVerbTypes_t::kGetGpuFanRpms, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeed, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonMaxFanSpeed), + AMDGpuVerbTypes_t::kGetGpuFanSpeedMax, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kVoltMetricTypes, + rsmi_voltage_metric_t::RSMI_VOLT_CURRENT), + AMDGpuVerbTypes_t::kGetGpuVoltMetric, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevOverDriveLevel), + AMDGpuVerbTypes_t::kGetGpuOverDriveLevel, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltInfo, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + }, + {0x74A1, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, + DevInfoTypes::kDevPowerODVoltage), + AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, + AMDGpuPropertyOpModeTypes_t::kBareMetal, false } + } +}; + + +rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbTypes_t verb_type, 
rsmi_status_t actual_error_code) +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + osstream << __PRETTY_FUNCTION__ << " actual error code: " << actual_error_code << "\n"; + LOG_TRACE(osstream); + + if (actual_error_code == rsmi_status_t::RSMI_STATUS_SUCCESS) { + return actual_error_code; + } + + // + // For property reinforcement query, the possible return values are: + // RSMI_STATUS_SUCCESS: + // - Property found in the reinforcement table, and it *should exist* + // RSMI_STATUS_NOT_SUPPORTED: + // - Property found in the reinforcement table, and it *should not* exist + // RSMI_STATUS_NO_DATA: + // - Could not find the correct dev_id and dev_revision info to build the filter + // RSMI_STATUS_UNKNOWN_ERROR: + // - The results are initialized with that. If that is returned, + // likely the reinforcement table does not contain any entries/rules for the + // dev_id in question. + // + auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + switch (query_result) { + case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): + case (rsmi_status_t::RSMI_STATUS_NO_DATA): + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + + case (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED): + case (rsmi_status_t::RSMI_STATUS_SUCCESS): + return query_result; + break; + + default: + return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + break; + } + }; + + /// + GET_DEV_FROM_INDX + osstream << __PRETTY_FUNCTION__ << "| ======= about to run property query =======" + << " [query filters: ]" + << " device: " << dv_ind + << " property/verb: " << static_cast(verb_type) << amdgpu_verb_check_list.at(verb_type); + auto reinforcement_query_result = dev->check_amdgpu_property_reinforcement_query(dv_ind, verb_type); + osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + + reinforcement_query_result = amdgpu_property_query_result_hdlr(reinforcement_query_result); 
+ osstream << __PRETTY_FUNCTION__ << "| ======= result from property query =======" + << " query result: " << reinforcement_query_result; + + return reinforcement_query_result; +} + +void dump_amdgpu_property_reinforcement_list() +{ + std::ostringstream osstream; + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + if (!amdgpu_property_reinforcement_list.empty()) { + for (const auto& property : amdgpu_property_reinforcement_list) { + osstream << __PRETTY_FUNCTION__ + << " Asic ID: " << property.first + << " Asic Rev.ID: " << property.second.m_pci_rev_id + << " Property ID: " << property.second.m_property + << " Verb ID : " << static_cast(property.second.m_verb_id) + << " Verb Desc: " << amdgpu_verb_check_list.at(property.second.m_verb_id) + << " OpMode: " << static_cast(property.second.m_opmode) + << " OpMode Desc: " << amdgpu_opmode_check_list.at(property.second.m_opmode) + << " Flag Avail.: " << property.second.m_should_be_available; + } + osstream << __PRETTY_FUNCTION__ << "| ======= end ======="; + return; + } + + osstream << __PRETTY_FUNCTION__ << "amdgpu_property_reinforcement_list is empty"; + LOG_TRACE(osstream); +} + + +rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx, AMDGpuVerbTypes_t verb_type) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + AMDGpuPropertyQuery_t amdgpu_property_query_init{}; + amdgpu_property_query_init.m_asic_id = 0; + amdgpu_property_query_init.m_pci_rev_id = 0; + amdgpu_property_query_init.m_dev_idx = dev_idx; + amdgpu_property_query_init.m_property = 0; + amdgpu_property_query_init.m_verb_id = verb_type; + return amdgpu_property_query_init; + }(); + + auto build_asic_id_filters = [&](const AMDGpuPropertyQuery_t& amdgpu_query_validate, bool& is_filter_good) { + auto tmp_amdgpu_query = amdgpu_query_validate; + auto id_filter_result(rsmi_status_t::RSMI_STATUS_SUCCESS); + if 
(amdgpu_query_validate.m_asic_id == 0) { + id_filter_result = rsmi_dev_id_get(dev_idx, &tmp_amdgpu_query.m_asic_id); + if (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) { + id_filter_result = rsmi_dev_revision_get(dev_idx, &tmp_amdgpu_query.m_pci_rev_id); + } + } + is_filter_good = (id_filter_result == rsmi_status_t::RSMI_STATUS_SUCCESS) ? true : false; + return tmp_amdgpu_query; + }; + + // If the original amdgpu_query is missing parts of the filter, such as; + // asic_id, revision_id, we try to retrieve them based on the dev_idx. + // the property we are searching for, *must be present* . + osstream << __PRETTY_FUNCTION__ << "| ======= start ======="; + LOG_TRACE(osstream); + + bool is_proper_query(false); + amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); + if (!is_proper_query) { + rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << ", Missing Query Filters were not successfully retrieved: " + << " [query filters: ]" + << " device: " << dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property: " << amdgpu_property_query.m_property + << " verb: " << static_cast(amdgpu_property_query.m_verb_id) + << " proper_query: " << is_proper_query + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; + } + + return run_amdgpu_property_reinforcement_query(amdgpu_property_query); +} + +rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query) +{ + std::ostringstream osstream; + auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); + + auto contains = [](const uint16_t asic_id) { + return (amdgpu_property_reinforcement_list.find(asic_id) != amdgpu_property_reinforcement_list.end()); + }; + + auto ends_with = [](const std::string& value, const std::string& ending) { + if (value.size() < ending.size()) { + 
return false; + } + return std::equal(ending.rbegin(), ending.rend(), value.rbegin()); + }; + + // Traverse through all values for a given key + osstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; + LOG_TRACE(osstream); + if (contains(amdgpu_property_query.m_asic_id)) { + osstream << __PRETTY_FUNCTION__ << " asic id found in table: " << amdgpu_property_query.m_asic_id << "\n"; + auto itr_begin = amdgpu_property_reinforcement_list.lower_bound(amdgpu_property_query.m_asic_id); + auto itr_end = amdgpu_property_reinforcement_list.upper_bound(amdgpu_property_query.m_asic_id); + while (itr_begin != itr_end) { + // Still same key, and... + if (itr_begin->first == amdgpu_property_query.m_asic_id) { + osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; + // Pci_rev_id matches the filter or ALL Revisions + if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; + // Do we have the property we are looking for? 
+ if (((amdgpu_property_query.m_property != 0) && + (itr_begin->second.m_property == amdgpu_property_query.m_property)) || + ((amdgpu_property_query.m_verb_id != AMDGpuVerbTypes_t::kNone) && + (itr_begin->second.m_verb_id == amdgpu_property_query.m_verb_id))) { + osstream << __PRETTY_FUNCTION__ + << " property found: " << itr_begin->second.m_property + << " verb found: " << static_cast(itr_begin->second.m_verb_id) + << " " << amdgpu_verb_check_list.at(amdgpu_property_query.m_verb_id) + << " should_be_available: " << itr_begin->second.m_should_be_available << "\n"; + // and if we do, should we consider it available, or forcefully + // considered it unavailable + osstream << __PRETTY_FUNCTION__ << "| ======= validating =======" + << ", Property found in the table for this device and flagged as *Not Available* : " + << " [query filters: ]" + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " reinf.tbl.rev. 
id: " << itr_begin->second.m_pci_rev_id; + // + // The property is set in the reinforcement table to 'it should not be available' + if (!itr_begin->second.m_should_be_available) { + // If the property is found and set to not available + // (rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED), + // it should be all good (rsmi_status_t::RSMI_STATUS_SUCCESS); + rsmi_status = rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + // + // The property is set in the reinforcement table to 'it should be available' + rsmi_status = rsmi_status_t::RSMI_STATUS_SUCCESS; + osstream << __PRETTY_FUNCTION__ + << " should_be_available: " << itr_begin->second.m_should_be_available + << " result: " << rsmi_status << "\n"; + LOG_TRACE(osstream); + return rsmi_status; + } + } + } + itr_begin++; + } + } + + osstream << __PRETTY_FUNCTION__ << "| ======= end =======" + << "Done searching for the Property in reinforcement table for this device: " + << " device: " << amdgpu_property_query.m_dev_idx + << " asic id: " << amdgpu_property_query.m_asic_id + << " revision id: " << amdgpu_property_query.m_pci_rev_id + << " property id: " << amdgpu_property_query.m_property + << " error: " << rsmi_status; + LOG_TRACE(osstream); + return rsmi_status; +} + + +} // namespace smi +} // namespace amd diff --git a/projects/amdsmi/src/rocm_smi_utils.cc b/projects/amdsmi/src/rocm_smi_utils.cc index 3c997ccf9d..7f1268a995 100755 --- a/projects/amdsmi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/src/rocm_smi_utils.cc @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -748,5 +749,27 @@ bool isSystemBigEndian() { return isBigEndian; } +rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str) +{ + auto result = rsmi_status_t::RSMI_STATUS_SUCCESS; + auto bus_id = static_cast((bdf_id & 0x0000FF00) 
>> 8); + auto dev_id = static_cast((bdf_id & 0x000000F8) >> 3); + auto func_id = static_cast(bdf_id & 0x00000007); // PCI function number is 3 bits wide (0-7) + + bfd_str = std::string(); + if (!(bus_id > 0)) { + result = rsmi_status_t::RSMI_STATUS_NO_DATA; + return result; + } + + std::stringstream bdf_sstream; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +bus_id << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(sizeof(uint8_t) * 2) << +dev_id << "."; + bdf_sstream << std::hex << std::setfill('0') << +func_id; + bfd_str = bdf_sstream.str(); + return result; +} + + } // namespace smi } // namespace amd From 8915ef543c36c3e5c43048a50652917b2d1d622f Mon Sep 17 00:00:00 2001 From: Charis Poag Date: Thu, 10 Aug 2023 18:25:02 -0500 Subject: [PATCH 13/15] [SWDEV-399953] Smart Temperature detection + partitioning display * Updates: - Fix for devices which do not have edge sensors, but junction - Added partitioning (memory and dynamic) displays for base rocm-smi CLI calls - Added subheading for base rocm-smi call output - Added better hwmon and device detection logging Change-Id: I8219884b2e532d6ed379527cacdc1f2b232a5451 Signed-off-by: Charis Poag [ROCm/amdsmi commit: 755e14dbad2832416ebe1dcb71e5bbe233d93ebf] --- .../include/rocm_smi/rocm_smi_monitor.h | 38 +++++++++ .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 2 + projects/amdsmi/python_smi_tools/rocm_smi.py | 85 ++++++++++++++++--- projects/amdsmi/src/rocm_smi.cc | 71 +++++++++++++++- projects/amdsmi/src/rocm_smi_main.cc | 41 ++++++++- projects/amdsmi/src/rocm_smi_monitor.cc | 11 +++ projects/amdsmi/src/rocm_smi_utils.cc | 39 +++++++-- 7 files changed, 258 insertions(+), 29 deletions(-) diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_monitor.h b/projects/amdsmi/include/rocm_smi/rocm_smi_monitor.h index 648e159b65..ea639eae35 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_monitor.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_monitor.h @@ -94,6 +94,44 @@ enum MonitorTypes { kMonInvalid =
0xFFFFFFFF, }; +const std::map monitorTypesToString { + {MonitorTypes::kMonName, "amd::smi::kMonName"}, + {MonitorTypes::kMonTemp, "amd::smi::kMonTemp"}, + {MonitorTypes::kMonFanSpeed, "amd::smi::kMonFanSpeed"}, + {MonitorTypes::kMonMaxFanSpeed, "amd::smi::kMonMaxFanSpeed"}, + {MonitorTypes::kMonFanRPMs, "amd::smi::kMonFanRPMs"}, + {MonitorTypes::kMonFanCntrlEnable, "amd::smi::kMonFanCntrlEnable"}, + {MonitorTypes::kMonPowerCap, "amd::smi::kMonPowerCap"}, + {MonitorTypes::kMonPowerCapDefault, "amd::smi::kMonPowerCapDefault"}, + {MonitorTypes::kMonPowerCapMax, "amd::smi::kMonPowerCapMax"}, + {MonitorTypes::kMonPowerCapMin, "amd::smi::kMonPowerCapMin"}, + {MonitorTypes::kMonPowerAve, "amd::smi::kMonPowerAve"}, + {MonitorTypes::kMonTempMax, "amd::smi::kMonTempMax"}, + {MonitorTypes::kMonTempMin, "amd::smi::kMonTempMin"}, + {MonitorTypes::kMonTempMaxHyst, "amd::smi::kMonTempMaxHyst"}, + {MonitorTypes::kMonTempMinHyst, "amd::smi::kMonTempMinHyst"}, + {MonitorTypes::kMonTempCritical, "amd::smi::kMonTempCritical"}, + {MonitorTypes::kMonTempCriticalHyst, "amd::smi::kMonTempCriticalHyst"}, + {MonitorTypes::kMonTempEmergency, "amd::smi::kMonTempEmergency"}, + {MonitorTypes::kMonTempEmergencyHyst, "amd::smi::kMonTempEmergencyHyst"}, + {MonitorTypes::kMonTempCritMin, "amd::smi::kMonTempCritMin"}, + {MonitorTypes::kMonTempCritMinHyst, "amd::smi::kMonTempCritMinHyst"}, + {MonitorTypes::kMonTempOffset, "amd::smi::kMonTempOffset"}, + {MonitorTypes::kMonTempLowest, "amd::smi::kMonTempLowest"}, + {MonitorTypes::kMonTempHighest, "amd::smi::kMonTempHighest"}, + {MonitorTypes::kMonTempLabel, "amd::smi::kMonTempLabel"}, + {MonitorTypes::kMonVolt, "amd::smi::kMonVolt"}, + {MonitorTypes::kMonVoltMax, "amd::smi::kMonVoltMax"}, + {MonitorTypes::kMonVoltMinCrit, "amd::smi::kMonVoltMinCrit"}, + {MonitorTypes::kMonVoltMin, "amd::smi::kMonVoltMin"}, + {MonitorTypes::kMonVoltMaxCrit, "amd::smi::kMonVoltMaxCrit"}, + {MonitorTypes::kMonVoltAverage, "amd::smi::kMonVoltAverage"}, + {MonitorTypes::kMonVoltLowest, "amd::smi::kMonVoltLowest"}, + {MonitorTypes::kMonVoltHighest, "amd::smi::kMonVoltHighest"}, + {MonitorTypes::kMonVoltLabel, "amd::smi::kMonVoltLabel"}, + {MonitorTypes::kMonInvalid,
"amd::smi::kMonInvalid"}, +}; + class Monitor { public: diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index c574c97508..5ba813a273 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -87,6 +87,8 @@ std::tuple readTmpFile( std::string stateName, std::string parameterName); void displayAppTmpFilesContent(void); +std::string debugVectorContent(std::vector v); +std::string displayAllDevicePaths(std::vector> v); rsmi_status_t handleException(); rsmi_status_t GetDevValueVec(amd::smi::DevInfoTypes type, diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 27302cc50a..003efadc66 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -47,7 +47,7 @@ headerString = ' ROCm System Management Interface ' footerString = ' End of ROCm SMI Log ' # Output formatting -appWidth = 84 +appWidth = 100 deviceList = [] # Enable or disable serialized format @@ -393,6 +393,25 @@ def getTemp(device, sensor): return temp.value / 1000 return 'N/A' +def findFirstAvailableTemp(device): + """ Discovers the first available device temperature to display + + Returns a tuple of (temp_type, temp_value) for the device specified + @param device: DRM device identifier + """ + temp = c_int64(0) + metric = rsmi_temperature_metric_t.RSMI_TEMP_CURRENT + ret_temp = "N/A" + ret_temp_type = "(Unknown)" + for i, templist_val in enumerate(temp_type_lst): + ret = rocmsmi.rsmi_dev_temp_metric_get(c_uint32(device), i, metric, byref(temp)) + if rsmi_ret_ok(ret, device, 'get_temp_metric_' + templist_val, silent=True): + ret_temp = temp.value / 1000 + ret_temp_type = '(' + templist_val.capitalize() + ')' + break + else: + continue + return (ret_temp_type, ret_temp) def getVbiosVersion(device): """ Returns the VBIOS version for a given device @@ -429,7 +448,7 @@ def
getComputePartition(device): ret = rocmsmi.rsmi_dev_compute_partition_get(device, currentComputePartition, 256) if rsmi_ret_ok(ret, device, 'get_compute_partition', silent=True) and currentComputePartition.value.decode(): return str(currentComputePartition.value.decode()) - return "UNKNOWN" + return "N/A" def getMemoryPartition(device): @@ -441,7 +460,7 @@ def getMemoryPartition(device): ret = rocmsmi.rsmi_dev_nps_mode_get(device, currentNPSMode, 256) if rsmi_ret_ok(ret, device, 'get_NPS_mode', silent=True) and currentNPSMode.value.decode(): return str(currentNPSMode.value.decode()) - return "UNKNOWN" + return "N/A" def print2DArray(dataArray): @@ -544,13 +563,20 @@ def printEventList(device, delay, eventList): print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1], data.message.decode('utf8') + '\r']]) -def printLog(device, metricName, value=None, extraSpace=False): +def printLog(device, metricName, value=None, extraSpace=False, useItalics=False): """ Print out to the SMI log @param device: DRM device identifier @param metricName: Title of the item to print to the log @param value: The item's value to print to the log """ + red = '\033[91m' + green = '\033[92m' + blue = '\033[94m' + bold = '\033[1m' + italics = '\033[3m' + underline = '\033[4m' + end = '\033[0m' global PRINT_JSON if PRINT_JSON: if value is not None and device is not None: @@ -567,6 +593,8 @@ def printLog(device, metricName, value=None, extraSpace=False): # Force thread safe printing lock = multiprocessing.Lock() lock.acquire() + if useItalics: + logstr = italics + logstr + end if extraSpace: print('\n' + logstr + '\n', end='', flush=True) else: @@ -1544,18 +1572,39 @@ def showAllConcise(deviceList): print('ERROR: Cannot print JSON/CSV output for concise output') sys.exit(1) printLogSpacer(' Concise Info ') - header = ['GPU', 'Temp (DieEdge)', 'AvgPwr', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + deviceList.sort() + 
(temp_type, _) = findFirstAvailableTemp(deviceList[0]) + available_temp_type = temp_type.lower() + available_temp_type = available_temp_type.replace('(', '') + available_temp_type = available_temp_type.replace(')', '') + header = ['GPU', 'Temp', 'AvgPwr', 'Partitions', 'SCLK', 'MCLK', 'Fan', 'Perf', 'PwrCap', 'VRAM%', 'GPU%'] + subheader = ['', temp_type, '', '(Mem, Compute)', '', '', '', '', '', '', ''] + # add additional spaces to match header + for idx, item in enumerate(subheader): + header_size = len(header[idx]) + subheader_size = len(subheader[idx]) + if header_size != subheader_size: + numSpacesToFill_subheader = header_size - subheader_size + numSpacesToFill_header = subheader_size - header_size + #take pos spaces to mean, we need to match size of the other + if numSpacesToFill_subheader > 0: + subheader[idx] = subheader[idx] + (' ' * numSpacesToFill_subheader) + if numSpacesToFill_header > 0: + header[idx] = header[idx] + (' ' * numSpacesToFill_header) head_widths = [len(head) + 2 for head in header] values = {} + degree_sign = u'\N{DEGREE SIGN}' for device in deviceList: - temp = str(getTemp(device, 'edge')) - if temp != 'N/A': - temp += 'c' + temp_val = str(getTemp(device, available_temp_type)) + if temp_val != 'N/A': + temp_val += degree_sign + 'C' avgPwr = str(getPower(device)) if avgPwr != '0.0' and avgPwr != 'N/A': avgPwr += 'W' else: avgPwr = 'N/A' + combined_partition = (getMemoryPartition(device) + ", " + + getComputePartition(device)) concise = True sclk = showCurrentClocks([device], 'sclk', concise) mclk = showCurrentClocks([device], 'mclk', concise) @@ -1579,7 +1628,9 @@ def showAllConcise(deviceList): mem_use_pct='Unsupported' if vram_used != None and vram_total != None and float(vram_total) != 0: mem_use_pct = '% 3.0f%%' % (100 * (float(vram_used) / float(vram_total))) - values['card%s' % (str(device))] = [device, temp, avgPwr, sclk, mclk, fan, str(perf).lower(), pwrCap, + values['card%s' % (str(device))] = [device, temp_val, avgPwr, + 
combined_partition, sclk, mclk, + fan, str(perf).lower(), pwrCap, mem_use_pct, gpu_busy] val_widths = {} for device in deviceList: @@ -1589,6 +1640,9 @@ def showAllConcise(deviceList): for col in range(len(val_widths[device])): max_widths[col] = max(max_widths[col], val_widths[device][col]) printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), header)), None) + printLog(None, "".join(word.ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), subheader)), + None, useItalics=True) + printLogSpacer(fill='=') for device in deviceList: printLog(None, "".join(str(word).ljust(max_widths[col]) for col, word in zip(range(len(max_widths)), values['card%s' % (str(device))])), None) @@ -2548,7 +2602,7 @@ def showEvents(deviceList, eventTypes): break -def printTempGraph(deviceList, delay): +def printTempGraph(deviceList, delay, temp_type): # deviceList must be in ascending order deviceList.sort() devices = 0 @@ -2562,7 +2616,7 @@ def printTempGraph(deviceList, delay): terminalWidth = os.get_terminal_size()[0] printStrings = list() for device in deviceList: - temp = getTemp(device, 'edge') + temp = getTemp(device, temp_type) if temp == 'N/A': percentage = 0 else: @@ -2635,11 +2689,16 @@ def getGraphColor(percentage): def showTempGraph(deviceList): - printLogSpacer(' Temperature Graph ') + deviceList.sort() + (temp_type, temp_value) = findFirstAvailableTemp(deviceList[0]) + printLogSpacer(' Temperature Graph ' + temp_type + ' ') + temp_type = temp_type.lower() + temp_type = temp_type.replace('(', '') + temp_type = temp_type.replace(')', '') # Start a thread for constantly printing try: # Create a thread (call print function, devices, delay in ms) - _thread.start_new_thread(printTempGraph, (deviceList, 150)) + _thread.start_new_thread(printTempGraph, (deviceList, 150, temp_type)) except Exception as e: printErrLog(device, 'Unable to start new thread. 
%s' % (e)) # Catch user input for program termination diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 9d5ea6a367..5ae4895c9d 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -78,6 +78,7 @@ #include "rocm_smi/rocm_smi_logger.h" using namespace ROCmLogging; +using namespace amd::smi; static const uint32_t kMaxOverdriveLevel = 20; static const float kEnergyCounterResolution = 15.3f; @@ -2475,7 +2476,16 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, } if (temperature == nullptr) { - return RSMI_STATUS_INVALID_ARGS; + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: temperature was a null ptr reference" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; + LOG_ERROR(ss); + return RSMI_STATUS_INVALID_ARGS; } // The HBM temperature is retreived from the gpu_metrics @@ -2484,12 +2494,32 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, || sensor_type == RSMI_TEMP_TYPE_HBM_2 || sensor_type == RSMI_TEMP_TYPE_HBM_3) { if (metric != RSMI_TEMP_CURRENT) { // only support RSMI_TEMP_CURRENT + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: To retreive HBM temp, we only support metric = " + << "RSMI_TEMP_CURRENT" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } rsmi_gpu_metrics_t gpu_metrics; ret = rsmi_dev_gpu_metrics_info_get(dv_ind, &gpu_metrics); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: rsmi_dev_gpu_metrics_info_get returned " + << 
getRSMIStatusString(ret) + << " | Returning = " + << getRSMIStatusString(ret) << " |"; + LOG_ERROR(ss); return ret; } @@ -2509,11 +2539,28 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, default: return RSMI_STATUS_INVALID_ARGS; } - if (val_ui16 == UINT16_MAX) + if (val_ui16 == UINT16_MAX) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: Reached UINT16 max value, overflow" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; - else + } else *temperature = val_ui16 * CENTRIGRADE_TO_MILLI_CENTIGRADE; + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " | "; + LOG_INFO(ss); return RSMI_STATUS_SUCCESS; } // end HBM temperature @@ -2522,6 +2569,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, GET_DEV_FROM_INDX if (dev->monitor() == nullptr) { + ss << __PRETTY_FUNCTION__ + << " | ======= end ======= " + << " | Fail " + << " | Device #: " << dv_ind + << " | Type: " << monitorTypesToString.at(mon_type) + << " | Cause: monitor returned nullptr" + << " | Returning = " + << getRSMIStatusString(RSMI_STATUS_NOT_SUPPORTED) << " |"; + LOG_ERROR(ss); return RSMI_STATUS_NOT_SUPPORTED; } std::shared_ptr m = dev->monitor(); @@ -2535,6 +2591,15 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index) ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature); + ss << __PRETTY_FUNCTION__ << " | ======= end ======= " + << " | Success " + << " | Device #: " << dv_ind + << " | Sensor_index: " << sensor_index + << " | Type: " << 
monitorTypesToString.at(mon_type) + << " | Data: " << *temperature + << " | Returning = " + << getRSMIStatusString(ret) << " | "; + LOG_INFO(ss); return ret; CATCH diff --git a/projects/amdsmi/src/rocm_smi_main.cc b/projects/amdsmi/src/rocm_smi_main.cc index 0ba6d7c50e..8cb95fe7f2 100755 --- a/projects/amdsmi/src/rocm_smi_main.cc +++ b/projects/amdsmi/src/rocm_smi_main.cc @@ -170,6 +170,7 @@ static uint32_t GetDeviceIndex(const std::string s) { // computed for cardX. // On success, return drm_minor which is >= 128 otherwise return 0 static uint32_t GetDrmRenderMinor(const std::string s) { + std::ostringstream ss; std::string drm_path = s; int drm_minor = 0; const std::string render_file_prefix = "renderD"; @@ -195,6 +196,10 @@ static uint32_t GetDrmRenderMinor(const std::string s) { if (closedir(drm_dir)) { return 0; } + + ss << __PRETTY_FUNCTION__ << " | Discovered drmRenderMinor = " + << std::to_string(drm_minor) << " | For drm_path = " << drm_path << " | "; + LOG_DEBUG(ss); return static_cast(drm_minor); } @@ -377,11 +382,15 @@ RocmSMI::Initialize(uint64_t flags) { // Remove any drm nodes that don't have a corresponding readable kfd node. // kfd nodes will not be added if their properties file is not readable. 
+ std::ostringstream ss; auto dev_iter = devices_.begin(); while (dev_iter != devices_.end()) { uint64_t bdfid = (*dev_iter)->bdfid(); if (tmp_map.find(bdfid) == tmp_map.end()) { + ss << __PRETTY_FUNCTION__ << " | removing device = " + << (*dev_iter)->path(); dev_iter = devices_.erase(dev_iter); + LOG_DEBUG(ss); continue; } dev_iter++; @@ -411,6 +420,9 @@ RocmSMI::Initialize(uint64_t flags) { } // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); + std::string amdGPUDeviceList = displayAllDevicePaths(devices_); + ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList; + LOG_DEBUG(ss); } void @@ -646,6 +658,9 @@ RocmSMI::FindMonitor(std::string monitor_path) { } void RocmSMI::AddToDeviceList(std::string dev_name) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); auto dev_path = std::string(kPathDRMRoot); dev_path += "/"; dev_path += dev_name; @@ -662,6 +677,10 @@ RocmSMI::AddToDeviceList(std::string dev_name) { GetSupportedEventGroups(card_indx, dev->supported_event_groups()); devices_.push_back(dev); + ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = " + << dev_name << " | path = " << dev_path + << " | card index = " << std::to_string(card_indx) << " | "; + LOG_DEBUG(ss); return; } @@ -669,16 +688,26 @@ RocmSMI::AddToDeviceList(std::string dev_name) { static const uint32_t kAmdGpuId = 0x1002; static bool isAMDGPU(std::string dev_path) { + bool isAmdGpu = false; + std::ostringstream ss; std::string vend_path = dev_path + "/device/vendor"; if (!FileExists(vend_path.c_str())) { - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } std::ifstream fs; fs.open(vend_path); if (!fs.is_open()) { - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? 
"is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } uint32_t vendor_id; @@ -688,9 +717,13 @@ static bool isAMDGPU(std::string dev_path) { fs.close(); if (vendor_id == kAmdGpuId) { - return true; + isAmdGpu = true; } - return false; + ss << __PRETTY_FUNCTION__ << " | device_path = " << dev_path + << " is " << (isAmdGpu ? "is an amdgpu device - TRUE": + "is an amdgpu device - FALSE"); + LOG_DEBUG(ss); + return isAmdGpu; } uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { diff --git a/projects/amdsmi/src/rocm_smi_monitor.cc b/projects/amdsmi/src/rocm_smi_monitor.cc index 7d49ef0711..00035e6307 100755 --- a/projects/amdsmi/src/rocm_smi_monitor.cc +++ b/projects/amdsmi/src/rocm_smi_monitor.cc @@ -313,6 +313,7 @@ int Monitor::writeMonitor(MonitorTypes type, uint32_t sensor_id, // This string version should work for all valid monitor types int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, std::string *val) { + std::ostringstream ss; assert(val != nullptr); std::string temp_str; @@ -320,11 +321,21 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, DBG_FILE_ERROR(sysfs_path, (std::string *)nullptr) int ret = ReadSysfsStr(sysfs_path, val); + ss << __PRETTY_FUNCTION__ + << " | Success | Read hwmon file: " << sysfs_path + << " | Type: " << monitorTypesToString.at(type) + << " | Sensor id: " << std::to_string(sensor_id) + << " | Data: " << *val + << " | Returning: " << std::to_string(ret) << " |"; + LOG_INFO(ss); return ret; } int32_t Monitor::setTempSensorLabelMap(void) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); std::string type_str; int ret; diff --git a/projects/amdsmi/src/rocm_smi_utils.cc b/projects/amdsmi/src/rocm_smi_utils.cc index 7f1268a995..670d90faec 100755 --- a/projects/amdsmi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/src/rocm_smi_utils.cc @@ -204,9 +204,10 @@ int ReadSysfsStr(std::string path, std::string *retStr) { if 
(!fs.is_open()) { ret = errno; errno = 0; - oss << "Could not read SYSFS file (" << path << ")" - << ", returning " << std::to_string(ret) << " (" - << std::strerror(ret) << ")"; + oss << __PRETTY_FUNCTION__ + << " | Fail | Cause: file does not exist or permissions issue" + << " | SYSFS file: " << path + << " | Returning: " << std::strerror(ret) << " |"; LOG_ERROR(oss); return ret; } @@ -516,19 +517,39 @@ void displayAppTmpFilesContent() { } // Used to debug vector string list and their content -void displayVectorContent(std::vector v) { - std::cout << "Vector = {"; +std::string debugVectorContent(std::vector v) { + std::ostringstream ss; + ss << "Vector = {"; if (v.size() > 0) { for (auto it=v.begin(); it < v.end(); it++) { - std::cout << *it; + ss << *it; auto temp_it = it; if(++temp_it != v.end()) { - std::cout << ", "; + ss << ", "; } } - } else { - std::cout << "}" << std::endl; } + ss << "}" << std::endl; + + return ss.str(); +} + +// Used to debug vector string list and their content +std::string displayAllDevicePaths(std::vector> v) { + std::ostringstream ss; + ss << "Vector = {"; + if (v.size() > 0) { + for (auto it=v.begin(); it < v.end(); it++) { + ss << (*it)->path(); + auto temp_it = it; + if(++temp_it != v.end()) { + ss << ", "; + } + } + } + ss << "}" << std::endl; + + return ss.str(); } // Attempts to read application specific temporary file From 05b5ef35b356379245fa6d25e443268200ba5526 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 17 Aug 2023 14:36:03 -0500 Subject: [PATCH 14/15] Fallback to kfd node when VRAM sysfs not available The driver may not expose VRAM sysfs in certain system. Add a fallback to it. 
Change-Id: Ib3be71b4f4d2c79318d5026b0a97f3657d8a97b6 [ROCm/amdsmi commit: a10f00bf5742f3d710242726310e09f5235adf1b] --- projects/amdsmi/include/rocm_smi/kfd_ioctl.h | 10 ++ .../amdsmi/include/rocm_smi/rocm_smi_kfd.h | 4 + projects/amdsmi/src/rocm_smi.cc | 19 ++++ projects/amdsmi/src/rocm_smi_kfd.cc | 92 +++++++++++++++++++ 4 files changed, 125 insertions(+) diff --git a/projects/amdsmi/include/rocm_smi/kfd_ioctl.h b/projects/amdsmi/include/rocm_smi/kfd_ioctl.h index 5817833eae..3b781ce129 100755 --- a/projects/amdsmi/include/rocm_smi/kfd_ioctl.h +++ b/projects/amdsmi/include/rocm_smi/kfd_ioctl.h @@ -36,6 +36,12 @@ struct kfd_ioctl_get_version_args { __u32 minor_version; /* from KFD */ }; +struct kfd_ioctl_get_available_memory_args { + __u64 available; /* from KFD */ + __u32 gpu_id; /* to KFD */ + __u32 pad; +}; + /* For kfd_ioctl_create_queue_args.queue_type. */ #define KFD_IOC_QUEUE_TYPE_COMPUTE 0x0 #define KFD_IOC_QUEUE_TYPE_SDMA 0x1 @@ -726,6 +732,10 @@ struct kfd_ioctl_cross_memory_copy_args { #define AMDKFD_IOC_CROSS_MEMORY_COPY \ AMDKFD_IOWR(0x22, struct kfd_ioctl_cross_memory_copy_args) + +#define AMDKFD_IOC_AVAILABLE_MEMORY \ + AMDKFD_IOWR(0x23, struct kfd_ioctl_get_available_memory_args) + #define AMDKFD_COMMAND_START 0x01 #undef AMDKFD_COMMAND_END #define AMDKFD_COMMAND_END 0x22 diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_kfd.h b/projects/amdsmi/include/rocm_smi/rocm_smi_kfd.h index a0c8f5fe2d..9cf8fd8e40 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_kfd.h @@ -80,6 +80,10 @@ class KFDNode { uint32_t amdgpu_dev_index(void) const {return amdgpu_dev_index_;} void set_amdgpu_dev_index(uint32_t val) {amdgpu_dev_index_ = val;} + // Get memory from kfd + int get_total_memory(uint64_t* total); + int get_used_memory(uint64_t* used); + private: uint32_t node_indx_; uint32_t amdgpu_dev_index_; diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 
5ae4895c9d..38c0023466 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -3077,6 +3077,14 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, total); + // Fallback to KFD reported memory if VRAM total is 0 + if (mem_type == RSMI_MEM_TYPE_VRAM && *total == 0) { + GET_DEV_AND_KFDNODE_FROM_INDX + if (kfd_node->get_total_memory(total) == 0 && *total > 0) { + return RSMI_STATUS_SUCCESS; + } + } + return ret; CATCH } @@ -3113,6 +3121,17 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, used); + // Fallback to KFD reported memory if no VRAM + if (mem_type == RSMI_MEM_TYPE_VRAM && *used == 0) { + GET_DEV_AND_KFDNODE_FROM_INDX + uint64_t total = 0; + ret = get_dev_value_int(amd::smi::kDevMemTotVRAM, dv_ind, &total); + if (total != 0) return ret; // do not need to fallback + if ( kfd_node->get_used_memory(used) == 0 ) { + return RSMI_STATUS_SUCCESS; + } + } + return ret; CATCH } diff --git a/projects/amdsmi/src/rocm_smi_kfd.cc b/projects/amdsmi/src/rocm_smi_kfd.cc index 13aed64588..092bcb3414 100755 --- a/projects/amdsmi/src/rocm_smi_kfd.cc +++ b/projects/amdsmi/src/rocm_smi_kfd.cc @@ -43,6 +43,9 @@ #include #include +#include +#include +#include #include #include @@ -770,6 +773,95 @@ KFDNode::get_io_link_bandwidth(uint32_t node_to, uint64_t *max_bandwidth, return 0; } +// /sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties +// size_in_bytes 68702699520 +int KFDNode::get_total_memory(uint64_t* total) { + if (total == nullptr) return EINVAL; + *total = 0; + + std::string f_path = kKFDNodesPathRoot; + f_path += "/"; + f_path += std::to_string(node_indx_); + f_path += "/mem_banks"; + + auto kfd_node_dir = opendir(f_path.c_str()); + if (kfd_node_dir == nullptr) { + return errno; + } + auto dentry = readdir(kfd_node_dir); + while (dentry != nullptr) { + if 
(dentry->d_name[0] == '.') { + dentry = readdir(kfd_node_dir); + continue; + } + + if (!is_number(dentry->d_name)) { + dentry = readdir(kfd_node_dir); + continue; + } + + // read "size_in_bytes 68702699520" line + const std::string size_in_bytes_property = "size_in_bytes "; + std::string memory_bank_file = f_path + "/" + + dentry->d_name + "/properties"; + std::ifstream fs(memory_bank_file); + if (!fs) { + dentry = readdir(kfd_node_dir); + continue; + } + std::string line; + while (std::getline(fs, line)) { + if (line.substr(0, size_in_bytes_property.length()) + == size_in_bytes_property) { + auto bytes = line.substr(size_in_bytes_property.length()); + try { + *total += std::stoull(bytes); + break; + } catch(...) { + break; // unparsable size: skip this bank + } + } + } // end loop for lines in property file + dentry = readdir(kfd_node_dir); // always advance; otherwise the same bank is re-read forever + } // end loop for mem_bank directory + + if (closedir(kfd_node_dir)) { + std::string err_str = "Failed to close KFD node directory "; + err_str += f_path; + err_str += "."; + perror(err_str.c_str()); + return 1; + } + return 0; +} + +// ioctl on kfd node device +int KFDNode::get_used_memory(uint64_t* used) { + if (used == nullptr) return EINVAL; + static const char *kPathKFDIoctl = "/dev/kfd"; + + int kfd_fd = open(kPathKFDIoctl, O_RDWR | O_CLOEXEC); + if (kfd_fd < 0) { + return 1; + } + struct kfd_ioctl_get_available_memory_args mem = {0, 0, 0}; + mem.gpu_id = gpu_id_; + if (ioctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY , &mem) != 0) { + close(kfd_fd); + return 1; + } + close(kfd_fd); + + // used = total - available + uint64_t total = 0; + int ret = get_total_memory(&total); + if (ret == 0 && total > 0 && mem.available < total) { + *used = total - mem.available; + return 0; + } + + return 1; +} } // namespace smi } // namespace amd From dbc8684d08c23ca73b111bc6b9b9dcafc10fa1b3 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Mon, 21 Aug 2023 11:57:41 -0500 Subject: [PATCH 15/15] TESTS - Use gpu version as a workaround for a missing name Depends-On:
Ifbd38f11fbde7ba28af4be1d611310dea1b5112a Change-Id: Ia7b7975f03424854df0a470b2719cf2ff2cf8e40 Signed-off-by: Galantsev, Dmitrii [ROCm/amdsmi commit: 62f01cb15034d965bd6f3f49e30a88a19d87ef7a] --- projects/amdsmi/tests/rocm_smi_test/rsmitst.exclude | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/projects/amdsmi/tests/rocm_smi_test/rsmitst.exclude b/projects/amdsmi/tests/rocm_smi_test/rsmitst.exclude index f0ea842f7f..7d229b68f4 100644 --- a/projects/amdsmi/tests/rocm_smi_test/rsmitst.exclude +++ b/projects/amdsmi/tests/rocm_smi_test/rsmitst.exclude @@ -57,7 +57,13 @@ $BLACKLIST_ALL_ASICS\ "rsmitstReadWrite.TestPerfLevelReadWrite" # SWDEV-391407 -FILTER[aqua_vanjaram]=\ +FILTER[90400]=\ +$BLACKLIST_ALL_ASICS\ +"rsmitstReadOnly.TestVoltCurvRead:"\ +"rsmitstReadOnly.TestFrequenciesRead:"\ +"rsmitstReadWrite.TestFrequenciesReadWrite:"\ +"rsmitstReadWrite.TestPowerReadWrite" +FILTER[90401]=\ $BLACKLIST_ALL_ASICS\ "rsmitstReadOnly.TestVoltCurvRead:"\ "rsmitstReadOnly.TestFrequenciesRead:"\