Reviewer notes (everything before the first "diff --git" header is preamble, not patch content):

  * Line structure was restored from the hunk headers: every hunk's old/new
    line counts match its body. Leading indentation was not recoverable from
    the collapsed source and is reconstructed by convention; confirm it
    against the tree or apply with a whitespace-tolerant option.
  * frequencies_read.cc: the RSMI_STATUS_NOT_SUPPORTED branch now ends with
    "return;". Once the "else" is removed, an unsupported clock would
    otherwise fall through to CHK_ERR_ASRT(err) and to
    ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS) while err still holds
    RSMI_STATUS_NOT_SUPPORTED. The new-side count of that hunk is therefore
    23 instead of 22. The matching lambda in frequencies_read_write.cc
    already leaves its branch with "return false;".
  * rocm_smi.py: the new "exists but EMPTY" printLog call passes None as the
    value argument, like the other printLog calls in the same hunk.
  * NOTE(review): the removed line "std::bitset(freq_bitmask).to_string();"
    is reproduced as received; it may have lost a template argument list
    upstream of this review. Verify against the tree.

diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
index 2d96cf3c73..8834e518b5 100755
--- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
@@ -1791,15 +1791,19 @@ def showClocks(deviceList):
         for clk_type in sorted(rsmi_clk_names_dict):
             if rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], None) == 1:
                 ret = rocmsmi.rsmi_dev_gpu_clk_freq_get(device, rsmi_clk_names_dict[clk_type], byref(freq))
-                if rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
-                    printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
-                    for x in range(freq.num_supported):
-                        fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
-                        if x == freq.current:
-                            printLog(device, str(x), str(fr) + ' *')
-                        else:
-                            printLog(device, str(x), str(fr))
-                    printLog(device, '', None)
+                if ret == rsmi_status_t.RSMI_STATUS_UNEXPECTED_DATA:
+                    printLog(device, 'Clock [%s] on device [%s] exists but EMPTY! Likely driver error!' % (clk_type, str(device)), None)
+                    continue
+                if not rsmi_ret_ok(ret, device, 'get_clk_freq_' + clk_type, True):
+                    continue
+                printLog(device, 'Supported %s frequencies on GPU%s' % (clk_type, str(device)), None)
+                for x in range(freq.num_supported):
+                    fr = '{:>.0f}Mhz'.format(freq.frequency[x] / 1000000)
+                    if x == freq.current:
+                        printLog(device, str(x), str(fr) + ' *')
+                    else:
+                        printLog(device, str(x), str(fr))
+                printLog(device, '', None)
             else:
                 logging.debug('{} frequency is unsupported on device[{}]'.format(clk_type, device))
                 printLog(device, '', None)
diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc
index 892f6664c7..ed96a7ee65 100755
--- a/projects/rocm-smi-lib/src/rocm_smi.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi.cc
@@ -3740,6 +3740,10 @@ rsmi_dev_memory_reserved_pages_get(uint32_t dv_ind, uint32_t *num_pages,
 
   ret = GetDevValueVec(amd::smi::kDevMemPageBad, dv_ind, &val_vec);
 
+  // file is empty, which is valid for no errors
+  if (ret == RSMI_STATUS_UNEXPECTED_DATA) {
+    ret = RSMI_STATUS_SUCCESS;
+  }
   if (ret == RSMI_STATUS_FILE_ERROR) {
     return RSMI_STATUS_NOT_SUPPORTED;
   }
diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc
index b01734f0b1..03a38dc749 100755
--- a/projects/rocm-smi-lib/src/rocm_smi_device.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc
@@ -858,8 +858,8 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
     ss << "Read devInfoMultiLineStr for DevInfoType ("
        << RocmSMI::devInfoTypesStrings.at(type) << ")"
        << ", but contained no string lines";
-    LOG_INFO(ss);
-    return 0;
+    LOG_ERROR(ss);
+    return ENXIO;
   }
   // Remove any *trailing* empty (whitespace) lines
   while (!retVec->empty() &&
@@ -882,6 +882,7 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
        << RocmSMI::devInfoTypesStrings.at(type) << ")"
        << ", but lines were empty";
     LOG_INFO(ss);
+    return ENXIO;
   }
   return 0;
 }
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read.cc
index 2b5466ae93..37bb9ec0b2 100755
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read.cc
@@ -123,16 +123,23 @@ void TestFrequenciesRead::Run(void) {
         // Verify api support checking functionality is working
         err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr);
         ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
-      } else {
-        CHK_ERR_ASRT(err)
-        IF_VERB(STANDARD) {
-          std::cout << "\t**Supported " << name << " clock frequencies: ";
-          std::cout << f.num_supported << std::endl;
-          print_frequencies(&f);
-          // Verify api support checking functionality is working
-          err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr);
-          ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
-        }
+        return;
+      }
+
+      // special driver issue, shouldn't normally occur
+      if (err == RSMI_STATUS_UNEXPECTED_DATA) {
+        std::cerr << "WARN: Clock file [" << FreqEnumToStr(t) << "] exists on device [" << i << "] but empty!" << std::endl;
+        std::cerr << " Likely a driver issue!" << std::endl;
+      }
+
+      CHK_ERR_ASRT(err)
+      IF_VERB(STANDARD) {
+        std::cout << "\t**Supported " << name << " clock frequencies: ";
+        std::cout << f.num_supported << std::endl;
+        print_frequencies(&f);
+        // Verify api support checking functionality is working
+        err = rsmi_dev_gpu_clk_freq_get(i, t, nullptr);
+        ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
       }
     };
 
diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
index 9fce5c429e..5ad627cb5f 100755
--- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
+++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/frequencies_read_write.cc
@@ -114,14 +114,20 @@ void TestFrequenciesReadWrite::Run(void) {
         std::cout << "\t**Set " << FreqEnumToStr(rsmi_clk) <<
                                ": Not supported on this machine" << std::endl;
         return false;
-      } else {
-        // CHK_ERR_ASRT(ret)
-        IF_VERB(STANDARD) {
-          std::cout << "Initial frequency for clock " <<
-              FreqEnumToStr(rsmi_clk) << " is " << f.current << std::endl;
-        }
-        return true;
       }
+
+      // special driver issue, shouldn't normally occur
+      if (ret == RSMI_STATUS_UNEXPECTED_DATA) {
+        std::cerr << "WARN: Clock file [" << FreqEnumToStr(rsmi_clk) << "] exists on device [" << dv_ind << "] but empty!" << std::endl;
+        std::cerr << " Likely a driver issue!" << std::endl;
+      }
+
+      // CHK_ERR_ASRT(ret)
+      IF_VERB(STANDARD) {
+        std::cout << "Initial frequency for clock " <<
+            FreqEnumToStr(rsmi_clk) << " is " << f.current << std::endl;
+      }
+      return true;
     };
 
     auto freq_write = [&]() {
@@ -177,44 +183,6 @@ void TestFrequenciesReadWrite::Run(void) {
       }
       freq_write();
       CHK_ERR_ASRT(ret)
-#if 0
-      ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f);
-      CHK_ERR_ASRT(ret)
-
-      IF_VERB(STANDARD) {
-        std::cout << "Initial frequency for clock " << rsmi_clk << " is " <<
-                                                    f.current << std::endl;
-      }
-      // Set clocks to something other than the usual default of the lowest
-      // frequency.
-      freq_bitmask = 0b01100; // Try the 3rd and 4th clocks
-
-      std::string freq_bm_str =
-          std::bitset(freq_bitmask).to_string();
-
-      freq_bm_str.erase(0, std::min(freq_bm_str.find_first_not_of('0'),
-                                                       freq_bm_str.size()-1));
-
-      IF_VERB(STANDARD) {
-        std::cout << "Setting frequency mask for clock " << rsmi_clk <<
-            " to 0b" << freq_bm_str << " ..." << std::endl;
-      }
-      ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, freq_bitmask);
-      CHK_ERR_ASRT(ret)
-
-      ret = rsmi_dev_gpu_clk_freq_get(dv_ind, rsmi_clk, &f);
-      CHK_ERR_ASRT(ret)
-
-      IF_VERB(STANDARD) {
-        std::cout << "Frequency is now index " << f.current << std::endl;
-        std::cout << "Resetting mask to all frequencies." << std::endl;
-      }
-      ret = rsmi_dev_gpu_clk_freq_set(dv_ind, rsmi_clk, 0xFFFFFFFF);
-      CHK_ERR_ASRT(ret)
-
-      ret = rsmi_dev_perf_level_set(dv_ind, RSMI_DEV_PERF_LEVEL_AUTO);
-      CHK_ERR_ASRT(ret)
-#endif
     }
   }
 }