diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 6a9a003561..3ff760b83c 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -72,7 +72,7 @@ endif() ## Compiler flags set(CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti") + "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17") if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index a3396ae4a2..9b169f9827 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -363,16 +363,16 @@ typedef rsmi_clk_type_t rsmi_clk_type; */ typedef enum { RSMI_COMPUTE_PARTITION_INVALID = 0, - RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with - //!< shared memory - RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work - //!< together with shared memory - RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs - //!< work together with shared memory - RSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs - //!< work together with shared memory + RSMI_COMPUTE_PARTITION_CPX = 1, //!< Core mode (CPX)- Per-chip XCC with + //!< shared memory + RSMI_COMPUTE_PARTITION_SPX = 2, //!< Single GPU mode (SPX)- All XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_DPX = 3, //!< Dual GPU mode (DPX)- Half XCCs work + //!< together with shared memory + RSMI_COMPUTE_PARTITION_TPX = 4, //!< Triple GPU mode (TPX)- One-third XCCs + //!< work together with shared memory + RSMI_COMPUTE_PARTITION_QPX = 5, //!< Quad GPU mode (QPX)- Quarter XCCs + //!< work together with shared memory } rsmi_compute_partition_type_t; /// \cond Ignore in 
docs. typedef rsmi_compute_partition_type_t rsmi_compute_partition_type; @@ -3783,6 +3783,8 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition, * unavailable for current device * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not * support this function + * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired + * because it is already being used - device is busy * */ rsmi_status_t @@ -3802,6 +3804,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, * @retval ::RSMI_STATUS_PERMISSION function requires root access * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not * support this function + * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired + * because it is already being used - device is busy * */ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind); @@ -3866,6 +3870,8 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition, * support this function * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart * the amdgpu driver + * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired + * because it is already being used - device is busy * */ rsmi_status_t @@ -3887,6 +3893,8 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, * support this function * @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart * the amdgpu driver + * @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired + * because it is already being used - device is busy * */ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind); diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 0880350a3e..21b8101407 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -260,7 +260,8 @@ class Device { std::vector *retVec); int 
readDevInfoBinary(DevInfoTypes type, std::size_t b_size, void *p_binary_data); - int writeDevInfoStr(DevInfoTypes type, std::string valStr); + int writeDevInfoStr(DevInfoTypes type, std::string valStr, + bool returnWriteErr = false); rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query); diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h index 90c7f6ff3b..e13ea003ba 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_kfd.h @@ -84,6 +84,9 @@ class KFDNode { int get_total_memory(uint64_t* total); int get_used_memory(uint64_t* used); + // Get gfx target version from kfd + int get_gfx_target_version(uint64_t* gfx_target_version); + private: uint32_t node_indx_; uint32_t amdgpu_dev_index_; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h index 9c1ef0290d..9d255d9c90 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_utils.h @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -97,10 +99,10 @@ rsmi_status_t GetDevBinaryBlob(amd::smi::DevInfoTypes type, uint32_t dv_ind, std::size_t b_size, void* p_binary_data); rsmi_status_t ErrnoToRsmiStatus(int err); -std::string getRSMIStatusString(rsmi_status_t ret); +std::string getRSMIStatusString(rsmi_status_t ret, bool fullStatus = true); std::tuple + std::string, std::string, std::string, std::string, std::string> getSystemDetails(void); void logSystemDetails(void); rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str); @@ -109,11 +111,20 @@ void logHexDump(const char *desc, const void *addr, const size_t len, bool isSystemBigEndian(); std::string getBuildType(); std::string getMyLibPath(); +std::string 
getFileCreationDate(std::string path); int subDirectoryCountInPath(const std::string path); +std::queue getAllDeviceGfxVers(); std::string monitor_type_string(amd::smi::MonitorTypes type); std::string power_type_string(RSMI_POWER_TYPE type); +std::string splitString(std::string str, char delim); +std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv); +std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions, + rsmi_freq_volt_region_t *regions); +bool is_sudo_user(); +rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, + std::string *gfx_version); template -std::string print_int_as_hex(T i, bool showHexNotation=true) { + std::string print_int_as_hex(T i, bool showHexNotation = true) { std::stringstream ss; if (showHexNotation) { ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex; @@ -132,7 +143,7 @@ std::string print_int_as_hex(T i, bool showHexNotation=true) { } ss << std::dec; return ss.str(); -}; +} template std::string print_unsigned_int(T i) { @@ -263,7 +274,7 @@ class ScopedAcquire { LockType* lock_; bool doRelease; /// @brief: Disable copiable and assignable ability. 
- DISALLOW_COPY_AND_ASSIGN(ScopedAcquire); + DISALLOW_COPY_AND_ASSIGN(ScopedAcquire) }; } // namespace smi diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py index 6af7ac098b..6c1de2d7a7 100755 --- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py +++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py @@ -966,6 +966,9 @@ def resetComputePartition(deviceList): printLog(device, 'Permission denied', None) elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None) + elif ret == rsmi_status_t.RSMI_STATUS_BUSY: + printLog(device, 'Device is currently busy, try again later', + None) else: rsmi_ret_ok(ret, device, 'reset_compute_partition') printErrLog(device, 'Failed to reset the compute partition to boot state') @@ -1002,6 +1005,9 @@ def resetMemoryPartition(deviceList): printLog(device, 'Permission denied', None, addExtraLine) elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_BUSY: + printLog(device, 'Device is currently busy, try again later', + None) else: rsmi_ret_ok(ret, device, 'reset_memory_partition') printErrLog(device, 'Failed to reset memory partition to boot state') @@ -1603,6 +1609,9 @@ def setComputePartition(deviceList, computePartitionType): %computePartitionType, None) elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None) + elif ret == rsmi_status_t.RSMI_STATUS_BUSY: + printLog(device, 'Device is currently busy, try again later', + None) else: rsmi_ret_ok(ret, device, 'set_compute_partition') printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.') @@ -1673,6 +1682,9 @@ def setMemoryPartition(deviceList, memoryPartition): printLog(device, 'Permission denied', None, addExtraLine) elif ret == 
rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: printLog(device, 'Not supported on the given system', None, addExtraLine) + elif ret == rsmi_status_t.RSMI_STATUS_BUSY: + printLog(device, 'Device is currently busy, try again later', + None, addExtraLine) else: rsmi_ret_ok(ret, device, 'set_memory_partition') printErrLog(device, 'Failed to retrieve memory partition, even though device supports it.') diff --git a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc index fa01b42978..0515d75d3c 100755 --- a/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc +++ b/projects/rocm-smi-lib/rocm_smi/example/rocm_smi_example.cc @@ -156,6 +156,15 @@ } \ } +void print_function_header_with_rsmi_ret( + rsmi_status_t myReturn, std::string header = "") { + std::cout << "\t** "; + if (!header.empty()) { + std::cout << header << ": "; + } + std::cout << amd::smi::getRSMIStatusString(myReturn, false) << "\n"; +} + static void print_test_header(const char *str, uint32_t dv_ind) { std::cout << "********************************" << "\n"; std::cout << "*** " << str << "\n"; @@ -254,14 +263,24 @@ perf_level_string(rsmi_dev_perf_level_t perf_lvl) { } } -static bool isUserRunningAsSudo() { - bool isRunningWithSudo = false; - auto myUID = getuid(); - auto myPrivledges = geteuid(); - if ((myUID == myPrivledges) && (myPrivledges == 0)) { - isRunningWithSudo = true; +static const std::string +clock_type_string(rsmi_clk_type_t clk) { + switch (clk) { + case RSMI_CLK_TYPE_SYS: + return "RSMI_CLK_TYPE_SYS"; + case RSMI_CLK_TYPE_DF: + return "RSMI_CLK_TYPE_DF"; + case RSMI_CLK_TYPE_DCEF: + return "RSMI_CLK_TYPE_DCEF"; + case RSMI_CLK_TYPE_SOC: + return "RSMI_CLK_TYPE_SOC"; + case RSMI_CLK_TYPE_MEM: + return "RSMI_CLK_TYPE_MEM"; + case RSMI_CLK_TYPE_PCIE: + return "RSMI_CLK_TYPE_PCIE"; + default: + return "RSMI_CLK_INVALID"; } - return isRunningWithSudo; } static bool isFileWritable(rsmi_status_t response) { @@ -271,7 +290,7 @@ 
static bool isFileWritable(rsmi_status_t response) { // isFileWritable(ret) - intends to capture this // response situation. bool fileWritable = true; - if (isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION)) { + if (amd::smi::is_sudo_user() && (response == RSMI_STATUS_PERMISSION)) { std::cout << "[WARN] User is running with sudo " << "permissions, file is not writable." << "\n"; fileWritable = false; @@ -574,9 +593,19 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) { } static void print_frequencies(rsmi_frequencies_t *f) { - assert(f != nullptr); + bool hasDeepSleep = false; + if (f == nullptr) { + std::cout << "Freq was nullptr\n"; + return; + } for (uint32_t j = 0; j < f->num_supported; ++j) { - std::cout << "\t** " << j << ": " << std::to_string(f->frequency[j]); + if (f->has_deep_sleep && j == 0) { + std::cout << "\t** S: " << std::to_string(f->frequency[j]); + hasDeepSleep = true; + } else { + std::cout << "\t** " << (hasDeepSleep ? j-1 : j) + << ": " << std::to_string(f->frequency[j]); + } if (j == f->current) { std::cout << " *"; } @@ -714,6 +743,7 @@ int main() { rsmi_frequencies_t f; uint32_t num_monitor_devs = 0; rsmi_gpu_metrics_t p; + std::string val_str; RSMI_POWER_TYPE power_type = RSMI_INVALID_POWER; rsmi_num_monitor_devices(&num_monitor_devs); @@ -725,6 +755,8 @@ int main() { ret = rsmi_dev_revision_get(i, &val_ui16); CHK_RSMI_RET_I(ret) std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << "\n"; + ret = amd::smi::rsmi_get_gfx_target_version(i , &val_str); + std::cout << "\t**Target Graphics Version: " << val_str << "\n"; char current_compute_partition[256]; current_compute_partition[0] = '\0'; @@ -736,7 +768,7 @@ int main() { ? 
"UNKNOWN" : current_compute_partition); if (ret != RSMI_STATUS_SUCCESS) { std::cout << ", RSMI_STATUS = "; -} else { + } else { std::cout << "\n"; } CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) @@ -773,8 +805,38 @@ int main() { } ret = rsmi_dev_gpu_metrics_info_get(i, &p); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**GPU METRICS" << "\n"; + print_test_header("GPU METRICS", i); + print_function_header_with_rsmi_ret(ret, + "rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &p)"); + std::cout << "\t**p.average_gfxclk_frequency: " << std::dec + << p.average_gfxclk_frequency << "\n"; + std::cout << "\t**p.average_socclk_frequency: " << std::dec + << p.average_socclk_frequency << "\n"; + std::cout << "\t**p.average_uclk_frequency: " << std::dec + << p.average_uclk_frequency << "\n"; + std::cout << "\t**p.average_vclk0_frequency: " << std::dec + << p.average_vclk0_frequency << "\n"; + std::cout << "\t**p.average_dclk0_frequency: " << std::dec + << p.average_dclk0_frequency << "\n"; + std::cout << "\t**p.average_vclk1_frequency: " << std::dec + << p.average_vclk1_frequency << "\n"; + std::cout << "\t**p.average_dclk1_frequency: " << std::dec + << p.average_dclk1_frequency << "\n"; + + std::cout << "\t**p.current_gfxclk: " << std::dec + << p.current_gfxclk << "\n"; + std::cout << "\t**p.current_socclk: " << std::dec + << p.current_socclk << "\n"; + std::cout << "\t**p.current_uclk: " << std::dec + << p.current_uclk << "\n"; + std::cout << "\t**p.current_vclk0: " << std::dec + << p.current_vclk0 << "\n"; + std::cout << "\t**p.current_dclk0: " << std::dec + << p.current_dclk0 << "\n"; + std::cout << "\t**p.current_vclk1: " << std::dec + << p.current_vclk1 << "\n"; + std::cout << "\t**p.current_dclk1: " << std::dec + << p.current_dclk1 << "\n"; ret = rsmi_dev_perf_level_get(i, &pfl); CHK_AND_PRINT_RSMI_ERR_RET(ret) @@ -784,25 +846,25 @@ int main() { CHK_AND_PRINT_RSMI_ERR_RET(ret) std::cout << "\t**OverDrive Level:" << val_ui32 << "\n"; - ret = 
rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &f); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Supported GPU Memory clock frequencies: "; - std::cout << f.num_supported << "\n"; - print_frequencies(&f); - - ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f); - CHK_AND_PRINT_RSMI_ERR_RET(ret) - std::cout << "\t**Supported GPU clock frequencies: "; - std::cout << f.num_supported << "\n"; - print_frequencies(&f); - - ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SOC, &f); - CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret) - std::cout << "\t**Supported GPU clock frequencies (SOC clk): "; - std::cout << f.num_supported << "\n"; - std::cout << "\t**Current value (SOC clk): "; - std::cout << f.current << "\n"; - print_frequencies(&f); + print_test_header("GPU Clocks", i); + for (int clkType = static_cast<int>(RSMI_CLK_TYPE_SYS); + clkType <= static_cast<int>(RSMI_CLK_TYPE_PCIE); + clkType++) { + rsmi_clk_type_t type = static_cast<rsmi_clk_type_t>(clkType); + ret = rsmi_dev_gpu_clk_freq_get(i, type, &f); + print_function_header_with_rsmi_ret(ret, + "rsmi_dev_gpu_clk_freq_get(" + std::to_string(i) + + ", " + clock_type_string(type) + ", &f)"); + if (ret != RSMI_STATUS_SUCCESS) { + continue; + } + std::cout << "\t** " << clock_type_string(type) + << " - Supported # of freqs: "; + std::cout << f.num_supported << "\n"; + std::cout << "\t** " << clock_type_string(type) << " f.current: " + << f.current << "\n"; + print_frequencies(&f); + } std::cout << "\t**Monitor name: "; char name[128]; @@ -892,7 +954,7 @@ int main() { } std::cout << "***** Testing write api's" << "\n"; - if (isUserRunningAsSudo() == false) { + if (amd::smi::is_sudo_user() == false) { std::cout << "Write APIs require users to execute with sudo. " << "Cannot proceed." 
<< "\n"; return 0; diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 93536bedc9..9a7bd5d3ac 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -929,6 +929,9 @@ rsmi_status_t rsmi_perf_determinism_mode_set(uint32_t dv_ind, uint64_t clkvalue) { TRY DEVICE_MUTEX + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); // Set perf. level to performance determinism so that we can then set the power profile rsmi_status_t ret = rsmi_dev_perf_level_set_v1(dv_ind, @@ -1510,6 +1513,9 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, static void get_vc_region(uint32_t start_ind, std::vector *val_vec, rsmi_freq_volt_region_t *p) { + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); assert(p != nullptr); assert(val_vec != nullptr); THROW_IF_NULLPTR_DEREF(p) @@ -1520,6 +1526,9 @@ static void get_vc_region(uint32_t start_ind, assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) || ((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) { + ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA); + LOG_TRACE(ss); throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); } od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); @@ -1539,6 +1548,7 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, TRY std::vector val_vec; rsmi_status_t ret; + std::ostringstream ss; assert(num_regions != nullptr); assert(p != nullptr); @@ -1547,12 +1557,20 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec); if (ret != RSMI_STATUS_SUCCESS) { + ss << __PRETTY_FUNCTION__ + << " | Issue: could not retreive 
kDevPowerODVoltage" << "; returning " + << getRSMIStatusString(ret); + LOG_ERROR(ss); return ret; } // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. if (val_vec.size() < 2) { + ss << __PRETTY_FUNCTION__ + << " | Issue: val_vec.size() < 2" << "; returning " + << getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED); + LOG_ERROR(ss); return RSMI_STATUS_NOT_YET_IMPLEMENTED; } @@ -1560,8 +1578,17 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); assert((val_vec_size - kOD_VDDC_CURVE_start_index)%2 == 0); + ss << __PRETTY_FUNCTION__ + << " | val_vec_size = " << std::dec + << val_vec_size + << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; + LOG_DEBUG(ss); if (((val_vec_size - kOD_VDDC_CURVE_start_index) <= 0) || (((val_vec_size - kOD_VDDC_CURVE_start_index)%2 != 0))) { + ss << __PRETTY_FUNCTION__ << " | Issue: od vdd curve returned unexpected " + << "data" << "; returning " + << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_SIZE); + LOG_ERROR(ss); throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__); } @@ -2749,6 +2776,9 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); DEVICE_MUTEX + if (odv == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } CHK_SUPPORT_NAME_ONLY(odv) rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv); @@ -2779,7 +2809,7 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) { TRY std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << "| ======= start ======="; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; LOG_TRACE(ss); CHK_SUPPORT_NAME_ONLY((num_regions == nullptr || buffer == nullptr) ? 
@@ -2791,6 +2821,12 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, DEVICE_MUTEX rsmi_status_t ret = get_od_clk_volt_curve_regions(dv_ind, num_regions, buffer); + if (*num_regions == 0) { + ret = RSMI_STATUS_NOT_SUPPORTED; + } + ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " + << getRSMIStatusString(ret); + LOG_TRACE(ss); return ret; CATCH } @@ -4468,7 +4504,7 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) { return ret; } - switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) { + switch (mapStringToRSMIComputePartitionTypes.at(compute_partition_str)) { case RSMI_COMPUTE_PARTITION_CPX: case RSMI_COMPUTE_PARTITION_SPX: case RSMI_COMPUTE_PARTITION_DPX: @@ -4585,9 +4621,12 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, ss << __PRETTY_FUNCTION__ << "| ======= start ======="; LOG_TRACE(ss); REQUIRE_ROOT_ACCESS + if (!amd::smi::is_sudo_user()) { + return RSMI_STATUS_PERMISSION; + } DEVICE_MUTEX std::string newComputePartitionStr - = mapRSMIToStringComputePartitionTypes[compute_partition]; + = mapRSMIToStringComputePartitionTypes.at(compute_partition); std::string currentComputePartition; switch (compute_partition) { @@ -4605,6 +4644,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Device #: " << dv_ind << " | Type: " << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << " | Data: " << newComputePartitionStr << " | Cause: requested setting was invalid" << " | Returning = " << getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |"; @@ -4623,6 +4663,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, << " | Device #: " << dv_ind << " | Type: " << devInfoTypesStrings.at(amd::smi::kDevComputePartition) + << " | Data: " << newComputePartitionStr << " | Cause: not an available compute partition setting" << " | Returning = " << getRSMIStatusString(available_ret) << " |"; @@ -4650,7 +4691,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, return ret_get; } 
rsmi_compute_partition_type_t currRSMIComputePartition - = mapStringToRSMIComputePartitionTypes[currentComputePartition]; + = mapStringToRSMIComputePartitionTypes.at(currentComputePartition); if (currRSMIComputePartition == compute_partition) { ss << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -4665,6 +4706,15 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind, return RSMI_STATUS_SUCCESS; } + ss << __PRETTY_FUNCTION__ << " | about to try writing |" + << newComputePartitionStr + << "| size of string = " << newComputePartitionStr.size() + << "| size of c-string = "<< std::dec + << sizeof(newComputePartitionStr.c_str())/sizeof(newComputePartitionStr[0]) + << "| sizeof string = " << std::dec + << sizeof(newComputePartitionStr); + LOG_DEBUG(ss); + GET_DEV_FROM_INDX int ret = dev->writeDevInfo(amd::smi::kDevComputePartition, newComputePartitionStr); @@ -4699,7 +4749,7 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind, return ret; } - switch (mapStringToMemoryPartitionTypes[val_str]) { + switch (mapStringToMemoryPartitionTypes.at(val_str)) { case RSMI_MEMORY_PARTITION_NPS1: case RSMI_MEMORY_PARTITION_NPS2: case RSMI_MEMORY_PARTITION_NPS4: @@ -4755,7 +4805,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, } std::string newMemoryPartition - = mapRSMIToStringMemoryPartitionTypes[memory_partition]; + = mapRSMIToStringMemoryPartitionTypes.at(memory_partition); std::string currentMemoryPartition; switch (memory_partition) { @@ -4798,7 +4848,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind, return ret_get; } rsmi_memory_partition_type_t currRSMIMemoryPartition - = mapStringToMemoryPartitionTypes[currentMemoryPartition]; + = mapStringToMemoryPartitionTypes.at(currentMemoryPartition); if (currRSMIMemoryPartition == memory_partition) { ss << __PRETTY_FUNCTION__ << " | ======= end ======= " @@ -4942,7 +4992,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) { // Likely due to device not supporting it if (bootState != "UNKNOWN") { 
rsmi_compute_partition_type_t compute_partition = - mapStringToRSMIComputePartitionTypes[bootState]; + mapStringToRSMIComputePartitionTypes.at(bootState); ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition); } ss << __PRETTY_FUNCTION__ @@ -4981,7 +5031,7 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) { // Likely due to device not supporting it if (bootState != "UNKNOWN") { rsmi_memory_partition_type_t memory_partition = - mapStringToMemoryPartitionTypes[bootState]; + mapStringToMemoryPartitionTypes.at(bootState); ret = rsmi_dev_memory_partition_set(dv_ind, memory_partition); } ss << __PRETTY_FUNCTION__ diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index bd3ac5da9d..bd357c35d1 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -598,14 +598,17 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { int ret = isRegularFile(sysfs_path, &reg_file); if (ret != 0) { - ss << "File did not exist - SYSFS file (" << sysfs_path + ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file (" + << sysfs_path << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; } if (!reg_file) { - ss << "File is not a regular file - SYSFS file (" << sysfs_path << ") for " + ss << __PRETTY_FUNCTION__ + << " | Issue: File is not a regular file - SYSFS file (" + << sysfs_path << ") for " << "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")," << " returning ENOENT (" << std::strerror(ENOENT) << ")"; LOG_ERROR(ss); @@ -615,7 +618,8 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { fs->open(sysfs_path); if (!fs->is_open()) { - ss << "Could not open - SYSFS file (" << sysfs_path << ") for " + ss << __PRETTY_FUNCTION__ + << " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for " 
<< "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), " << ", returning " << std::to_string(errno) << " (" << std::strerror(errno) << ")"; @@ -623,7 +627,8 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { return errno; } - ss << "Successfully opened SYSFS file (" << sysfs_path + ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file (" + << sysfs_path << ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << ")"; LOG_INFO(ss); @@ -671,32 +676,51 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) { ret = openSysfsFileStream(type, &fs); if (ret != 0) { ss << "Could not read device info string for DevInfoType (" - << RocmSMI::devInfoTypesStrings.at(type)<< "), returning " + << RocmSMI::devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); return ret; } fs >> *retStr; - std::string info = "Successfully read device info string for DevInfoType (" + - RocmSMI::devInfoTypesStrings.at(type) + "): " + - *retStr; - LOG_INFO(info); fs.close(); - + ss << __PRETTY_FUNCTION__ + << "Successfully read device info string for DevInfoType (" + + RocmSMI::devInfoTypesStrings.at(type) + "): " + *retStr + << " | " + << (fs.is_open() ? " File stream is opened" : " File stream is closed") + << " | " << (fs.bad() ? "[ERROR] Bad read operation" : + "[GOOD] No bad bit read, successful read operation") + << " | " << (fs.fail() ? "[ERROR] Failed read - format error" : + "[GOOD] No fail - Successful read operation") + << " | " << (fs.eof() ? "[ERROR] Failed read - EOF error" : + "[GOOD] No eof error - Successful read operation") + << " | " << (fs.good() ? 
"[GOOD] read good - Successful read operation" : + "[ERROR] Failed read - good error"); + LOG_INFO(ss); return 0; } -int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { - auto tempPath = path_; +int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr, + bool returnWriteErr) { + // returnWriteErr = false, backwards compatability (old calls) + // returnWriteErr = true, improvement - allows us to detect errors + // when writing to file + // (such as EBUSY) + auto sysfs_path = path_; + sysfs_path += "/device/"; + sysfs_path += kDevAttribNameMap.at(type); std::ofstream fs; int ret; std::ostringstream ss; - fs.rdbuf()->pubsetbuf(nullptr,0); + fs.flush(); + fs.rdbuf()->pubsetbuf(0, 0); ret = openSysfsFileStream(type, &fs, valStr.c_str()); if (ret != 0) { - ss << "Could not write device info string (" << valStr + fs.close(); + ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; " + << "Could not write device info string (" << valStr << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), returning " << std::to_string(ret); LOG_ERROR(ss); @@ -705,19 +729,39 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) { // We'll catch any exceptions in rocm_smi.cc code. 
if (fs << valStr) { + fs.flush(); + fs.close(); ss << "Successfully wrote device info string (" << valStr << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), returning RSMI_STATUS_SUCCESS"; LOG_INFO(ss); ret = RSMI_STATUS_SUCCESS; } else { - ss << "Could not write device info string (" << valStr + if (returnWriteErr) { + ret = errno; + } else { + ret = RSMI_STATUS_NOT_SUPPORTED; + } + fs.flush(); + fs.close(); + ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; " + << "Could not write device info string (" << valStr << ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type) - << "), returning RSMI_STATUS_NOT_SUPPORTED"; + << "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret)); + ss << " | " + << (fs.is_open() ? "[ERROR] File stream open" : + "[GOOD] File stream closed") + << " | " << (fs.bad() ? "[ERROR] Bad write operation" : + "[GOOD] No bad bit write, successful write operation") + << " | " << (fs.fail() ? "[ERROR] Failed write - format error" : + "[GOOD] No fail - Successful write operation") + << " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" : + "[GOOD] No eof error - Successful write operation") + << " | " << (fs.good() ? 
+ "[GOOD] Write good - Successful write operation" : + "[ERROR] Failed write - good error"); LOG_ERROR(ss); - ret = RSMI_STATUS_NOT_SUPPORTED; } - fs.close(); return ret; } @@ -756,6 +800,9 @@ int Device::writeDevInfo(DevInfoTypes type, uint64_t val) { } int Device::writeDevInfo(DevInfoTypes type, std::string val) { + auto sysfs_path = path_; + sysfs_path += "/device/"; + sysfs_path += kDevAttribNameMap.at(type); switch (type) { case kDevGPUMClk: case kDevDCEFClk: @@ -764,9 +811,10 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) { case kDevPCIEClk: case kDevPowerODVoltage: case kDevSOCClk: + return writeDevInfoStr(type, val); case kDevComputePartition: case kDevMemoryPartition: - return writeDevInfoStr(type, val); + return writeDevInfoStr(type, val, true); default: return EINVAL; @@ -899,6 +947,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { std::string tempStr; int ret; int tmp_val; + std::ostringstream ss; switch (type) { case kDevDevID: diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc index 6123eae147..b055876398 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc @@ -496,6 +496,12 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) { // a specific version. 
*smu = {}; + uint8_t dev_content_revision = dev->gpu_metrics_ver().content_revision; + if (dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_1 && + dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_2 && + dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_3) { + return RSMI_STATUS_NOT_SUPPORTED; // matches none of the three revisions + } if (dev->gpu_metrics_ver().content_revision == RSMI_GPU_METRICS_API_CONTENT_VER_1) { ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind, diff --git a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc index 40984b430b..3109781e39 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_kfd.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_kfd.cc @@ -971,5 +971,28 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) { return retVal; } +// /sys/class/kfd/kfd/topology/nodes/*/properties | grep gfx_target_version +// Reads "gfx_target_version" for this node through read_node_properties(), +// stores the value in *gfx_target_version and returns that call's result. +int KFDNode::get_gfx_target_version(uint64_t *gfx_target_version) { + std::ostringstream ss; + std::string properties_path = "/sys/class/kfd/kfd/topology/nodes/" + + std::to_string(this->node_indx_) + "/properties"; + uint64_t gfx_version = 0; + int ret = read_node_properties(this->node_indx_, "gfx_target_version", + &gfx_version); + *gfx_target_version = gfx_version; + ss << __PRETTY_FUNCTION__ + << " | File: " << properties_path + << " | Successfully read node #" << std::to_string(this->node_indx_) + << " for gfx_target_version" + << " | Data (gfx_target_version) *gfx_target_version = " + << std::to_string(*gfx_target_version) + << " | return = " << std::to_string(ret) + << " | "; + LOG_DEBUG(ss); + return ret; +} + } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 3647f8e35d..44b40646d3 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -445,6 +445,12 @@ RocmSMI::Initialize(uint64_t flags) { // store each device boot partition state, if file doesn't exist 
dev->storeDevicePartitions(dv_ind); } + + // Assists displaying GPU information after device enumeration + // Otherwise GPU related info will not be discoverable + if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) { + logSystemDetails(); + } // Leaving below to help debug temp file issues // displayAppTmpFilesContent(); std::string amdGPUDeviceList = displayAllDevicePaths(devices_); diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 3d74c7e7f1..680352055b 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -599,9 +599,19 @@ std::tuple readTmpFile(uint32_t dv_ind, } // wrapper to return string expression of a rsmi_status_t return -std::string getRSMIStatusString(rsmi_status_t ret) { +// rsmi_status_t ret - return value of RSMI API function +// bool fullStatus - defaults to true, set to false to chop off description +// Returns: +// string - if fullStatus == true, returns full decription of return value +// ex. 'RSMI_STATUS_SUCCESS: The function has been executed successfully.' +// string - if fullStatus == false, returns a minimalized return value +// ex. 'RSMI_STATUS_SUCCESS' +std::string getRSMIStatusString(rsmi_status_t ret, bool fullStatus) { const char *err_str; rsmi_status_string(ret, &err_str); + if (!fullStatus) { + return splitString(std::string(err_str), ':'); + } return std::string(err_str); } @@ -620,9 +630,13 @@ std::string getRSMIStatusString(rsmi_status_t ret) { // Expressed as big endian or little endian. 
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first) // Little Endian (LE), multi-bit symbols encoded as little endian (LSB first) +// string rocm_lib_path = Path to library +// string rocm_build_type = Release or debug +// string rocm_build_date = Creation date of library +// string dev_gfx_versions = GPU target graphics version std::tuple + std::string, std::string, std::string, std::string, std::string> getSystemDetails(void) { struct utsname buf; bool errorDetected = false; @@ -637,7 +651,9 @@ std::tuple devGraphicsVersions = getAllDeviceGfxVers(); + if (devGraphicsVersions.empty() == false) { + dev_gfx_versions = ""; + while (devGraphicsVersions.empty() == false) { + dev_gfx_versions += "\n\t" + devGraphicsVersions.front(); + devGraphicsVersions.pop(); + } + } return std::make_tuple(errorDetected, sysname, nodename, release, version, machine, domainName, os_distribution, endianness, rocm_build_type, rocm_lib_path, - rocm_env_variables); + rocm_build_date, rocm_env_variables, dev_gfx_versions); } // If logging is enabled through RSMI_LOGGING environment variable. 
@@ -687,10 +712,11 @@ void logSystemDetails(void) {
   std::ostringstream ss;
   bool errorDetected;
   std::string sysname, node, release, version, machine, domain, distName,
-              endianness, rocm_build_type, lib_path, rocm_env_vars;
+              endianness, rocm_build_type, lib_path, build_date, rocm_env_vars,
+              dev_gfx_versions;
   std::tie(errorDetected, sysname, node, release, version, machine, domain,
-           distName, endianness, rocm_build_type, lib_path,
-           rocm_env_vars) = getSystemDetails();
+           distName, endianness, rocm_build_type, lib_path, build_date,
+           rocm_env_vars, dev_gfx_versions) = getSystemDetails();
   if (errorDetected == false) {
     ss << "====== Gathered system details ============\n"
        << "SYSTEM NAME: " << sysname << "\n"
@@ -703,7 +729,9 @@ void logSystemDetails(void) {
        << "ENDIANNESS: " << endianness << "\n"
        << "ROCM BUILD TYPE: " << rocm_build_type << "\n"
        << "ROCM-SMI-LIB PATH: " << lib_path << "\n"
-       << "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
+       << "ROCM-SMI-LIB BUILD DATE: " << build_date << "\n"
+       << "ROCM ENV VARIABLES: " << rocm_env_vars
+       << "AMD GFX VERSIONS: " << dev_gfx_versions << "\n";
     LOG_INFO(ss);
   } else {
     ss << "====== Gathered system details ============\n"
@@ -831,6 +859,21 @@ std::string getMyLibPath(void) {
   return path;
 }
 
+// Formats the st_ctime timestamp of the file at `path` with asctime() and
+// passes the text through removeNewLines(). Returns "Unknown" when stat()
+// fails or localtime() cannot convert the timestamp.
+std::string getFileCreationDate(std::string path) {
+  struct stat t_stat;
+  if (stat(path.c_str(), &t_stat) != 0) {
+    return "Unknown";
+  }
+  struct tm *timeinfo = localtime(&t_stat.st_ctime);  // NOLINT
+  if (timeinfo == nullptr) {
+    return "Unknown";
+  }
+  return removeNewLines(std::string(asctime(timeinfo)));  // NOLINT
+}
+
 rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str) {
   auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
 
@@ -974,5 +1017,157 @@ std::string power_type_string(RSMI_POWER_TYPE type) {
   return powerTypesToString.at(type);
 }
 
+// Returns the part of `str` that precedes the first occurrence of `delim`
+// (the whole string when `delim` is absent, "" when `str` is empty).
+std::string splitString(std::string str, char delim) {
+  // substr(0, npos) yields the entire string, so no separate "not found"
+  // branch is needed and every path returns a value.
+  return str.substr(0, str.find(delim));
+}
+
+static std::string pt_rng_Mhz(std::string title, rsmi_range *r) {
+  std::ostringstream ss;
+  if (r == nullptr) {
+    ss << "pt_rng_Mhz | rsmi_range r = nullptr\n";
+    return ss.str();
+  }
+
+  ss << title;
+  ss << r->lower_bound/1000000 << " to "
+     << r->upper_bound/1000000 << " MHz" << "\n";
+  return ss.str();
+}
+
+static std::string pt_rng_mV(std::string title, rsmi_range *r) {
+  std::ostringstream ss;
+  if (r == nullptr) {
+    ss << "pt_rng_mV | rsmi_range r = nullptr\n";
+    return ss.str();
+  }
+
+  ss << title;
+  ss << r->lower_bound << " to " << r->upper_bound
+     << " mV" << "\n";
+  return ss.str();
+}
+
+static std::string print_pnt(rsmi_od_vddc_point_t *pt) {
+  std::ostringstream ss;
+  ss << "\t\t** Frequency: " << pt->frequency/1000000 << " MHz\n";
+  ss << "\t\t** Voltage: " << pt->voltage << " mV\n";
+  return ss.str();
+}
+static std::string pt_vddc_curve(rsmi_od_volt_curve *c) {
+  std::ostringstream ss;
+  if (c == nullptr) {
+    ss << "pt_vddc_curve | rsmi_od_volt_curve c = nullptr\n";
+    return ss.str();
+  }
+
+  for (uint32_t i = 0; i < RSMI_NUM_VOLTAGE_CURVE_POINTS; ++i) {
+    ss << print_pnt(&c->vc_points[i]);
+  }
+  return ss.str();
+}
+
+std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) {
+  std::ostringstream ss;
+  if (odv == nullptr) {
+    ss << "rsmi_od_volt_freq_data_t odv = nullptr\n";
+    return ss.str();
+  }
+
+  ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range);
+  ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range);
+  ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ",
+                   &odv->sclk_freq_limits);
+  ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ",
+                   &odv->mclk_freq_limits);
+
+  ss << "\t**Current Freq/Volt. curve: " << "\n";
+  ss << pt_vddc_curve(&odv->curve);
+
+  ss << "\t**Number of Freq./Volt.
regions: " << odv->num_regions << "\n\n";
+  return ss.str();
+}
+
+std::string print_odv_region(rsmi_freq_volt_region_t *region) {
+  std::ostringstream ss;  // region is dereferenced without a null check
+  ss << pt_rng_Mhz("\t\tFrequency range: ", &region->freq_range);
+  ss << pt_rng_mV("\t\tVoltage range: ", &region->volt_range);
+  return ss.str();
+}
+
+std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions,
+                                  rsmi_freq_volt_region_t *regions) {
+  std::ostringstream ss;
+  if (regions == nullptr) {
+    ss << "rsmi_freq_volt_region_t regions = nullptr\n";
+    return ss.str();
+  }
+  for (uint32_t i = 0; i < num_regions; ++i) {
+    ss << "\tRegion " << i << ": " << "\n";
+    ss << print_odv_region(&regions[i]);
+  }
+  return ss.str();
+}
+
+bool is_sudo_user() {  // true only when real and effective UID are both 0
+  std::ostringstream ss;
+  bool isRunningWithSudo = false;
+  auto myUID = getuid();
+  auto myPrivledges = geteuid();
+  if ((myUID == myPrivledges) && (myPrivledges == 0)) {
+    isRunningWithSudo = true;
+  }
+  ss << __PRETTY_FUNCTION__ << (isRunningWithSudo ? " | running as sudoer" :
+                                " | NOT running as sudoer");
+  LOG_DEBUG(ss);
+  return isRunningWithSudo;
+}
+
+rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
+  std::string *gfx_version) {
+  std::ostringstream ss;
+  uint64_t kfd_gfx_version = 0;
+  GET_DEV_AND_KFDNODE_FROM_INDX
+
+  int ret = kfd_node->get_gfx_target_version(&kfd_gfx_version);
+  if (ret == 0) {
+    ss << "gfx" << kfd_gfx_version;
+    *gfx_version = ss.str();
+    return RSMI_STATUS_SUCCESS;
+  } else {
+    *gfx_version = "Unknown";  // fallback text when the KFD read fails
+    return RSMI_STATUS_NOT_SUPPORTED;
+  }
+}
+
+std::queue<std::string> getAllDeviceGfxVers() {
+  uint32_t num_monitor_devs = 0;
+  rsmi_status_t ret;
+  std::queue<std::string> deviceGfxVersions;  // text entries, device order
+  std::string response = "";
+  std::string dev_gfx_ver = "";
+
+  ret = rsmi_num_monitor_devices(&num_monitor_devs);
+  if (ret != RSMI_STATUS_SUCCESS || num_monitor_devs == 0) {
+    response = "N/A - No AMD devices detected";
+    deviceGfxVersions.push(response);
+    return deviceGfxVersions;
+  }
+
+  for (uint32_t i = 0; i < num_monitor_devs; ++i) {
+    ret =
amd::smi::rsmi_get_gfx_target_version(i , &dev_gfx_ver); + response = "Device[" + std::to_string(i) + "]: "; + if (ret != RSMI_STATUS_SUCCESS) { + deviceGfxVersions.push(response + getRSMIStatusString(ret, false)); + } else { + deviceGfxVersions.push(response + std::string(dev_gfx_ver)); + } + } + return deviceGfxVersions; +} + } // namespace smi } // namespace amd diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt index 2253327813..bd7c827ce4 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt @@ -21,6 +21,14 @@ message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib) message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin) message("") +## Compiler flags +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17") +if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -m64 -msse -msse2") +endif() + set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(RSMITST "rsmitst") diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc index 06297a9371..ebfc9488f7 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/computepartition_read_write.cc @@ -54,6 +54,7 @@ #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi_test/functional/computepartition_read_write.h" #include "rocm_smi_test/test_common.h" @@ -118,6 +119,24 @@ computePartitionString(rsmi_compute_partition_type computeParitionType) { } } +static void system_wait(int seconds) { + // Adding a delay - since changing partitions depends on gpus not + // being in an active state, we'll wait a few seconds before starting + 
// full testing + auto start = std::chrono::high_resolution_clock::now(); + int waitTime = seconds; + std::cout << "** Waiting for " + << std::dec << waitTime + << " seconds, for any GPU" + << " activity to clear up. **" << std::endl; + sleep(waitTime); + auto stop = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(stop - start); + std::cout << "** Waiting took " << duration.count() / 1000000 + << " seconds **" << std::endl; +} + static const std::map mapStringToRSMIComputePartitionTypes { {"CPX", RSMI_COMPUTE_PARTITION_CPX}, @@ -141,21 +160,7 @@ void TestComputePartitionReadWrite::Run(void) { // Confirm system supports compute partition, before executing wait ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255); if (ret == RSMI_STATUS_SUCCESS) { - // Adding a delay - since changing partitions depends on gpus not - // being in an active state, we'll wait a few seconds before starting - // full testing - auto start = std::chrono::high_resolution_clock::now(); - int waitTime = 20; - std::cout << "** Waiting for " - << std::dec << waitTime - << " seconds, for any GPU" - << " activity to clear up. 
**" << std::endl; - sleep(waitTime); - auto stop = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(stop - start); - std::cout << "** Waiting took " << duration.count() / 1000000 - << " seconds **" << std::endl; + system_wait(25); } for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { @@ -165,6 +170,7 @@ void TestComputePartitionReadWrite::Run(void) { } } PrintDeviceHeader(dv_ind); + bool devicePartitionUpdated = false; // Standard checks to see if API is supported, before running full tests ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition, @@ -231,9 +237,8 @@ void TestComputePartitionReadWrite::Run(void) { } // Verify api support checking functionality is working - rsmi_compute_partition_type_t newPartition - = rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID; - err = rsmi_dev_compute_partition_set(dv_ind, newPartition); + err = rsmi_dev_compute_partition_set(dv_ind, + RSMI_COMPUTE_PARTITION_INVALID); ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) || (err == RSMI_STATUS_NOT_SUPPORTED) || (err == RSMI_STATUS_PERMISSION)); @@ -270,27 +275,40 @@ void TestComputePartitionReadWrite::Run(void) { * //!< work together with shared memory */ - for (int partition = - rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_CPX; - partition <= rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_QPX; + for (int partition = static_cast(RSMI_COMPUTE_PARTITION_CPX); + partition <= static_cast(RSMI_COMPUTE_PARTITION_QPX); partition++) { - newPartition = static_cast(partition); + rsmi_compute_partition_type_t updatePartition + = static_cast(partition); IF_VERB(STANDARD) { std::cout << std::endl; std::cout << "\t**" << "======== TEST RSMI_COMPUTE_PARTITION_" - << computePartitionString(newPartition) + << computePartitionString(updatePartition) << " ===============" << std::endl; } + ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition); IF_VERB(STANDARD) { std::cout << "\t**" - << 
"Attempting to set compute partition to: " - << computePartitionString(newPartition) << std::endl; + << "rsmi_dev_compute_partition_set(dv_ind, updatePartition): " + << amd::smi::getRSMIStatusString(ret, false) << "\n" + << "\t**New Partition (set): " + << computePartitionString(updatePartition) << "\n"; } - ret = rsmi_dev_compute_partition_set(dv_ind, newPartition); + ASSERT_TRUE((ret == RSMI_STATUS_SETTING_UNAVAILABLE) + || (ret== RSMI_STATUS_PERMISSION) + || (ret == RSMI_STATUS_SUCCESS) + || ret == RSMI_STATUS_BUSY); + + if (ret == RSMI_STATUS_BUSY) { + IF_VERB(STANDARD) { + std::cout << "\t**Device is currently busy.. continue\n"; + } + system_wait(5); + continue; + } + bool isSettingUnavailable = false; - ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || - (ret == RSMI_STATUS_SETTING_UNAVAILABLE)); if (ret == RSMI_STATUS_SETTING_UNAVAILABLE) { isSettingUnavailable = true; } @@ -306,7 +324,7 @@ void TestComputePartitionReadWrite::Run(void) { } if (isSettingUnavailable) { ASSERT_EQ(RSMI_STATUS_SETTING_UNAVAILABLE, ret); - ASSERT_STRNE(computePartitionString(newPartition).c_str(), + ASSERT_STRNE(computePartitionString(updatePartition).c_str(), current_char_computePartition); IF_VERB(STANDARD) { std::cout << "\t**" @@ -314,23 +332,30 @@ void TestComputePartitionReadWrite::Run(void) { << "RSMI_STATUS_SETTING_UNAVAILABLE,\n\t current compute " << "partition (" << current_char_computePartition << ") did not update to (" - << computePartitionString(newPartition) << ")" + << computePartitionString(updatePartition) << ")" << std::endl; } } else { + if (strcmp(orig_char_computePartition, current_char_computePartition) != + 0) { + devicePartitionUpdated = true; + } else { + devicePartitionUpdated = false; + } + ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); - ASSERT_STREQ(computePartitionString(newPartition).c_str(), + ASSERT_STREQ(computePartitionString(updatePartition).c_str(), current_char_computePartition); IF_VERB(STANDARD) { std::cout << "\t**" << "Confirmed current compute 
partition (" << current_char_computePartition << ") matches" << "\n\t requested compute partition (" - << computePartitionString(newPartition) << ")" + << computePartitionString(updatePartition) << ")" << std::endl; } } - } + } // END looping through partition changes /* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */ IF_VERB(STANDARD) { @@ -342,8 +367,14 @@ void TestComputePartitionReadWrite::Run(void) { std::string oldPartition = current_char_computePartition; bool wasResetSuccess = false; ret = rsmi_dev_compute_partition_reset(dv_ind); + IF_VERB(STANDARD) { + std::cout << "\t**" + << "rsmi_dev_compute_partition_reset(dv_ind): " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) || - (ret == RSMI_STATUS_NOT_SUPPORTED)); + (ret == RSMI_STATUS_NOT_SUPPORTED) || + (ret == RSMI_STATUS_BUSY)); if (ret == RSMI_STATUS_SUCCESS) { wasResetSuccess = true; } @@ -352,9 +383,15 @@ void TestComputePartitionReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { std::cout << "\t**" << "Current compute partition: " - << current_char_computePartition << std::endl; + << current_char_computePartition << "\n" + << "\t**" << "Original compute partition: " + << orig_char_computePartition << "\n" + << "\t**" << "Reset Successful: " + << (wasResetSuccess ? "TRUE" : "FALSE") << "\n" + << "\t**" << "Partitions Updated: " + << (devicePartitionUpdated ? 
"TRUE" : "FALSE") << "\n"; } - if (wasResetSuccess) { + if (wasResetSuccess && devicePartitionUpdated) { ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition); IF_VERB(STANDARD) { std::cout << "\t**" @@ -379,7 +416,7 @@ void TestComputePartitionReadWrite::Run(void) { << "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITION " << "SETTING ========" << std::endl; } - newPartition + rsmi_compute_partition_type_t newPartition = mapStringToRSMIComputePartitionTypes.at( std::string(orig_char_computePartition)); ret = rsmi_dev_compute_partition_set(dv_ind, newPartition); @@ -401,5 +438,5 @@ void TestComputePartitionReadWrite::Run(void) { ASSERT_EQ(RSMI_STATUS_SUCCESS, ret); ASSERT_STREQ(computePartitionString(newPartition).c_str(), current_char_computePartition); - } + } // END looping through devices } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_determinism.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_determinism.cc index b52fcb2690..790fc13fc3 100644 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_determinism.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/perf_determinism.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2020, Advanced Micro Devices, Inc. + * Copyright (c) 2020-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -56,6 +56,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/functional/perf_determinism.h" #include "rocm_smi_test/test_common.h" +#include "rocm_smi/rocm_smi_utils.h" TestPerfDeterminism::TestPerfDeterminism() : TestBase() { @@ -103,23 +104,49 @@ void TestPerfDeterminism::Run(void) { for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); + std::cout << "\t**Resetting performance determinism\n"; + err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO): " + << amd::smi::getRSMIStatusString(err, false) + << "\n"; + } + CHK_ERR_ASRT(err) + ret = rsmi_dev_perf_level_get(i, &pfl); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } + CHK_ERR_ASRT(ret) err = rsmi_dev_od_volt_info_get(i, &odv); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " + << amd::smi::getRSMIStatusString(err, false) + << "\n" + << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) + << "\n"; + } if (err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { - std::cout << "\t** Not supported on this machine" << std::endl; + std::cout << "\t** Not supported on this machine\n"; } return; - } - else{ + } else if (err == RSMI_STATUS_SUCCESS) { clkvalue = (odv.curr_sclk_range.lower_bound/1000000) + 50; + } else { + IF_VERB(STANDARD) { + std::cout << "\t** Unable to retrieve lower bound sclk, continue.. 
\n"; + } + continue; } + std::cout << "About to rsmi_perf_determinism_mode_set() -->\n"; err = rsmi_perf_determinism_mode_set(i, clkvalue); if (err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**Not supported on this machine" << std::endl; } - return; + continue; } else { ret = rsmi_dev_perf_level_get(i, &pfl); CHK_ERR_ASRT(ret) @@ -130,7 +157,7 @@ void TestPerfDeterminism::Run(void) { } std::cout << "\t**Resetting performance determinism" << std::endl; - err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);; + err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO); CHK_ERR_ASRT(err) ret = rsmi_dev_perf_level_get(i, &pfl); CHK_ERR_ASRT(ret) @@ -138,7 +165,6 @@ void TestPerfDeterminism::Run(void) { std::cout << "\t**New Perf Level:" << GetPerfLevelStr(pfl) << std::endl; } - return; - } - } + } // END - SET SUPPORTED + } // END - DEVICE LOOP } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index 84f77d6afc..50b6ac057c 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -5,7 +5,7 @@ * The University of Illinois/NCSA * Open Source License (NCSA) * - * Copyright (c) 2019, Advanced Micro Devices, Inc. + * Copyright (c) 2019-2023, Advanced Micro Devices, Inc. * All rights reserved. 
* * Developed by: @@ -53,6 +53,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/functional/volt_freq_curv_read.h" #include "rocm_smi_test/test_common.h" +#include "rocm_smi/rocm_smi_utils.h" TestVoltCurvRead::TestVoltCurvRead() : TestBase() { set_title("RSMI Voltage-Frequency Curve Read Test"); @@ -84,69 +85,10 @@ void TestVoltCurvRead::Close() { TestBase::Close(); } -static void pt_rng_Mhz(std::string title, rsmi_range *r) { - assert(r != nullptr); - - std::cout << title << std::endl; - std::cout << "\t\t** " << r->lower_bound/1000000 << " to " << - r->upper_bound/1000000 << " MHz" << std::endl; -} - -static void pt_rng_mV(std::string title, rsmi_range *r) { - assert(r != nullptr); - - std::cout << title << std::endl; - std::cout << "\t\t** " << r->lower_bound << " to " << r->upper_bound << - " mV" << std::endl; -} - -static void print_pnt(rsmi_od_vddc_point_t *pt) { - std::cout << "\t\t** Frequency: " << pt->frequency/1000000 << "MHz" << - std::endl; - std::cout << "\t\t** Voltage: " << pt->voltage << "mV" << std::endl; -} -static void pt_vddc_curve(rsmi_od_volt_curve *c) { - assert(c != nullptr); - - for (uint32_t i = 0; i < RSMI_NUM_VOLTAGE_CURVE_POINTS; ++i) { - print_pnt(&c->vc_points[i]); - } -} - -static void print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { - assert(odv != nullptr); - - std::cout.setf(std::ios::dec, std::ios::basefield); - pt_rng_Mhz("\t\tCurrent SCLK frequency range:", &odv->curr_sclk_range); - pt_rng_Mhz("\t\tCurrent MCLK frequency range:", &odv->curr_mclk_range); - pt_rng_Mhz("\t\tMin/Max Possible SCLK frequency range:", - &odv->sclk_freq_limits); - pt_rng_Mhz("\t\tMin/Max Possible MCLK frequency range:", - &odv->mclk_freq_limits); - - std::cout << "\t\tCurrent Freq/Volt. curve:" << std::endl; - pt_vddc_curve(&odv->curve); - - std::cout << "\tNumber of Freq./Volt. 
regions: " << - odv->num_regions << std::endl; -} - -static void print_odv_region(rsmi_freq_volt_region_t *region) { - pt_rng_Mhz("\t\tFrequency range:", ®ion->freq_range); - pt_rng_mV("\t\tVoltage range:", ®ion->volt_range); -} - -static void print_rsmi_od_volt_freq_regions(uint32_t num_regions, - rsmi_freq_volt_region_t *regions) { - for (uint32_t i = 0; i < num_regions; ++i) { - std::cout << "\tRegion " << i << ":" << std::endl; - print_odv_region(®ions[i]); - } -} - void TestVoltCurvRead::Run(void) { - rsmi_status_t err; + rsmi_status_t err, ret; rsmi_od_volt_freq_data_t odv; + rsmi_dev_perf_level_t pfl; TestBase::Run(); if (setup_failed_) { @@ -157,26 +99,57 @@ void TestVoltCurvRead::Run(void) { for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); + std::cout << "\n\t**Resetting performance determinism to auto\n"; + err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO): " + << amd::smi::getRSMIStatusString(err, false) + << "\n"; + } + CHK_ERR_ASRT(err) + ret = rsmi_dev_perf_level_get(i, &pfl); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } + CHK_ERR_ASRT(ret) err = rsmi_dev_od_volt_info_get(i, &odv); - if (err == RSMI_STATUS_NOT_SUPPORTED) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " + << amd::smi::getRSMIStatusString(err, false) + << "\n" + << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) + << "\n"; + } + if (err != RSMI_STATUS_SUCCESS) { IF_VERB(STANDARD) { std::cout << "\t**rsmi_dev_od_volt_info_get: Not supported on this machine" << std::endl; } - // Verify api support checking functionality is working - err = rsmi_dev_od_volt_info_get(i, nullptr); - ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED); - } else { - CHK_ERR_ASRT(err) - // Verify api support checking functionality is working - err = 
rsmi_dev_od_volt_info_get(i, nullptr);
-      ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
+      continue;
+    }
+    // Verify api support checking functionality is working
+    err = rsmi_dev_od_volt_info_get(i, nullptr);
+    IF_VERB(STANDARD) {
+      std::cout << "\t**rsmi_dev_od_volt_info_get(i, nullptr): "
+                << amd::smi::getRSMIStatusString(err, false) << "\n";
+                // << "\n"
+                // << amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
+                // << "\n";
+    }
+    ASSERT_TRUE(err == RSMI_STATUS_INVALID_ARGS);
+    err = rsmi_dev_od_volt_info_get(i, &odv);
+    IF_VERB(STANDARD) {
+      std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): "
+                << amd::smi::getRSMIStatusString(err, false) << "\n"
+                << amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
+                << "\t**odv.num_regions = " << std::dec
+                << odv.num_regions << "\n";
     }
-
     if (err == RSMI_STATUS_SUCCESS) {
-      std::cout << "\t**Frequency-voltage curve data:" << std::endl;
-      print_rsmi_od_volt_freq_data_t(&odv);
+      std::cout << "\t**Frequency-voltage curve data:" << "\n";
+      std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv);
 
       rsmi_freq_volt_region_t *regions;
       uint32_t num_regions;
@@ -185,11 +158,32 @@ void TestVoltCurvRead::Run(void) {
 
       num_regions = odv.num_regions;
       err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions);
+      IF_VERB(STANDARD) {
+        std::cout << "\t**rsmi_dev_od_volt_curve_regions_get("
+                  << "i, &num_regions, regions): "
+                  << amd::smi::getRSMIStatusString(err, false) << "\n"
+                  << "\t**Number of regions: " << std::dec << num_regions
+                  << "\n";
+      }
+      ASSERT_TRUE(err == RSMI_STATUS_SUCCESS
+                  || err == RSMI_STATUS_NOT_SUPPORTED
+                  || err == RSMI_STATUS_UNEXPECTED_DATA
+                  || err == RSMI_STATUS_UNEXPECTED_SIZE);
+      if (err != RSMI_STATUS_SUCCESS) {
+        IF_VERB(STANDARD) {
+          std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: "
+                       "Not supported on this machine" << std::endl;
+        }
+        // Release the buffer here: continue skips the delete[] at block end.
+        delete []regions;
+        continue;
+      }
       CHK_ERR_ASRT(err)
       ASSERT_TRUE(num_regions == odv.num_regions);
 
       std::cout << "\t**Frequency-voltage curve regions:" << std::endl;
-
print_rsmi_od_volt_freq_regions(num_regions, regions); + std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, + regions); delete []regions; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc index 8e140f4634..638b4224c8 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc @@ -163,6 +163,14 @@ TEST(rsmitstReadOnly, TestPerfLevelRead) { TestPerfLevelRead tst; RunGenericTest(&tst); } +TEST(rsmitstReadWrite, TestComputePartitionReadWrite) { + TestComputePartitionReadWrite tst; + RunGenericTest(&tst); +} +TEST(rsmitstReadWrite, TestMemoryPartitionReadWrite) { + TestMemoryPartitionReadWrite tst; + RunGenericTest(&tst); +} TEST(rsmitstReadWrite, TestPerfLevelReadWrite) { TestPerfLevelReadWrite tst; RunGenericTest(&tst); @@ -267,14 +275,6 @@ TEST(rsmitstReadOnly, TestMutualExclusion) { tst.Run(); RunCustomTestEpilog(&tst); } -TEST(rsmitstReadWrite, TestComputePartitionReadWrite) { - TestComputePartitionReadWrite tst; - RunGenericTest(&tst); -} -TEST(rsmitstReadWrite, TestMemoryPartitionReadWrite) { - TestMemoryPartitionReadWrite tst; - RunGenericTest(&tst); -} TEST(rsmitstReadWrite, TestEvtNotifReadWrite) { TestEvtNotifReadWrite tst; RunGenericTest(&tst);