Partition EBUSY with RSMI_STATUS_BUSY & invalid GPU Metrics check
* Updates:
- [API/CLI] rsmi_dev_*_partition_set &
rsmi_dev_*_partition_reset - exposed RSMI_STATUS_BUSY for
EBUSY writes + cleaned up accidental map insertions
(map[key] default-inserts an entry when the key is not in the map;
map.at(key) never inserts, which fixes this potential issue)
- [API] rsmi_dev_gpu_metrics_info_get() - returns
RSMI_STATUS_NOT_SUPPORTED for unsupported metric tables
outside of 1v1/1v2/1v3
- [API] writeDevInfoStr() - exposes RSMI_STATUS_BUSY for
EBUSY write errors; kept backward compatibility
for other writes which do not care about these states
- [API] rsmi_dev_od_volt_info_get()
& rsmi_dev_od_volt_curve_regions_get() have better logging
+ Expose more details on why they are erroring
- [Utils/logs/example] Expose AMD GPU gfx target version to aid in
system troubleshooting
- [Utils] Added test helper methods that look at od volt
freq & regions here - for easier access across
several tests
- [Utils] Updated getRSMIStatusString() (new argument - fullStatus;
defaults to true for backwards compatibility)
-> true shows shortened RSMI STATUS response
- [Utils] Added splitString to cut out noisy return responses
(used in getRSMIStatusString(), when fullstatus = true)
- [Utils] Added getFileCreationDate() to expose build date
of the library - helpful for local builds or experimental builds
- [Utils] Macro cleanup
- [Example] Added a few gpu_metric checks - helpful for upcoming
updates
- [Device] SYSFS/DebugFS - now have better r/w displayed in logs
- [LOGS] Expose library build date - see above for details
- [Tests] Add more warnings/errors to test builds
- [Tests] Moved up Partition tests for ordered test runs - helped
identify issues with GPU BUSY writes
- [Tests] compute_partition_read_write - handles RSMI_STATUS_BUSY
by waiting when a busy status is found & cleaned up how we check
for partition changes - with RSMI responses exposed more clearly
- [Tests] perf_determinism - multi gpu now properly runs through
with full resets as needed
- [Tests] volt_freq_curv_read - better error handling with more
verbose output
Change-Id: Ie94c6abb6a9aab95c345996d3ad3843cf6734977
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 57b6135e54]
This commit is contained in:
@@ -72,7 +72,7 @@ endif()
|
||||
|
||||
## Compiler flags
|
||||
set(CMAKE_CXX_FLAGS
|
||||
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti")
|
||||
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17")
|
||||
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
|
||||
set(CMAKE_CXX_FLAGS
|
||||
"${CMAKE_CXX_FLAGS} -m64 -msse -msse2")
|
||||
|
||||
@@ -363,16 +363,16 @@ typedef rsmi_clk_type_t rsmi_clk_type;
|
||||
*/
|
||||
typedef enum {
|
||||
RSMI_COMPUTE_PARTITION_INVALID = 0,
|
||||
RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with
|
||||
//!< shared memory
|
||||
RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work
|
||||
//!< together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work
|
||||
//!< together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs
|
||||
//!< work together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_QPX //!< Quad GPU mode (QPX)- Quarter XCCs
|
||||
//!< work together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_CPX = 1, //!< Core mode (CPX)- Per-chip XCC with
|
||||
//!< shared memory
|
||||
RSMI_COMPUTE_PARTITION_SPX = 2, //!< Single GPU mode (SPX)- All XCCs work
|
||||
//!< together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_DPX = 3, //!< Dual GPU mode (DPX)- Half XCCs work
|
||||
//!< together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_TPX = 4, //!< Triple GPU mode (TPX)- One-third XCCs
|
||||
//!< work together with shared memory
|
||||
RSMI_COMPUTE_PARTITION_QPX = 5, //!< Quad GPU mode (QPX)- Quarter XCCs
|
||||
//!< work together with shared memory
|
||||
} rsmi_compute_partition_type_t;
|
||||
/// \cond Ignore in docs.
|
||||
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type;
|
||||
@@ -3783,6 +3783,8 @@ rsmi_dev_compute_partition_get(uint32_t dv_ind, char *compute_partition,
|
||||
* unavailable for current device
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function
|
||||
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
|
||||
* because it is already being used - device is busy
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
@@ -3802,6 +3804,8 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
* @retval ::RSMI_STATUS_PERMISSION function requires root access
|
||||
* @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not
|
||||
* support this function
|
||||
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
|
||||
* because it is already being used - device is busy
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind);
|
||||
@@ -3866,6 +3870,8 @@ rsmi_dev_memory_partition_get(uint32_t dv_ind, char *memory_partition,
|
||||
* support this function
|
||||
* @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
|
||||
* the amdgpu driver
|
||||
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
|
||||
* because it is already being used - device is busy
|
||||
*
|
||||
*/
|
||||
rsmi_status_t
|
||||
@@ -3887,6 +3893,8 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
* support this function
|
||||
* @retval ::RSMI_STATUS_AMDGPU_RESTART_ERR could not successfully restart
|
||||
* the amdgpu driver
|
||||
* @retval ::RSMI_STATUS_BUSY A resource or mutex could not be acquired
|
||||
* because it is already being used - device is busy
|
||||
*
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind);
|
||||
|
||||
@@ -260,7 +260,8 @@ class Device {
|
||||
std::vector<std::string> *retVec);
|
||||
int readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
void *p_binary_data);
|
||||
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
|
||||
int writeDevInfoStr(DevInfoTypes type, std::string valStr,
|
||||
bool returnWriteErr = false);
|
||||
rsmi_status_t run_amdgpu_property_reinforcement_query(const AMDGpuPropertyQuery_t& amdgpu_property_query);
|
||||
|
||||
|
||||
|
||||
@@ -84,6 +84,9 @@ class KFDNode {
|
||||
int get_total_memory(uint64_t* total);
|
||||
int get_used_memory(uint64_t* used);
|
||||
|
||||
// Get gfx target version from kfd
|
||||
int get_gfx_target_version(uint64_t* gfx_target_version);
|
||||
|
||||
private:
|
||||
uint32_t node_indx_;
|
||||
uint32_t amdgpu_dev_index_;
|
||||
|
||||
@@ -51,6 +51,8 @@
|
||||
#include <sstream>
|
||||
#include <iomanip>
|
||||
#include <type_traits>
|
||||
#include <tuple>
|
||||
#include <queue>
|
||||
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
|
||||
@@ -97,10 +99,10 @@ rsmi_status_t
|
||||
GetDevBinaryBlob(amd::smi::DevInfoTypes type,
|
||||
uint32_t dv_ind, std::size_t b_size, void* p_binary_data);
|
||||
rsmi_status_t ErrnoToRsmiStatus(int err);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret);
|
||||
std::string getRSMIStatusString(rsmi_status_t ret, bool fullStatus = true);
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void);
|
||||
void logSystemDetails(void);
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str);
|
||||
@@ -109,11 +111,20 @@ void logHexDump(const char *desc, const void *addr, const size_t len,
|
||||
bool isSystemBigEndian();
|
||||
std::string getBuildType();
|
||||
std::string getMyLibPath();
|
||||
std::string getFileCreationDate(std::string path);
|
||||
int subDirectoryCountInPath(const std::string path);
|
||||
std::queue<std::string> getAllDeviceGfxVers();
|
||||
std::string monitor_type_string(amd::smi::MonitorTypes type);
|
||||
std::string power_type_string(RSMI_POWER_TYPE type);
|
||||
std::string splitString(std::string str, char delim);
|
||||
std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv);
|
||||
std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions,
|
||||
rsmi_freq_volt_region_t *regions);
|
||||
bool is_sudo_user();
|
||||
rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
|
||||
std::string *gfx_version);
|
||||
template <typename T>
|
||||
std::string print_int_as_hex(T i, bool showHexNotation=true) {
|
||||
std::string print_int_as_hex(T i, bool showHexNotation = true) {
|
||||
std::stringstream ss;
|
||||
if (showHexNotation) {
|
||||
ss << "0x" << std::setfill('0') << std::setw(sizeof(T) * 2) << std::hex;
|
||||
@@ -132,7 +143,7 @@ std::string print_int_as_hex(T i, bool showHexNotation=true) {
|
||||
}
|
||||
ss << std::dec;
|
||||
return ss.str();
|
||||
};
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::string print_unsigned_int(T i) {
|
||||
@@ -263,7 +274,7 @@ class ScopedAcquire {
|
||||
LockType* lock_;
|
||||
bool doRelease;
|
||||
/// @brief: Disable copiable and assignable ability.
|
||||
DISALLOW_COPY_AND_ASSIGN(ScopedAcquire);
|
||||
DISALLOW_COPY_AND_ASSIGN(ScopedAcquire)
|
||||
};
|
||||
|
||||
} // namespace smi
|
||||
|
||||
@@ -966,6 +966,9 @@ def resetComputePartition(deviceList):
|
||||
printLog(device, 'Permission denied', None)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
|
||||
printLog(device, 'Device is currently busy, try again later',
|
||||
None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device, 'reset_compute_partition')
|
||||
printErrLog(device, 'Failed to reset the compute partition to boot state')
|
||||
@@ -1002,6 +1005,9 @@ def resetMemoryPartition(deviceList):
|
||||
printLog(device, 'Permission denied', None, addExtraLine)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None, addExtraLine)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
|
||||
printLog(device, 'Device is currently busy, try again later',
|
||||
None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device, 'reset_memory_partition')
|
||||
printErrLog(device, 'Failed to reset memory partition to boot state')
|
||||
@@ -1603,6 +1609,9 @@ def setComputePartition(deviceList, computePartitionType):
|
||||
%computePartitionType, None)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
|
||||
printLog(device, 'Device is currently busy, try again later',
|
||||
None)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device, 'set_compute_partition')
|
||||
printErrLog(device, 'Failed to retrieve compute partition, even though device supports it.')
|
||||
@@ -1673,6 +1682,9 @@ def setMemoryPartition(deviceList, memoryPartition):
|
||||
printLog(device, 'Permission denied', None, addExtraLine)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED:
|
||||
printLog(device, 'Not supported on the given system', None, addExtraLine)
|
||||
elif ret == rsmi_status_t.RSMI_STATUS_BUSY:
|
||||
printLog(device, 'Device is currently busy, try again later',
|
||||
None, addExtraLine)
|
||||
else:
|
||||
rsmi_ret_ok(ret, device, 'set_memory_partition')
|
||||
printErrLog(device, 'Failed to retrieve memory partition, even though device supports it.')
|
||||
|
||||
@@ -156,6 +156,15 @@
|
||||
} \
|
||||
}
|
||||
|
||||
void print_function_header_with_rsmi_ret(
|
||||
rsmi_status_t myReturn, std::string header = "") {
|
||||
std::cout << "\t** ";
|
||||
if (!header.empty()) {
|
||||
std::cout << header << ": ";
|
||||
}
|
||||
std::cout << amd::smi::getRSMIStatusString(myReturn, false) << "\n";
|
||||
}
|
||||
|
||||
static void print_test_header(const char *str, uint32_t dv_ind) {
|
||||
std::cout << "********************************" << "\n";
|
||||
std::cout << "*** " << str << "\n";
|
||||
@@ -254,14 +263,24 @@ perf_level_string(rsmi_dev_perf_level_t perf_lvl) {
|
||||
}
|
||||
}
|
||||
|
||||
static bool isUserRunningAsSudo() {
|
||||
bool isRunningWithSudo = false;
|
||||
auto myUID = getuid();
|
||||
auto myPrivledges = geteuid();
|
||||
if ((myUID == myPrivledges) && (myPrivledges == 0)) {
|
||||
isRunningWithSudo = true;
|
||||
static const std::string
|
||||
clock_type_string(rsmi_clk_type_t clk) {
|
||||
switch (clk) {
|
||||
case RSMI_CLK_TYPE_SYS:
|
||||
return "RSMI_CLK_TYPE_SYS";
|
||||
case RSMI_CLK_TYPE_DF:
|
||||
return "RSMI_CLK_TYPE_DF";
|
||||
case RSMI_CLK_TYPE_DCEF:
|
||||
return "RSMI_CLK_TYPE_DCEF";
|
||||
case RSMI_CLK_TYPE_SOC:
|
||||
return "RSMI_CLK_TYPE_SOC";
|
||||
case RSMI_CLK_TYPE_MEM:
|
||||
return "RSMI_CLK_TYPE_MEM";
|
||||
case RSMI_CLK_TYPE_PCIE:
|
||||
return "RSMI_CLK_TYPE_PCIE";
|
||||
default:
|
||||
return "RSMI_CLK_INVALID";
|
||||
}
|
||||
return isRunningWithSudo;
|
||||
}
|
||||
|
||||
static bool isFileWritable(rsmi_status_t response) {
|
||||
@@ -271,7 +290,7 @@ static bool isFileWritable(rsmi_status_t response) {
|
||||
// isFileWritable(ret) - intends to capture this
|
||||
// response situation.
|
||||
bool fileWritable = true;
|
||||
if (isUserRunningAsSudo() && (response == RSMI_STATUS_PERMISSION)) {
|
||||
if (amd::smi::is_sudo_user() && (response == RSMI_STATUS_PERMISSION)) {
|
||||
std::cout << "[WARN] User is running with sudo "
|
||||
<< "permissions, file is not writable." << "\n";
|
||||
fileWritable = false;
|
||||
@@ -574,9 +593,19 @@ static rsmi_status_t test_set_freq(uint32_t dv_ind) {
|
||||
}
|
||||
|
||||
static void print_frequencies(rsmi_frequencies_t *f) {
|
||||
assert(f != nullptr);
|
||||
bool hasDeepSleep = false;
|
||||
if (f == nullptr) {
|
||||
std::cout << "Freq was nullptr\n";
|
||||
return;
|
||||
}
|
||||
for (uint32_t j = 0; j < f->num_supported; ++j) {
|
||||
std::cout << "\t** " << j << ": " << std::to_string(f->frequency[j]);
|
||||
if (f->has_deep_sleep && j == 0) {
|
||||
std::cout << "\t** S: " << std::to_string(f->frequency[j]);
|
||||
hasDeepSleep = true;
|
||||
} else {
|
||||
std::cout << "\t** " << (hasDeepSleep ? j-1 : j)
|
||||
<< ": " << std::to_string(f->frequency[j]);
|
||||
}
|
||||
if (j == f->current) {
|
||||
std::cout << " *";
|
||||
}
|
||||
@@ -714,6 +743,7 @@ int main() {
|
||||
rsmi_frequencies_t f;
|
||||
uint32_t num_monitor_devs = 0;
|
||||
rsmi_gpu_metrics_t p;
|
||||
std::string val_str;
|
||||
RSMI_POWER_TYPE power_type = RSMI_INVALID_POWER;
|
||||
|
||||
rsmi_num_monitor_devices(&num_monitor_devs);
|
||||
@@ -725,6 +755,8 @@ int main() {
|
||||
ret = rsmi_dev_revision_get(i, &val_ui16);
|
||||
CHK_RSMI_RET_I(ret)
|
||||
std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << "\n";
|
||||
ret = amd::smi::rsmi_get_gfx_target_version(i , &val_str);
|
||||
std::cout << "\t**Target Graphics Version: " << val_str << "\n";
|
||||
|
||||
char current_compute_partition[256];
|
||||
current_compute_partition[0] = '\0';
|
||||
@@ -736,7 +768,7 @@ int main() {
|
||||
? "UNKNOWN" : current_compute_partition);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
std::cout << ", RSMI_STATUS = ";
|
||||
} else {
|
||||
} else {
|
||||
std::cout << "\n";
|
||||
}
|
||||
CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret)
|
||||
@@ -773,8 +805,38 @@ int main() {
|
||||
}
|
||||
|
||||
ret = rsmi_dev_gpu_metrics_info_get(i, &p);
|
||||
CHK_AND_PRINT_RSMI_ERR_RET(ret)
|
||||
std::cout << "\t**GPU METRICS" << "\n";
|
||||
print_test_header("GPU METRICS", i);
|
||||
print_function_header_with_rsmi_ret(ret,
|
||||
"rsmi_dev_gpu_metrics_info_get(" + std::to_string(i) + ", &p)");
|
||||
std::cout << "\t**p.average_gfxclk_frequency: " << std::dec
|
||||
<< p.average_gfxclk_frequency << "\n";
|
||||
std::cout << "\t**p.average_socclk_frequency: " << std::dec
|
||||
<< p.average_socclk_frequency << "\n";
|
||||
std::cout << "\t**p.average_uclk_frequency: " << std::dec
|
||||
<< p.average_uclk_frequency << "\n";
|
||||
std::cout << "\t**p.average_vclk0_frequency: " << std::dec
|
||||
<< p.average_vclk0_frequency << "\n";
|
||||
std::cout << "\t**p.average_dclk0_frequency: " << std::dec
|
||||
<< p.average_dclk0_frequency << "\n";
|
||||
std::cout << "\t**p.average_vclk1_frequency: " << std::dec
|
||||
<< p.average_vclk1_frequency << "\n";
|
||||
std::cout << "\t**p.average_dclk1_frequency: " << std::dec
|
||||
<< p.average_dclk1_frequency << "\n";
|
||||
|
||||
std::cout << "\t**p.current_gfxclk: " << std::dec
|
||||
<< p.current_gfxclk << "\n";
|
||||
std::cout << "\t**p.current_socclk: " << std::dec
|
||||
<< p.current_socclk << "\n";
|
||||
std::cout << "\t**p.current_uclk: " << std::dec
|
||||
<< p.current_uclk << "\n";
|
||||
std::cout << "\t**p.current_vclk0: " << std::dec
|
||||
<< p.current_vclk0 << "\n";
|
||||
std::cout << "\t**p.current_dclk0: " << std::dec
|
||||
<< p.current_dclk0 << "\n";
|
||||
std::cout << "\t**p.current_vclk1: " << std::dec
|
||||
<< p.current_vclk1 << "\n";
|
||||
std::cout << "\t**p.current_dclk1: " << std::dec
|
||||
<< p.current_dclk1 << "\n";
|
||||
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
CHK_AND_PRINT_RSMI_ERR_RET(ret)
|
||||
@@ -784,25 +846,25 @@ int main() {
|
||||
CHK_AND_PRINT_RSMI_ERR_RET(ret)
|
||||
std::cout << "\t**OverDrive Level:" << val_ui32 << "\n";
|
||||
|
||||
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &f);
|
||||
CHK_AND_PRINT_RSMI_ERR_RET(ret)
|
||||
std::cout << "\t**Supported GPU Memory clock frequencies: ";
|
||||
std::cout << f.num_supported << "\n";
|
||||
print_frequencies(&f);
|
||||
|
||||
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &f);
|
||||
CHK_AND_PRINT_RSMI_ERR_RET(ret)
|
||||
std::cout << "\t**Supported GPU clock frequencies: ";
|
||||
std::cout << f.num_supported << "\n";
|
||||
print_frequencies(&f);
|
||||
|
||||
ret = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SOC, &f);
|
||||
CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret)
|
||||
std::cout << "\t**Supported GPU clock frequencies (SOC clk): ";
|
||||
std::cout << f.num_supported << "\n";
|
||||
std::cout << "\t**Current value (SOC clk): ";
|
||||
std::cout << f.current << "\n";
|
||||
print_frequencies(&f);
|
||||
print_test_header("GPU Clocks", i);
|
||||
for (int clkType = static_cast<int>(RSMI_CLK_TYPE_SYS);
|
||||
clkType <= static_cast<int>(RSMI_CLK_TYPE_PCIE);
|
||||
clkType++) {
|
||||
rsmi_clk_type_t type = static_cast<rsmi_clk_type_t>(clkType);
|
||||
ret = rsmi_dev_gpu_clk_freq_get(i, type, &f);
|
||||
print_function_header_with_rsmi_ret(ret,
|
||||
"rsmi_dev_gpu_clk_freq_get(" + std::to_string(i) +
|
||||
", " + clock_type_string(type) + ", &f)");
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
std::cout << "\t** " << clock_type_string(type)
|
||||
<< " - Supported # of freqs: ";
|
||||
std::cout << f.num_supported << "\n";
|
||||
std::cout << "\t** " << clock_type_string(type) << " f.current: "
|
||||
<< f.current << "\n";
|
||||
print_frequencies(&f);
|
||||
}
|
||||
|
||||
std::cout << "\t**Monitor name: ";
|
||||
char name[128];
|
||||
@@ -892,7 +954,7 @@ int main() {
|
||||
}
|
||||
|
||||
std::cout << "***** Testing write api's" << "\n";
|
||||
if (isUserRunningAsSudo() == false) {
|
||||
if (amd::smi::is_sudo_user() == false) {
|
||||
std::cout << "Write APIs require users to execute with sudo. "
|
||||
<< "Cannot proceed." << "\n";
|
||||
return 0;
|
||||
|
||||
@@ -929,6 +929,9 @@ rsmi_status_t
|
||||
rsmi_perf_determinism_mode_set(uint32_t dv_ind, uint64_t clkvalue) {
|
||||
TRY
|
||||
DEVICE_MUTEX
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
// Set perf. level to performance determinism so that we can then set the power profile
|
||||
rsmi_status_t ret = rsmi_dev_perf_level_set_v1(dv_ind,
|
||||
@@ -1510,6 +1513,9 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint,
|
||||
|
||||
static void get_vc_region(uint32_t start_ind,
|
||||
std::vector<std::string> *val_vec, rsmi_freq_volt_region_t *p) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
assert(p != nullptr);
|
||||
assert(val_vec != nullptr);
|
||||
THROW_IF_NULLPTR_DEREF(p)
|
||||
@@ -1520,6 +1526,9 @@ static void get_vc_region(uint32_t start_ind,
|
||||
assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:");
|
||||
if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) ||
|
||||
((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) {
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA);
|
||||
LOG_TRACE(ss);
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__);
|
||||
}
|
||||
od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range);
|
||||
@@ -1539,6 +1548,7 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind,
|
||||
TRY
|
||||
std::vector<std::string> val_vec;
|
||||
rsmi_status_t ret;
|
||||
std::ostringstream ss;
|
||||
|
||||
assert(num_regions != nullptr);
|
||||
assert(p != nullptr);
|
||||
@@ -1547,12 +1557,20 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind,
|
||||
|
||||
ret = GetDevValueVec(amd::smi::kDevPowerODVoltage, dv_ind, &val_vec);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: could not retreive kDevPowerODVoltage" << "; returning "
|
||||
<< getRSMIStatusString(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// This is a work-around to handle systems where kDevPowerODVoltage is not
|
||||
// fully supported yet.
|
||||
if (val_vec.size() < 2) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: val_vec.size() < 2" << "; returning "
|
||||
<< getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED);
|
||||
LOG_ERROR(ss);
|
||||
return RSMI_STATUS_NOT_YET_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@@ -1560,8 +1578,17 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind,
|
||||
assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0);
|
||||
assert((val_vec_size - kOD_VDDC_CURVE_start_index)%2 == 0);
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | val_vec_size = " << std::dec
|
||||
<< val_vec_size
|
||||
<< " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index;
|
||||
LOG_DEBUG(ss);
|
||||
if (((val_vec_size - kOD_VDDC_CURVE_start_index) <= 0) ||
|
||||
(((val_vec_size - kOD_VDDC_CURVE_start_index)%2 != 0))) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Issue: od vdd curve returned unexpected "
|
||||
<< "data" << "; returning "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_SIZE);
|
||||
LOG_ERROR(ss);
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__);
|
||||
}
|
||||
|
||||
@@ -2749,6 +2776,9 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
DEVICE_MUTEX
|
||||
if (odv == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_NAME_ONLY(odv)
|
||||
rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv);
|
||||
|
||||
@@ -2779,7 +2809,7 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
|
||||
uint32_t *num_regions, rsmi_freq_volt_region_t *buffer) {
|
||||
TRY
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY((num_regions == nullptr || buffer == nullptr) ?
|
||||
@@ -2791,6 +2821,12 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind,
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret = get_od_clk_volt_curve_regions(dv_ind, num_regions,
|
||||
buffer);
|
||||
if (*num_regions == 0) {
|
||||
ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning "
|
||||
<< getRSMIStatusString(ret);
|
||||
LOG_TRACE(ss);
|
||||
return ret;
|
||||
CATCH
|
||||
}
|
||||
@@ -4468,7 +4504,7 @@ get_compute_partition(uint32_t dv_ind, std::string &compute_partition) {
|
||||
return ret;
|
||||
}
|
||||
|
||||
switch (mapStringToRSMIComputePartitionTypes[compute_partition_str]) {
|
||||
switch (mapStringToRSMIComputePartitionTypes.at(compute_partition_str)) {
|
||||
case RSMI_COMPUTE_PARTITION_CPX:
|
||||
case RSMI_COMPUTE_PARTITION_SPX:
|
||||
case RSMI_COMPUTE_PARTITION_DPX:
|
||||
@@ -4585,9 +4621,12 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
REQUIRE_ROOT_ACCESS
|
||||
if (!amd::smi::is_sudo_user()) {
|
||||
return RSMI_STATUS_PERMISSION;
|
||||
}
|
||||
DEVICE_MUTEX
|
||||
std::string newComputePartitionStr
|
||||
= mapRSMIToStringComputePartitionTypes[compute_partition];
|
||||
= mapRSMIToStringComputePartitionTypes.at(compute_partition);
|
||||
std::string currentComputePartition;
|
||||
|
||||
switch (compute_partition) {
|
||||
@@ -4605,6 +4644,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: "
|
||||
<< devInfoTypesStrings.at(amd::smi::kDevComputePartition)
|
||||
<< " | Data: " << newComputePartitionStr
|
||||
<< " | Cause: requested setting was invalid"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_INVALID_ARGS) << " |";
|
||||
@@ -4623,6 +4663,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: "
|
||||
<< devInfoTypesStrings.at(amd::smi::kDevComputePartition)
|
||||
<< " | Data: " << newComputePartitionStr
|
||||
<< " | Cause: not an available compute partition setting"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(available_ret) << " |";
|
||||
@@ -4650,7 +4691,7 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
return ret_get;
|
||||
}
|
||||
rsmi_compute_partition_type_t currRSMIComputePartition
|
||||
= mapStringToRSMIComputePartitionTypes[currentComputePartition];
|
||||
= mapStringToRSMIComputePartitionTypes.at(currentComputePartition);
|
||||
if (currRSMIComputePartition == compute_partition) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -4665,6 +4706,15 @@ rsmi_dev_compute_partition_set(uint32_t dv_ind,
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | about to try writing |"
|
||||
<< newComputePartitionStr
|
||||
<< "| size of string = " << newComputePartitionStr.size()
|
||||
<< "| size of c-string = "<< std::dec
|
||||
<< sizeof(newComputePartitionStr.c_str())/sizeof(newComputePartitionStr[0])
|
||||
<< "| sizeof string = " << std::dec
|
||||
<< sizeof(newComputePartitionStr);
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
GET_DEV_FROM_INDX
|
||||
int ret = dev->writeDevInfo(amd::smi::kDevComputePartition,
|
||||
newComputePartitionStr);
|
||||
@@ -4699,7 +4749,7 @@ static rsmi_status_t get_memory_partition(uint32_t dv_ind,
|
||||
return ret;
|
||||
}
|
||||
|
||||
switch (mapStringToMemoryPartitionTypes[val_str]) {
|
||||
switch (mapStringToMemoryPartitionTypes.at(val_str)) {
|
||||
case RSMI_MEMORY_PARTITION_NPS1:
|
||||
case RSMI_MEMORY_PARTITION_NPS2:
|
||||
case RSMI_MEMORY_PARTITION_NPS4:
|
||||
@@ -4755,7 +4805,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
}
|
||||
|
||||
std::string newMemoryPartition
|
||||
= mapRSMIToStringMemoryPartitionTypes[memory_partition];
|
||||
= mapRSMIToStringMemoryPartitionTypes.at(memory_partition);
|
||||
std::string currentMemoryPartition;
|
||||
|
||||
switch (memory_partition) {
|
||||
@@ -4798,7 +4848,7 @@ rsmi_dev_memory_partition_set(uint32_t dv_ind,
|
||||
return ret_get;
|
||||
}
|
||||
rsmi_memory_partition_type_t currRSMIMemoryPartition
|
||||
= mapStringToMemoryPartitionTypes[currentMemoryPartition];
|
||||
= mapStringToMemoryPartitionTypes.at(currentMemoryPartition);
|
||||
if (currRSMIMemoryPartition == memory_partition) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
@@ -4942,7 +4992,7 @@ rsmi_status_t rsmi_dev_compute_partition_reset(uint32_t dv_ind) {
|
||||
// Likely due to device not supporting it
|
||||
if (bootState != "UNKNOWN") {
|
||||
rsmi_compute_partition_type_t compute_partition =
|
||||
mapStringToRSMIComputePartitionTypes[bootState];
|
||||
mapStringToRSMIComputePartitionTypes.at(bootState);
|
||||
ret = rsmi_dev_compute_partition_set(dv_ind, compute_partition);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
@@ -4981,7 +5031,7 @@ rsmi_status_t rsmi_dev_memory_partition_reset(uint32_t dv_ind) {
|
||||
// Likely due to device not supporting it
|
||||
if (bootState != "UNKNOWN") {
|
||||
rsmi_memory_partition_type_t memory_partition =
|
||||
mapStringToMemoryPartitionTypes[bootState];
|
||||
mapStringToMemoryPartitionTypes.at(bootState);
|
||||
ret = rsmi_dev_memory_partition_set(dv_ind, memory_partition);
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
|
||||
@@ -598,14 +598,17 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
|
||||
int ret = isRegularFile(sysfs_path, &reg_file);
|
||||
if (ret != 0) {
|
||||
ss << "File did not exist - SYSFS file (" << sysfs_path
|
||||
ss << __PRETTY_FUNCTION__ << " | Issue: File did not exist - SYSFS file ("
|
||||
<< sysfs_path
|
||||
<< ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
<< "), returning " << std::to_string(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
if (!reg_file) {
|
||||
ss << "File is not a regular file - SYSFS file (" << sysfs_path << ") for "
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: File is not a regular file - SYSFS file ("
|
||||
<< sysfs_path << ") for "
|
||||
<< "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "),"
|
||||
<< " returning ENOENT (" << std::strerror(ENOENT) << ")";
|
||||
LOG_ERROR(ss);
|
||||
@@ -615,7 +618,8 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
fs->open(sysfs_path);
|
||||
|
||||
if (!fs->is_open()) {
|
||||
ss << "Could not open - SYSFS file (" << sysfs_path << ") for "
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue: Could not open - SYSFS file (" << sysfs_path << ") for "
|
||||
<< "DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type) << "), "
|
||||
<< ", returning " << std::to_string(errno) << " ("
|
||||
<< std::strerror(errno) << ")";
|
||||
@@ -623,7 +627,8 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
return errno;
|
||||
}
|
||||
|
||||
ss << "Successfully opened SYSFS file (" << sysfs_path
|
||||
ss << __PRETTY_FUNCTION__ << " | Successfully opened SYSFS file ("
|
||||
<< sysfs_path
|
||||
<< ") for DevInfoInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
<< ")";
|
||||
LOG_INFO(ss);
|
||||
@@ -671,32 +676,51 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
ret = openSysfsFileStream(type, &fs);
|
||||
if (ret != 0) {
|
||||
ss << "Could not read device info string for DevInfoType ("
|
||||
<< RocmSMI::devInfoTypesStrings.at(type)<< "), returning "
|
||||
<< RocmSMI::devInfoTypesStrings.at(type) << "), returning "
|
||||
<< std::to_string(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
fs >> *retStr;
|
||||
std::string info = "Successfully read device info string for DevInfoType (" +
|
||||
RocmSMI::devInfoTypesStrings.at(type) + "): " +
|
||||
*retStr;
|
||||
LOG_INFO(info);
|
||||
fs.close();
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< "Successfully read device info string for DevInfoType (" +
|
||||
RocmSMI::devInfoTypesStrings.at(type) + "): " + *retStr
|
||||
<< " | "
|
||||
<< (fs.is_open() ? " File stream is opened" : " File stream is closed")
|
||||
<< " | " << (fs.bad() ? "[ERROR] Bad read operation" :
|
||||
"[GOOD] No bad bit read, successful read operation")
|
||||
<< " | " << (fs.fail() ? "[ERROR] Failed read - format error" :
|
||||
"[GOOD] No fail - Successful read operation")
|
||||
<< " | " << (fs.eof() ? "[ERROR] Failed read - EOF error" :
|
||||
"[GOOD] No eof error - Successful read operation")
|
||||
<< " | " << (fs.good() ? "[GOOD] read good - Successful read operation" :
|
||||
"[ERROR] Failed read - good error");
|
||||
LOG_INFO(ss);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) {
|
||||
auto tempPath = path_;
|
||||
int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr,
|
||||
bool returnWriteErr) {
|
||||
// returnWriteErr = false, backwards compatibility (old calls)
|
||||
// returnWriteErr = true, improvement - allows us to detect errors
|
||||
// when writing to file
|
||||
// (such as EBUSY)
|
||||
auto sysfs_path = path_;
|
||||
sysfs_path += "/device/";
|
||||
sysfs_path += kDevAttribNameMap.at(type);
|
||||
std::ofstream fs;
|
||||
int ret;
|
||||
std::ostringstream ss;
|
||||
|
||||
fs.rdbuf()->pubsetbuf(nullptr,0);
|
||||
fs.flush();
|
||||
fs.rdbuf()->pubsetbuf(0, 0);
|
||||
ret = openSysfsFileStream(type, &fs, valStr.c_str());
|
||||
if (ret != 0) {
|
||||
ss << "Could not write device info string (" << valStr
|
||||
fs.close();
|
||||
ss << __PRETTY_FUNCTION__ << " | Issue: Could not open fileStream; "
|
||||
<< "Could not write device info string (" << valStr
|
||||
<< ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
<< "), returning " << std::to_string(ret);
|
||||
LOG_ERROR(ss);
|
||||
@@ -705,19 +729,39 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) {
|
||||
|
||||
// We'll catch any exceptions in rocm_smi.cc code.
|
||||
if (fs << valStr) {
|
||||
fs.flush();
|
||||
fs.close();
|
||||
ss << "Successfully wrote device info string (" << valStr
|
||||
<< ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
<< "), returning RSMI_STATUS_SUCCESS";
|
||||
LOG_INFO(ss);
|
||||
ret = RSMI_STATUS_SUCCESS;
|
||||
} else {
|
||||
ss << "Could not write device info string (" << valStr
|
||||
if (returnWriteErr) {
|
||||
ret = errno;
|
||||
} else {
|
||||
ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
fs.flush();
|
||||
fs.close();
|
||||
ss << __PRETTY_FUNCTION__ << " | Issue: Could not write to file; "
|
||||
<< "Could not write device info string (" << valStr
|
||||
<< ") for DevInfoType (" << RocmSMI::devInfoTypesStrings.at(type)
|
||||
<< "), returning RSMI_STATUS_NOT_SUPPORTED";
|
||||
<< "), returning " << getRSMIStatusString(ErrnoToRsmiStatus(ret));
|
||||
ss << " | "
|
||||
<< (fs.is_open() ? "[ERROR] File stream open" :
|
||||
"[GOOD] File stream closed")
|
||||
<< " | " << (fs.bad() ? "[ERROR] Bad write operation" :
|
||||
"[GOOD] No bad bit write, successful write operation")
|
||||
<< " | " << (fs.fail() ? "[ERROR] Failed write - format error" :
|
||||
"[GOOD] No fail - Successful write operation")
|
||||
<< " | " << (fs.eof() ? "[ERROR] Failed write - EOF error" :
|
||||
"[GOOD] No eof error - Successful write operation")
|
||||
<< " | " << (fs.good() ?
|
||||
"[GOOD] Write good - Successful write operation" :
|
||||
"[ERROR] Failed write - good error");
|
||||
LOG_ERROR(ss);
|
||||
ret = RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
fs.close();
|
||||
|
||||
return ret;
|
||||
}
|
||||
@@ -756,6 +800,9 @@ int Device::writeDevInfo(DevInfoTypes type, uint64_t val) {
|
||||
}
|
||||
|
||||
int Device::writeDevInfo(DevInfoTypes type, std::string val) {
|
||||
auto sysfs_path = path_;
|
||||
sysfs_path += "/device/";
|
||||
sysfs_path += kDevAttribNameMap.at(type);
|
||||
switch (type) {
|
||||
case kDevGPUMClk:
|
||||
case kDevDCEFClk:
|
||||
@@ -764,9 +811,10 @@ int Device::writeDevInfo(DevInfoTypes type, std::string val) {
|
||||
case kDevPCIEClk:
|
||||
case kDevPowerODVoltage:
|
||||
case kDevSOCClk:
|
||||
return writeDevInfoStr(type, val);
|
||||
case kDevComputePartition:
|
||||
case kDevMemoryPartition:
|
||||
return writeDevInfoStr(type, val);
|
||||
return writeDevInfoStr(type, val, true);
|
||||
|
||||
default:
|
||||
return EINVAL;
|
||||
@@ -899,6 +947,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
|
||||
std::string tempStr;
|
||||
int ret;
|
||||
int tmp_val;
|
||||
std::ostringstream ss;
|
||||
|
||||
switch (type) {
|
||||
case kDevDevID:
|
||||
|
||||
@@ -496,6 +496,12 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t *smu) {
|
||||
// a specific version.
|
||||
*smu = {};
|
||||
|
||||
uint8_t dev_content_revision = dev->gpu_metrics_ver().content_revision;
|
||||
if (dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_1 ||
|
||||
dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_2 ||
|
||||
dev_content_revision != RSMI_GPU_METRICS_API_CONTENT_VER_3) {
|
||||
return RSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
if (dev->gpu_metrics_ver().content_revision ==
|
||||
RSMI_GPU_METRICS_API_CONTENT_VER_1) {
|
||||
ret = GetDevBinaryBlob(amd::smi::kDevGpuMetrics, dv_ind,
|
||||
|
||||
@@ -971,5 +971,26 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
|
||||
return retVal;
|
||||
}
|
||||
|
||||
// /sys/class/kfd/kfd/topology/nodes/*/properties | grep gfx_target_version
|
||||
int KFDNode::get_gfx_target_version(uint64_t *gfx_target_version) {
|
||||
std::ostringstream ss;
|
||||
std::string properties_path = "/sys/class/kfd/kfd/topology/nodes/"
|
||||
+ std::to_string(this->node_indx_) + "/properties";
|
||||
uint64_t gfx_version = 0;
|
||||
int ret = read_node_properties(this->node_indx_, "gfx_target_version",
|
||||
&gfx_version);
|
||||
*gfx_target_version = gfx_version;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | File: " << properties_path
|
||||
<< " | Successfully read node #" << std::to_string(this->node_indx_)
|
||||
<< " for gfx_target_version"
|
||||
<< " | Data (gfx_target_version) *gfx_target_version = "
|
||||
<< std::to_string(*gfx_target_version)
|
||||
<< " | return = " << std::to_string(ret)
|
||||
<< " | ";
|
||||
LOG_DEBUG(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -445,6 +445,12 @@ RocmSMI::Initialize(uint64_t flags) {
|
||||
// store each device boot partition state, if file doesn't exist
|
||||
dev->storeDevicePartitions(dv_ind);
|
||||
}
|
||||
|
||||
// Assists displaying GPU information after device enumeration
|
||||
// Otherwise GPU related info will not be discoverable
|
||||
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
|
||||
logSystemDetails();
|
||||
}
|
||||
// Leaving below to help debug temp file issues
|
||||
// displayAppTmpFilesContent();
|
||||
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
|
||||
|
||||
@@ -599,9 +599,19 @@ std::tuple<bool, std::string> readTmpFile(uint32_t dv_ind,
|
||||
}
|
||||
|
||||
// wrapper to return string expression of a rsmi_status_t return
|
||||
std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// rsmi_status_t ret - return value of RSMI API function
|
||||
// bool fullStatus - defaults to true, set to false to chop off description
|
||||
// Returns:
|
||||
// string - if fullStatus == true, returns full description of return value
|
||||
// ex. 'RSMI_STATUS_SUCCESS: The function has been executed successfully.'
|
||||
// string - if fullStatus == false, returns a minimalized return value
|
||||
// ex. 'RSMI_STATUS_SUCCESS'
|
||||
std::string getRSMIStatusString(rsmi_status_t ret, bool fullStatus) {
|
||||
const char *err_str;
|
||||
rsmi_status_string(ret, &err_str);
|
||||
if (!fullStatus) {
|
||||
return splitString(std::string(err_str), ':');
|
||||
}
|
||||
return std::string(err_str);
|
||||
}
|
||||
|
||||
@@ -620,9 +630,13 @@ std::string getRSMIStatusString(rsmi_status_t ret) {
|
||||
// Expressed as big endian or little endian.
|
||||
// Big Endian (BE), multi-bit symbols encoded as big endian (MSB first)
|
||||
// Little Endian (LE), multi-bit symbols encoded as little endian (LSB first)
|
||||
// string rocm_lib_path = Path to library
|
||||
// string rocm_build_type = Release or debug
|
||||
// string rocm_build_date = Creation date of library
|
||||
// string dev_gfx_versions = GPU target graphics version
|
||||
std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string, std::string,
|
||||
std::string, std::string, std::string>
|
||||
std::string, std::string, std::string, std::string, std::string>
|
||||
getSystemDetails(void) {
|
||||
struct utsname buf;
|
||||
bool errorDetected = false;
|
||||
@@ -637,7 +651,9 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
std::string endianness = "<undefined>";
|
||||
std::string rocm_lib_path = "<undefined>";
|
||||
std::string rocm_build_type = "<undefined>";
|
||||
std::string rocm_build_date = "<undefined>";
|
||||
std::string rocm_env_variables = "<undefined>";
|
||||
std::string dev_gfx_versions = "<undefined>";
|
||||
|
||||
if (uname(&buf) < 0) {
|
||||
errorDetected = true;
|
||||
@@ -674,11 +690,20 @@ std::tuple<bool, std::string, std::string, std::string, std::string,
|
||||
}
|
||||
rocm_build_type = getBuildType();
|
||||
rocm_lib_path = getMyLibPath();
|
||||
rocm_build_date = getFileCreationDate(rocm_lib_path);
|
||||
rocm_env_variables = RocmSMI::getInstance().getRSMIEnvVarInfo();
|
||||
std::queue<std::string> devGraphicsVersions = getAllDeviceGfxVers();
|
||||
if (devGraphicsVersions.empty() == false) {
|
||||
dev_gfx_versions = "";
|
||||
while (devGraphicsVersions.empty() == false) {
|
||||
dev_gfx_versions += "\n\t" + devGraphicsVersions.front();
|
||||
devGraphicsVersions.pop();
|
||||
}
|
||||
}
|
||||
return std::make_tuple(errorDetected, sysname, nodename, release,
|
||||
version, machine, domainName, os_distribution,
|
||||
endianness, rocm_build_type, rocm_lib_path,
|
||||
rocm_env_variables);
|
||||
rocm_build_date, rocm_env_variables, dev_gfx_versions);
|
||||
}
|
||||
|
||||
// If logging is enabled through RSMI_LOGGING environment variable.
|
||||
@@ -687,10 +712,11 @@ void logSystemDetails(void) {
|
||||
std::ostringstream ss;
|
||||
bool errorDetected;
|
||||
std::string sysname, node, release, version, machine, domain, distName,
|
||||
endianness, rocm_build_type, lib_path, rocm_env_vars;
|
||||
endianness, rocm_build_type, lib_path, build_date, rocm_env_vars,
|
||||
dev_gfx_versions;
|
||||
std::tie(errorDetected, sysname, node, release, version, machine, domain,
|
||||
distName, endianness, rocm_build_type, lib_path,
|
||||
rocm_env_vars) = getSystemDetails();
|
||||
distName, endianness, rocm_build_type, lib_path, build_date,
|
||||
rocm_env_vars, dev_gfx_versions) = getSystemDetails();
|
||||
if (errorDetected == false) {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
<< "SYSTEM NAME: " << sysname << "\n"
|
||||
@@ -703,7 +729,9 @@ void logSystemDetails(void) {
|
||||
<< "ENDIANNESS: " << endianness << "\n"
|
||||
<< "ROCM BUILD TYPE: " << rocm_build_type << "\n"
|
||||
<< "ROCM-SMI-LIB PATH: " << lib_path << "\n"
|
||||
<< "ROCM ENV VARIABLES: " << rocm_env_vars << "\n";
|
||||
<< "ROCM-SMI-LIB BUILD DATE: " << build_date << "\n"
|
||||
<< "ROCM ENV VARIABLES: " << rocm_env_vars
|
||||
<< "AMD GFX VERSIONS: " << dev_gfx_versions << "\n";
|
||||
LOG_INFO(ss);
|
||||
} else {
|
||||
ss << "====== Gathered system details ============\n"
|
||||
@@ -831,6 +859,13 @@ std::string getMyLibPath(void) {
|
||||
return path;
|
||||
}
|
||||
|
||||
std::string getFileCreationDate(std::string path) {
|
||||
struct stat t_stat;
|
||||
stat(path.c_str(), &t_stat);
|
||||
struct tm *timeinfo = localtime(&t_stat.st_ctime); // NOLINT
|
||||
return removeNewLines(std::string(asctime(timeinfo))); // NOLINT
|
||||
}
|
||||
|
||||
rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str)
|
||||
{
|
||||
auto result = rsmi_status_t::RSMI_STATUS_SUCCESS;
|
||||
@@ -974,5 +1009,164 @@ std::string power_type_string(RSMI_POWER_TYPE type) {
|
||||
return powerTypesToString.at(type);
|
||||
}
|
||||
|
||||
// Returns the portion of `str` that precedes the first occurrence of `delim`.
//   str   - text to cut
//   delim - delimiter character
// Returns:
//   - ""            when `str` is empty or starts with `delim`
//   - all of `str`  when `delim` does not occur
//   - otherwise the characters before the first `delim`
// Every path returns a value; the previous loop-based version could fall off
// the end of a non-void function and carried an unused token vector.
std::string splitString(std::string str, char delim) {
  // find() yields npos when delim is absent; substr(0, npos) is the whole
  // string, which also covers the empty-input case.
  return str.substr(0, str.find(delim));
}
|
||||
|
||||
// Formats a range as "<title><lower> to <upper> MHz\n".
//   title - text emitted before the numbers
//   r     - range to print; both bounds are divided by 1000000 before
//           printing (presumably Hz -> MHz; confirm against rsmi_range docs)
// Returns the formatted line, or a diagnostic line when r is nullptr.
static std::string pt_rng_Mhz(std::string title, rsmi_range *r) {
  if (r == nullptr) {
    return "pt_rng_Mhz | rsmi_range r = nullptr\n";
  }

  std::ostringstream line;
  line << title
       << r->lower_bound / 1000000 << " to "
       << r->upper_bound / 1000000 << " MHz" << "\n";
  return line.str();
}
|
||||
|
||||
// Formats a range as "<title><lower> to <upper> mV\n".
//   title - text emitted before the numbers
//   r     - range to print; bounds are printed unscaled
// Returns the formatted line, or a diagnostic line when r is nullptr.
static std::string pt_rng_mV(std::string title, rsmi_range *r) {
  if (r == nullptr) {
    return "pt_rng_mV | rsmi_range r = nullptr\n";
  }

  std::ostringstream line;
  line << title
       << r->lower_bound << " to " << r->upper_bound
       << " mV" << "\n";
  return line.str();
}
|
||||
|
||||
// Formats one voltage-curve point as two lines: its frequency (divided by
// 1000000 and labelled MHz) followed by its voltage (labelled mV).
//   pt - point to print; dereferenced without a null check
static std::string print_pnt(rsmi_od_vddc_point_t *pt) {
  std::ostringstream text;
  text << "\t\t** Frequency: " << pt->frequency / 1000000 << " MHz\n"
       << "\t\t** Voltage: " << pt->voltage << " mV\n";
  return text.str();
}
|
||||
// Formats the first RSMI_NUM_VOLTAGE_CURVE_POINTS entries of c->vc_points,
// one print_pnt() block per point.
//   c - curve to print
// Returns the concatenated text, or a diagnostic line when c is nullptr.
static std::string pt_vddc_curve(rsmi_od_volt_curve *c) {
  if (c == nullptr) {
    return "pt_vddc_curve | rsmi_od_volt_curve c = nullptr\n";
  }

  std::ostringstream text;
  uint32_t idx = 0;
  while (idx < RSMI_NUM_VOLTAGE_CURVE_POINTS) {
    text << print_pnt(&c->vc_points[idx]);
    ++idx;
  }
  return text.str();
}
|
||||
|
||||
// Builds a multi-line text dump of an rsmi_od_volt_freq_data_t: the current
// and limit SCLK/MCLK ranges, the voltage curve, and the region count.
//   odv - structure to print
// Returns the dump, or a diagnostic line when odv is nullptr.
std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) {
  if (odv == nullptr) {
    return "rsmi_od_volt_freq_data_t odv = nullptr\n";
  }

  std::ostringstream dump;

  // Clock ranges: current values first, then the min/max limits
  dump << pt_rng_Mhz("\t**Current SCLK frequency range: ",
                     &odv->curr_sclk_range);
  dump << pt_rng_Mhz("\t**Current MCLK frequency range: ",
                     &odv->curr_mclk_range);
  dump << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ",
                     &odv->sclk_freq_limits);
  dump << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ",
                     &odv->mclk_freq_limits);

  // Voltage curve points
  dump << "\t**Current Freq/Volt. curve: " << "\n";
  dump << pt_vddc_curve(&odv->curve);

  dump << "\t**Number of Freq./Volt. regions: " << odv->num_regions << "\n\n";
  return dump.str();
}
|
||||
|
||||
// Formats one frequency/voltage region as two lines: its frequency range
// (via pt_rng_Mhz) followed by its voltage range (via pt_rng_mV).
//   region - region to print; dereferenced without a null check
// The address-of expressions had been mangled into a "(R)" symbol by an HTML
// entity decode of "&reg"; they are restored to "&region->..." here.
std::string print_odv_region(rsmi_freq_volt_region_t *region) {
  std::ostringstream ss;
  ss << pt_rng_Mhz("\t\tFrequency range: ", &region->freq_range);
  ss << pt_rng_mV("\t\tVoltage range: ", &region->volt_range);
  return ss.str();
}
|
||||
|
||||
std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions,
|
||||
rsmi_freq_volt_region_t *regions) {
|
||||
std::ostringstream ss;
|
||||
if (regions == nullptr) {
|
||||
ss << "rsmi_freq_volt_region_t regions = nullptr\n";
|
||||
return ss.str();
|
||||
}
|
||||
for (uint32_t i = 0; i < num_regions; ++i) {
|
||||
ss << "\tRegion " << i << ": " << "\n";
|
||||
ss << print_odv_region(®ions[i]);
|
||||
}
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
// Reports whether both the real and the effective user id of the process
// are 0, and writes the outcome to the debug log.
// Returns true only when getuid() == geteuid() == 0.
bool is_sudo_user() {
  const auto realUid = getuid();
  const auto effectiveUid = geteuid();
  const bool runningAsRoot = (realUid == effectiveUid) && (effectiveUid == 0);

  std::ostringstream ss;
  ss << __PRETTY_FUNCTION__;
  if (runningAsRoot) {
    ss << " | running as sudoer";
  } else {
    ss << " | NOT running as sudoer";
  }
  LOG_DEBUG(ss);
  return runningAsRoot;
}
|
||||
|
||||
// Produces the "gfx<number>" target-version string for device `dv_ind`.
//   dv_ind      - device index
//   gfx_version - out-parameter; set to "gfx" followed by the value read from
//                 the KFD node, or to "Unknown" when that read fails
// Returns RSMI_STATUS_SUCCESS when the KFD read returned 0, otherwise
// RSMI_STATUS_NOT_SUPPORTED.
// NOTE(review): GET_DEV_AND_KFDNODE_FROM_INDX is defined elsewhere; it
// supplies `kfd_node` and presumably may return early on a bad index -
// confirm against the macro definition.
rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind,
                                          std::string *gfx_version) {
  std::ostringstream ss;
  uint64_t kfd_gfx_version = 0;
  GET_DEV_AND_KFDNODE_FROM_INDX

  // Guard clause: any non-zero return from the node read is a failure
  if (kfd_node->get_gfx_target_version(&kfd_gfx_version) != 0) {
    *gfx_version = "Unknown";
    return RSMI_STATUS_NOT_SUPPORTED;
  }

  ss << "gfx" << kfd_gfx_version;
  *gfx_version = ss.str();
  return RSMI_STATUS_SUCCESS;
}
|
||||
|
||||
std::queue<std::string> getAllDeviceGfxVers() {
|
||||
uint32_t num_monitor_devs = 0;
|
||||
rsmi_status_t ret;
|
||||
std::queue<std::string> deviceGfxVersions;
|
||||
std::string response = "";
|
||||
std::string dev_gfx_ver = "";
|
||||
|
||||
ret = rsmi_num_monitor_devices(&num_monitor_devs);
|
||||
if (ret != RSMI_STATUS_SUCCESS || num_monitor_devs == 0) {
|
||||
response = "N/A - No AMD devices detected";
|
||||
deviceGfxVersions.push(response);
|
||||
return deviceGfxVersions;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < num_monitor_devs; ++i) {
|
||||
ret = amd::smi::rsmi_get_gfx_target_version(i , &dev_gfx_ver);
|
||||
response = "Device[" + std::to_string(i) + "]: ";
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
deviceGfxVersions.push(response + getRSMIStatusString(ret, false));
|
||||
} else {
|
||||
deviceGfxVersions.push(response + std::string(dev_gfx_ver));
|
||||
}
|
||||
}
|
||||
return deviceGfxVersions;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
@@ -21,6 +21,14 @@ message("--------Proj Lib Dir: " ${PROJECT_BINARY_DIR}/lib)
|
||||
message("--------Proj Exe Dir: " ${PROJECT_BINARY_DIR}/bin)
|
||||
message("")
|
||||
|
||||
## Compiler flags
|
||||
set(CMAKE_CXX_FLAGS
|
||||
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17")
|
||||
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
|
||||
set(CMAKE_CXX_FLAGS
|
||||
"${CMAKE_CXX_FLAGS} -m64 -msse -msse2")
|
||||
endif()
|
||||
|
||||
set(SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
set(RSMITST "rsmitst")
|
||||
|
||||
+75
-38
@@ -54,6 +54,7 @@
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi_test/functional/computepartition_read_write.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
|
||||
@@ -118,6 +119,24 @@ computePartitionString(rsmi_compute_partition_type computeParitionType) {
|
||||
}
|
||||
}
|
||||
|
||||
// Sleeps for `seconds` seconds, announcing the wait beforehand and printing
// the measured whole-second duration afterwards.
// Rationale: changing partitions depends on the GPUs not being in an active
// state, so the tests pause to let any activity clear up first.
static void system_wait(int seconds) {
  using wait_clock = std::chrono::high_resolution_clock;
  const wait_clock::time_point begin = wait_clock::now();

  std::cout << "** Waiting for "
            << std::dec << seconds
            << " seconds, for any GPU"
            << " activity to clear up. **" << std::endl;
  sleep(seconds);

  const wait_clock::time_point end = wait_clock::now();
  const auto elapsedUs =
      std::chrono::duration_cast<std::chrono::microseconds>(end - begin);
  std::cout << "** Waiting took " << elapsedUs.count() / 1000000
            << " seconds **" << std::endl;
}
|
||||
|
||||
static const std::map<std::string, rsmi_compute_partition_type_t>
|
||||
mapStringToRSMIComputePartitionTypes {
|
||||
{"CPX", RSMI_COMPUTE_PARTITION_CPX},
|
||||
@@ -141,21 +160,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
// Confirm system supports compute partition, before executing wait
|
||||
ret = rsmi_dev_compute_partition_get(0, orig_char_computePartition, 255);
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
// Adding a delay - since changing partitions depends on gpus not
|
||||
// being in an active state, we'll wait a few seconds before starting
|
||||
// full testing
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
int waitTime = 20;
|
||||
std::cout << "** Waiting for "
|
||||
<< std::dec << waitTime
|
||||
<< " seconds, for any GPU"
|
||||
<< " activity to clear up. **" << std::endl;
|
||||
sleep(waitTime);
|
||||
auto stop = std::chrono::high_resolution_clock::now();
|
||||
auto duration =
|
||||
std::chrono::duration_cast<std::chrono::microseconds>(stop - start);
|
||||
std::cout << "** Waiting took " << duration.count() / 1000000
|
||||
<< " seconds **" << std::endl;
|
||||
system_wait(25);
|
||||
}
|
||||
|
||||
for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
|
||||
@@ -165,6 +170,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
}
|
||||
}
|
||||
PrintDeviceHeader(dv_ind);
|
||||
bool devicePartitionUpdated = false;
|
||||
|
||||
// Standard checks to see if API is supported, before running full tests
|
||||
ret = rsmi_dev_compute_partition_get(dv_ind, orig_char_computePartition,
|
||||
@@ -231,9 +237,8 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
}
|
||||
|
||||
// Verify api support checking functionality is working
|
||||
rsmi_compute_partition_type_t newPartition
|
||||
= rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_INVALID;
|
||||
err = rsmi_dev_compute_partition_set(dv_ind, newPartition);
|
||||
err = rsmi_dev_compute_partition_set(dv_ind,
|
||||
RSMI_COMPUTE_PARTITION_INVALID);
|
||||
ASSERT_TRUE((err == RSMI_STATUS_INVALID_ARGS) ||
|
||||
(err == RSMI_STATUS_NOT_SUPPORTED) ||
|
||||
(err == RSMI_STATUS_PERMISSION));
|
||||
@@ -270,27 +275,40 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
* //!< work together with shared memory
|
||||
*/
|
||||
|
||||
for (int partition =
|
||||
rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_CPX;
|
||||
partition <= rsmi_compute_partition_type_t::RSMI_COMPUTE_PARTITION_QPX;
|
||||
for (int partition = static_cast<int>(RSMI_COMPUTE_PARTITION_CPX);
|
||||
partition <= static_cast<int>(RSMI_COMPUTE_PARTITION_QPX);
|
||||
partition++) {
|
||||
newPartition = static_cast<rsmi_compute_partition_type_t>(partition);
|
||||
rsmi_compute_partition_type_t updatePartition
|
||||
= static_cast<rsmi_compute_partition_type_t>(partition);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << std::endl;
|
||||
std::cout << "\t**"
|
||||
<< "======== TEST RSMI_COMPUTE_PARTITION_"
|
||||
<< computePartitionString(newPartition)
|
||||
<< computePartitionString(updatePartition)
|
||||
<< " ===============" << std::endl;
|
||||
}
|
||||
ret = rsmi_dev_compute_partition_set(dv_ind, updatePartition);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
<< "Attempting to set compute partition to: "
|
||||
<< computePartitionString(newPartition) << std::endl;
|
||||
<< "rsmi_dev_compute_partition_set(dv_ind, updatePartition): "
|
||||
<< amd::smi::getRSMIStatusString(ret, false) << "\n"
|
||||
<< "\t**New Partition (set): "
|
||||
<< computePartitionString(updatePartition) << "\n";
|
||||
}
|
||||
ret = rsmi_dev_compute_partition_set(dv_ind, newPartition);
|
||||
ASSERT_TRUE((ret == RSMI_STATUS_SETTING_UNAVAILABLE)
|
||||
|| (ret== RSMI_STATUS_PERMISSION)
|
||||
|| (ret == RSMI_STATUS_SUCCESS)
|
||||
|| ret == RSMI_STATUS_BUSY);
|
||||
|
||||
if (ret == RSMI_STATUS_BUSY) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device is currently busy.. continue\n";
|
||||
}
|
||||
system_wait(5);
|
||||
continue;
|
||||
}
|
||||
|
||||
bool isSettingUnavailable = false;
|
||||
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
|
||||
(ret == RSMI_STATUS_SETTING_UNAVAILABLE));
|
||||
if (ret == RSMI_STATUS_SETTING_UNAVAILABLE) {
|
||||
isSettingUnavailable = true;
|
||||
}
|
||||
@@ -306,7 +324,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
}
|
||||
if (isSettingUnavailable) {
|
||||
ASSERT_EQ(RSMI_STATUS_SETTING_UNAVAILABLE, ret);
|
||||
ASSERT_STRNE(computePartitionString(newPartition).c_str(),
|
||||
ASSERT_STRNE(computePartitionString(updatePartition).c_str(),
|
||||
current_char_computePartition);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
@@ -314,23 +332,30 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< "RSMI_STATUS_SETTING_UNAVAILABLE,\n\t current compute "
|
||||
<< "partition (" << current_char_computePartition
|
||||
<< ") did not update to ("
|
||||
<< computePartitionString(newPartition) << ")"
|
||||
<< computePartitionString(updatePartition) << ")"
|
||||
<< std::endl;
|
||||
}
|
||||
} else {
|
||||
if (strcmp(orig_char_computePartition, current_char_computePartition) !=
|
||||
0) {
|
||||
devicePartitionUpdated = true;
|
||||
} else {
|
||||
devicePartitionUpdated = false;
|
||||
}
|
||||
|
||||
ASSERT_EQ(RSMI_STATUS_SUCCESS, ret);
|
||||
ASSERT_STREQ(computePartitionString(newPartition).c_str(),
|
||||
ASSERT_STREQ(computePartitionString(updatePartition).c_str(),
|
||||
current_char_computePartition);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
<< "Confirmed current compute partition ("
|
||||
<< current_char_computePartition << ") matches"
|
||||
<< "\n\t requested compute partition ("
|
||||
<< computePartitionString(newPartition) << ")"
|
||||
<< computePartitionString(updatePartition) << ")"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
} // END looping through partition changes
|
||||
|
||||
/* TEST RETURN TO BOOT COMPUTE PARTITION SETTING */
|
||||
IF_VERB(STANDARD) {
|
||||
@@ -342,8 +367,14 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::string oldPartition = current_char_computePartition;
|
||||
bool wasResetSuccess = false;
|
||||
ret = rsmi_dev_compute_partition_reset(dv_ind);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
<< "rsmi_dev_compute_partition_reset(dv_ind): "
|
||||
<< amd::smi::getRSMIStatusString(ret, false) << "\n";
|
||||
}
|
||||
ASSERT_TRUE((ret == RSMI_STATUS_SUCCESS) ||
|
||||
(ret == RSMI_STATUS_NOT_SUPPORTED));
|
||||
(ret == RSMI_STATUS_NOT_SUPPORTED) ||
|
||||
(ret == RSMI_STATUS_BUSY));
|
||||
if (ret == RSMI_STATUS_SUCCESS) {
|
||||
wasResetSuccess = true;
|
||||
}
|
||||
@@ -352,9 +383,15 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
CHK_ERR_ASRT(ret)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << "Current compute partition: "
|
||||
<< current_char_computePartition << std::endl;
|
||||
<< current_char_computePartition << "\n"
|
||||
<< "\t**" << "Original compute partition: "
|
||||
<< orig_char_computePartition << "\n"
|
||||
<< "\t**" << "Reset Successful: "
|
||||
<< (wasResetSuccess ? "TRUE" : "FALSE") << "\n"
|
||||
<< "\t**" << "Partitions Updated: "
|
||||
<< (devicePartitionUpdated ? "TRUE" : "FALSE") << "\n";
|
||||
}
|
||||
if (wasResetSuccess) {
|
||||
if (wasResetSuccess && devicePartitionUpdated) {
|
||||
ASSERT_STRNE(oldPartition.c_str(), current_char_computePartition);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
@@ -379,7 +416,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< "=========== TEST RETURN TO ORIGINAL COMPUTE PARTITION "
|
||||
<< "SETTING ========" << std::endl;
|
||||
}
|
||||
newPartition
|
||||
rsmi_compute_partition_type_t newPartition
|
||||
= mapStringToRSMIComputePartitionTypes.at(
|
||||
std::string(orig_char_computePartition));
|
||||
ret = rsmi_dev_compute_partition_set(dv_ind, newPartition);
|
||||
@@ -401,5 +438,5 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
ASSERT_EQ(RSMI_STATUS_SUCCESS, ret);
|
||||
ASSERT_STREQ(computePartitionString(newPartition).c_str(),
|
||||
current_char_computePartition);
|
||||
}
|
||||
} // END looping through devices
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2020, Advanced Micro Devices, Inc.
|
||||
* Copyright (c) 2020-2023, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
@@ -56,6 +56,7 @@
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi_test/functional/perf_determinism.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
|
||||
|
||||
TestPerfDeterminism::TestPerfDeterminism() : TestBase() {
|
||||
@@ -103,23 +104,49 @@ void TestPerfDeterminism::Run(void) {
|
||||
|
||||
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
|
||||
PrintDeviceHeader(i);
|
||||
std::cout << "\t**Resetting performance determinism\n";
|
||||
err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO): "
|
||||
<< amd::smi::getRSMIStatusString(err, false)
|
||||
<< "\n";
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): "
|
||||
<< amd::smi::getRSMIStatusString(ret, false) << "\n";
|
||||
}
|
||||
CHK_ERR_ASRT(ret)
|
||||
err = rsmi_dev_od_volt_info_get(i, &odv);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): "
|
||||
<< amd::smi::getRSMIStatusString(err, false)
|
||||
<< "\n"
|
||||
<< amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
|
||||
<< "\n";
|
||||
}
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t** Not supported on this machine" << std::endl;
|
||||
std::cout << "\t** Not supported on this machine\n";
|
||||
}
|
||||
return;
|
||||
}
|
||||
else{
|
||||
} else if (err == RSMI_STATUS_SUCCESS) {
|
||||
clkvalue = (odv.curr_sclk_range.lower_bound/1000000) + 50;
|
||||
} else {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t** Unable to retrieve lower bound sclk, continue.. \n";
|
||||
}
|
||||
continue;
|
||||
}
|
||||
std::cout << "About to rsmi_perf_determinism_mode_set() -->\n";
|
||||
|
||||
err = rsmi_perf_determinism_mode_set(i, clkvalue);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Not supported on this machine" << std::endl;
|
||||
}
|
||||
return;
|
||||
continue;
|
||||
} else {
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
CHK_ERR_ASRT(ret)
|
||||
@@ -130,7 +157,7 @@ void TestPerfDeterminism::Run(void) {
|
||||
}
|
||||
|
||||
std::cout << "\t**Resetting performance determinism" << std::endl;
|
||||
err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);;
|
||||
err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);
|
||||
CHK_ERR_ASRT(err)
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
CHK_ERR_ASRT(ret)
|
||||
@@ -138,7 +165,6 @@ void TestPerfDeterminism::Run(void) {
|
||||
std::cout << "\t**New Perf Level:" << GetPerfLevelStr(pfl) <<
|
||||
std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
} // END - SET SUPPORTED
|
||||
} // END - DEVICE LOOP
|
||||
}
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2019, Advanced Micro Devices, Inc.
|
||||
* Copyright (c) 2019-2023, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
@@ -53,6 +53,7 @@
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi_test/functional/volt_freq_curv_read.h"
|
||||
#include "rocm_smi_test/test_common.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
|
||||
TestVoltCurvRead::TestVoltCurvRead() : TestBase() {
|
||||
set_title("RSMI Voltage-Frequency Curve Read Test");
|
||||
@@ -84,69 +85,10 @@ void TestVoltCurvRead::Close() {
|
||||
TestBase::Close();
|
||||
}
|
||||
|
||||
// Prints `title` on its own line, then the range as
// "\t\t** <lower> to <upper> MHz" on stdout.
//   title - heading line
//   r     - range to print (asserted non-null); both bounds are divided by
//           1000000 before printing (presumably Hz -> MHz; confirm)
static void pt_rng_Mhz(std::string title, rsmi_range *r) {
  assert(r != nullptr);

  const auto lowerScaled = r->lower_bound / 1000000;
  const auto upperScaled = r->upper_bound / 1000000;
  std::cout << title << std::endl;
  std::cout << "\t\t** " << lowerScaled << " to " << upperScaled
            << " MHz" << std::endl;
}
|
||||
|
||||
// Prints `title` on its own line, then the range as
// "\t\t** <lower> to <upper> mV" on stdout.
//   title - heading line
//   r     - range to print (asserted non-null); bounds are printed unscaled
static void pt_rng_mV(std::string title, rsmi_range *r) {
  assert(r != nullptr);

  const auto lower = r->lower_bound;
  const auto upper = r->upper_bound;
  std::cout << title << std::endl;
  std::cout << "\t\t** " << lower << " to " << upper << " mV" << std::endl;
}
|
||||
|
||||
// Prints one voltage-curve point to stdout: its frequency (divided by
// 1000000, suffixed "MHz") and its voltage (suffixed "mV"), one per line.
//   pt - point to print; dereferenced without a null check
static void print_pnt(rsmi_od_vddc_point_t *pt) {
  const auto frequencyScaled = pt->frequency / 1000000;
  std::cout << "\t\t** Frequency: " << frequencyScaled << "MHz" << std::endl;
  std::cout << "\t\t** Voltage: " << pt->voltage << "mV" << std::endl;
}
|
||||
// Prints the first RSMI_NUM_VOLTAGE_CURVE_POINTS entries of c->vc_points to
// stdout through print_pnt().
//   c - curve to print (asserted non-null)
static void pt_vddc_curve(rsmi_od_volt_curve *c) {
  assert(c != nullptr);

  uint32_t idx = 0;
  while (idx < RSMI_NUM_VOLTAGE_CURVE_POINTS) {
    print_pnt(&c->vc_points[idx]);
    ++idx;
  }
}
|
||||
|
||||
// Dumps an rsmi_od_volt_freq_data_t to stdout: the four clock ranges (in MHz),
// the frequency/voltage curve points, and the number of regions.
// `odv` must be non-null (asserted).
static void print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) {
  assert(odv != nullptr);

  // Switch std::cout to decimal output before printing any numbers.
  std::cout << std::dec;

  struct LabelledRange {
    const char *label;
    rsmi_range *range;
  };
  const LabelledRange clock_ranges[] = {
    {"\t\tCurrent SCLK frequency range:", &odv->curr_sclk_range},
    {"\t\tCurrent MCLK frequency range:", &odv->curr_mclk_range},
    {"\t\tMin/Max Possible SCLK frequency range:", &odv->sclk_freq_limits},
    {"\t\tMin/Max Possible MCLK frequency range:", &odv->mclk_freq_limits},
  };
  for (const LabelledRange &entry : clock_ranges) {
    pt_rng_Mhz(entry.label, entry.range);
  }

  std::cout << "\t\tCurrent Freq/Volt. curve:" << std::endl;
  pt_vddc_curve(&odv->curve);

  std::cout << "\tNumber of Freq./Volt. regions: " << odv->num_regions
            << std::endl;
}
|
||||
|
||||
// Prints one frequency/voltage region: its frequency range via pt_rng_Mhz()
// and its voltage range via pt_rng_mV().
//
// The address-of expressions are written as `&region->...`; the text had been
// corrupted to `®ion->...` (an HTML-entity decoding of "&reg"), which is not
// valid C++. `region` must be non-null; asserted like the sibling helpers.
static void print_odv_region(rsmi_freq_volt_region_t *region) {
  assert(region != nullptr);

  pt_rng_Mhz("\t\tFrequency range:", &region->freq_range);
  pt_rng_mV("\t\tVoltage range:", &region->volt_range);
}
|
||||
|
||||
static void print_rsmi_od_volt_freq_regions(uint32_t num_regions,
|
||||
rsmi_freq_volt_region_t *regions) {
|
||||
for (uint32_t i = 0; i < num_regions; ++i) {
|
||||
std::cout << "\tRegion " << i << ":" << std::endl;
|
||||
print_odv_region(®ions[i]);
|
||||
}
|
||||
}
|
||||
|
||||
void TestVoltCurvRead::Run(void) {
|
||||
rsmi_status_t err;
|
||||
rsmi_status_t err, ret;
|
||||
rsmi_od_volt_freq_data_t odv;
|
||||
rsmi_dev_perf_level_t pfl;
|
||||
|
||||
TestBase::Run();
|
||||
if (setup_failed_) {
|
||||
@@ -157,26 +99,57 @@ void TestVoltCurvRead::Run(void) {
|
||||
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
|
||||
PrintDeviceHeader(i);
|
||||
|
||||
std::cout << "\n\t**Resetting performance determinism to auto\n";
|
||||
err = rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_perf_level_set(i, RSMI_DEV_PERF_LEVEL_AUTO): "
|
||||
<< amd::smi::getRSMIStatusString(err, false)
|
||||
<< "\n";
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
ret = rsmi_dev_perf_level_get(i, &pfl);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): "
|
||||
<< amd::smi::getRSMIStatusString(ret, false) << "\n";
|
||||
}
|
||||
CHK_ERR_ASRT(ret)
|
||||
err = rsmi_dev_od_volt_info_get(i, &odv);
|
||||
if (err == RSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): "
|
||||
<< amd::smi::getRSMIStatusString(err, false)
|
||||
<< "\n"
|
||||
<< amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
|
||||
<< "\n";
|
||||
}
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout <<
|
||||
"\t**rsmi_dev_od_volt_info_get: Not supported on this machine"
|
||||
<< std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_od_volt_info_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_od_volt_info_get(i, nullptr);
|
||||
ASSERT_EQ(err, RSMI_STATUS_INVALID_ARGS);
|
||||
continue;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = rsmi_dev_od_volt_info_get(i, nullptr);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_info_get(i, nullptr): "
|
||||
<< amd::smi::getRSMIStatusString(err, false) << "\n";
|
||||
// << "\n"
|
||||
// << amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
|
||||
// << "\n";
|
||||
}
|
||||
ASSERT_TRUE(err == RSMI_STATUS_INVALID_ARGS);
|
||||
err = rsmi_dev_od_volt_info_get(i, &odv);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): "
|
||||
<< amd::smi::getRSMIStatusString(err, false) << "\n"
|
||||
<< amd::smi::print_rsmi_od_volt_freq_data_t(&odv)
|
||||
<< "\t**odv.num_regions = " << std::dec
|
||||
<< odv.num_regions << "\n";
|
||||
}
|
||||
|
||||
if (err == RSMI_STATUS_SUCCESS) {
|
||||
std::cout << "\t**Frequency-voltage curve data:" << std::endl;
|
||||
print_rsmi_od_volt_freq_data_t(&odv);
|
||||
std::cout << "\t**Frequency-voltage curve data:" << "\n";
|
||||
std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv);
|
||||
|
||||
rsmi_freq_volt_region_t *regions;
|
||||
uint32_t num_regions;
|
||||
@@ -185,11 +158,30 @@ void TestVoltCurvRead::Run(void) {
|
||||
|
||||
num_regions = odv.num_regions;
|
||||
err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions);
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_curve_regions_get("
|
||||
<< "i, &num_regions, regions): "
|
||||
<< amd::smi::getRSMIStatusString(err, false) << "\n"
|
||||
<< "\t**Number of regions: " << std::dec << num_regions
|
||||
<< "\n";
|
||||
}
|
||||
ASSERT_TRUE(err == RSMI_STATUS_SUCCESS
|
||||
|| err == RSMI_STATUS_NOT_SUPPORTED
|
||||
|| err == RSMI_STATUS_UNEXPECTED_DATA
|
||||
|| err == RSMI_STATUS_UNEXPECTED_SIZE);
|
||||
if (err != RSMI_STATUS_SUCCESS) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: "
|
||||
"Not supported on this machine" << std::endl;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
ASSERT_TRUE(num_regions == odv.num_regions);
|
||||
|
||||
std::cout << "\t**Frequency-voltage curve regions:" << std::endl;
|
||||
print_rsmi_od_volt_freq_regions(num_regions, regions);
|
||||
std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions,
|
||||
regions);
|
||||
|
||||
delete []regions;
|
||||
}
|
||||
|
||||
@@ -163,6 +163,14 @@ TEST(rsmitstReadOnly, TestPerfLevelRead) {
|
||||
TestPerfLevelRead tst;
|
||||
RunGenericTest(&tst);
|
||||
}
|
||||
// rsmitstReadWrite suite: constructs the compute-partition read/write test
// object and hands it to RunGenericTest().
TEST(rsmitstReadWrite, TestComputePartitionReadWrite) {
  TestComputePartitionReadWrite compute_partition_test;
  RunGenericTest(&compute_partition_test);
}
|
||||
// rsmitstReadWrite suite: constructs the memory-partition read/write test
// object and hands it to RunGenericTest().
TEST(rsmitstReadWrite, TestMemoryPartitionReadWrite) {
  TestMemoryPartitionReadWrite memory_partition_test;
  RunGenericTest(&memory_partition_test);
}
|
||||
TEST(rsmitstReadWrite, TestPerfLevelReadWrite) {
|
||||
TestPerfLevelReadWrite tst;
|
||||
RunGenericTest(&tst);
|
||||
@@ -267,14 +275,6 @@ TEST(rsmitstReadOnly, TestMutualExclusion) {
|
||||
tst.Run();
|
||||
RunCustomTestEpilog(&tst);
|
||||
}
|
||||
// rsmitstReadWrite suite: constructs the compute-partition read/write test
// object and hands it to RunGenericTest().
TEST(rsmitstReadWrite, TestComputePartitionReadWrite) {
  TestComputePartitionReadWrite compute_partition_test;
  RunGenericTest(&compute_partition_test);
}
|
||||
// rsmitstReadWrite suite: constructs the memory-partition read/write test
// object and hands it to RunGenericTest().
TEST(rsmitstReadWrite, TestMemoryPartitionReadWrite) {
  TestMemoryPartitionReadWrite memory_partition_test;
  RunGenericTest(&memory_partition_test);
}
|
||||
TEST(rsmitstReadWrite, TestEvtNotifReadWrite) {
|
||||
TestEvtNotifReadWrite tst;
|
||||
RunGenericTest(&tst);
|
||||
|
||||
Ссылка в новой задаче
Block a user