[SWDEV-529030/SWDEV-531217] Fix tests & output for partitioned configurations (CPX, DPX, QPX, etc.)
Changes:
- Updated AMD SMI firmware to display "N/A" for unavailable firmware in partitioned environments, improving clarity.
Example (in DPX):
$ amd-smi firmware
GPU: 0
FW_LIST:
...
FW 12:
FW_ID: PM
FW_VERSION: 00.86.39.00
GPU: 1
FW_LIST: N/A
- Fixed amd-smi partition not showing current partition information on
ASICs that are unable to set memory or accelerator partitions.
$ amd-smi partition -c -m
CURRENT_PARTITION:
GPU_ID MEMORY ACCELERATOR_TYPE ACCELERATOR_PROFILE_INDEX PARTITION_ID
0 NPS1 CPX 2 0
1 N/A N/A N/A 1
2 N/A N/A N/A 2
3 N/A N/A N/A 3
4 N/A N/A N/A 4
5 N/A N/A N/A 5
6 NPS1 SPX 0 0
7 NPS1 SPX 0 0
8 NPS1 SPX 0 0
MEMORY_PARTITION:
GPU_ID MEMORY_PARTITION_CAPS CURRENT_MEMORY_PARTITION
0 N/A NPS1
1 N/A N/A
2 N/A N/A
3 N/A N/A
4 N/A N/A
5 N/A N/A
6 N/A NPS1
7 N/A NPS1
8 N/A NPS1
- Refactored amd_smi_drm_example.cc:
- Grouped partition changes; the example now restores the original partition settings.
- Now handles partitioned environments, allowing the example to continue even if some APIs are not supported in partitioned configurations.
- Modified amdsmi_asic_info_t (see amdsmi_get_gpu_asic_info()) to report OAM ID as N/A if 0xFFFFFFFF (was 0xFFFF).
Allows for better handling of OAM IDs in partitioned environments (the OAM ID does not exist for non-primary nodes,
since it's a physical identifier). Easier to handle in tests and example code (i.e., now consistent with the max size of the structure's value).
- Introduced amdsmi_RAII_FD_handler() (internal API) to manage file descriptors using RAII, ensuring proper closure and preventing resource leaks.
Updated the following APIs to use this function:
- amdsmi_get_gpu_asic_info(), amdsmi_get_gpu_vram_usage(),
amdsmi_get_gpu_vram_info(), amdsmi_get_gpu_vbios_info(),
amdsmi_get_gpu_driver_info(), amdsmi_get_gpu_virtualization_mode()
- Updated AMD SMI test_base.cc/.h:
- Improved output and handling for partitioned environments.
- Added detailed ASIC information logging to align with structure changes.
- Enhanced error messages for better context before ASSERT checks.
- Resolved test failures in partitioned environments by updating
logic and handling for partition-specific configurations.
Fixed tests include:
- computepartition_read_write.cc, frequencies_read_write.cc,
gpu_metrics_read.cc, mem_util_read.cc, memorypartition_read_write.cc,
perf_level_read.cc, perf_level_read_write.cc, power_cap_read_write.cc,
power_read.cc, sys_info_read.cc, gpu_busy_read.cc
Change-Id: I36e903f8fddd714c74c719459c71aba8bbb77e6f
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Resetting head + adding fixes for tests run in partitions
Change-Id: I0c1e9ac07488b50c95f3bc6d8a724e67d2c715dc
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
[ROCm/amdsmi commit: 391451752b]
Этот коммит содержится в:
коммит произвёл
Arif, Maisam
родитель
f12b070e14
Коммит
df6de25624
@@ -323,7 +323,10 @@ class AMDSMILogger():
|
||||
if isinstance(value, dict):
|
||||
yaml_string += " " * indent + f"{key}:\n" + self.custom_dump(value, indent + 1)
|
||||
elif isinstance(value, list):
|
||||
yaml_string += " " * indent + f"{key}:\n"
|
||||
if not value:
|
||||
yaml_string += " " * indent + f"{key}: N/A\n"
|
||||
elif isinstance(value, dict):
|
||||
yaml_string += " " * indent + f"{key}:\n"
|
||||
for item in value:
|
||||
if isinstance(item, dict):
|
||||
yaml_string += self.custom_dump(item, indent + 1)
|
||||
|
||||
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
@@ -845,7 +845,7 @@ typedef struct {
|
||||
uint64_t device_id; //!< The device ID of a GPU
|
||||
uint32_t rev_id; //!< The revision ID of a GPU
|
||||
char asic_serial[AMDSMI_MAX_STRING_LENGTH];
|
||||
uint32_t oam_id; //!< 0xFFFF if not supported
|
||||
uint32_t oam_id; //!< 0xFFFFFFFF if not supported
|
||||
uint32_t num_of_compute_units; //!< 0xFFFFFFFF if not supported
|
||||
uint64_t target_graphics_version; //!< 0xFFFFFFFFFFFFFFFF if not supported
|
||||
uint32_t subsystem_id; //!> The subsystem ID
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_gpu_device.h"
|
||||
@@ -58,6 +59,24 @@ std::string smi_split_string(std::string str, char delim);
|
||||
std::string smi_amdgpu_get_status_string(amdsmi_status_t ret, bool fullStatus);
|
||||
amdsmi_status_t smi_clear_char_and_reinitialize(char buffer[], uint32_t len,
|
||||
std::string newString);
|
||||
|
||||
/**
|
||||
* @brief Opens a file descriptor for the specified path with RAII semantics and caching.
|
||||
*
|
||||
* This function attempts to open a file descriptor (FD) for the given file path and flags.
|
||||
* It maintains a cache of weak pointers to previously opened FDs, allowing for reuse of
|
||||
* file descriptors if they are still valid. If a valid FD for the path exists in the cache,
|
||||
* it is reused; otherwise, a new FD is opened. The returned FD is managed by a std::shared_ptr
|
||||
* with a custom deleter that ensures the FD is properly closed when no longer in use.
|
||||
*
|
||||
* Thread safety is ensured via a static mutex.
|
||||
*
|
||||
* @param path The file system path to open.
|
||||
* @param flags Flags to use when opening the file (as per open(2)).
|
||||
* @return std::shared_ptr<int> Shared pointer managing the file descriptor, or nullptr on failure.
|
||||
*/
|
||||
std::shared_ptr<int> amdsmi_RAII_FD_handler(const std::string& path, int flags);
|
||||
|
||||
/**
|
||||
* @brief Wait for user input, a debugging function to pause the program
|
||||
*
|
||||
|
||||
@@ -1879,18 +1879,24 @@ def amdsmi_get_gpu_asic_info(
|
||||
|
||||
market_name = _pad_hex_value(asic_info_struct.market_name.decode("utf-8"), 4)
|
||||
target_graphics_version = hex(asic_info_struct.target_graphics_version)[2:]
|
||||
subsystem_id = _validate_if_max_uint(asic_info_struct.subsystem_id, MaxUIntegerTypes.UINT32_T)
|
||||
subvendor_id = _validate_if_max_uint(asic_info_struct.subvendor_id, MaxUIntegerTypes.UINT32_T)
|
||||
if subsystem_id is not "N/A":
|
||||
subsystem_id = _pad_hex_value(hex(subsystem_id), 4)
|
||||
if subvendor_id is not "N/A":
|
||||
subvendor_id = _pad_hex_value(hex(subvendor_id), 4)
|
||||
asic_info = {
|
||||
"market_name": market_name,
|
||||
"vendor_id": asic_info_struct.vendor_id,
|
||||
"vendor_name": asic_info_struct.vendor_name.decode("utf-8"),
|
||||
"subvendor_id": asic_info_struct.subvendor_id,
|
||||
"subvendor_id": subvendor_id,
|
||||
"device_id": asic_info_struct.device_id,
|
||||
"rev_id": _pad_hex_value(hex(asic_info_struct.rev_id), 2),
|
||||
"asic_serial": asic_info_struct.asic_serial.decode("utf-8"),
|
||||
"oam_id": asic_info_struct.oam_id,
|
||||
"num_compute_units": asic_info_struct.num_of_compute_units,
|
||||
"oam_id": _validate_if_max_uint(asic_info_struct.oam_id, MaxUIntegerTypes.UINT32_T),
|
||||
"num_compute_units": _validate_if_max_uint(asic_info_struct.num_of_compute_units, MaxUIntegerTypes.UINT32_T),
|
||||
"target_graphics_version": "gfx" + target_graphics_version,
|
||||
"subsystem_id": asic_info_struct.subsystem_id
|
||||
"subsystem_id": subsystem_id
|
||||
}
|
||||
|
||||
string_values = ["market_name", "vendor_name"]
|
||||
@@ -1898,7 +1904,7 @@ def amdsmi_get_gpu_asic_info(
|
||||
if not asic_info[value]:
|
||||
asic_info[value] = "N/A"
|
||||
|
||||
hex_values = ["vendor_id", "subvendor_id", "device_id", "subsystem_id"]
|
||||
hex_values = ["vendor_id", "device_id"]
|
||||
for value in hex_values:
|
||||
if asic_info[value]:
|
||||
asic_info[value] = hex(asic_info[value])
|
||||
@@ -1913,14 +1919,6 @@ def amdsmi_get_gpu_asic_info(
|
||||
else:
|
||||
asic_info["asic_serial"] = "N/A"
|
||||
|
||||
# Check for max value as a sign for not applicable
|
||||
if asic_info["oam_id"] == 0xFFFF: # uint 16 max
|
||||
asic_info["oam_id"] = "N/A"
|
||||
|
||||
# Check for max value as a sign for not applicable
|
||||
if asic_info["num_compute_units"] == 0xFFFFFFFF: # uint 32 max
|
||||
asic_info["num_compute_units"] = "N/A"
|
||||
|
||||
# Remove commas from vendor name for clean output
|
||||
asic_info["vendor_name"] = asic_info["vendor_name"].replace(',', '')
|
||||
|
||||
@@ -2834,9 +2832,9 @@ def amdsmi_get_fw_info(
|
||||
'fw_name': fw_name,
|
||||
'fw_version': fw_version_string.upper(),
|
||||
})
|
||||
return {
|
||||
'fw_list': firmwares
|
||||
}
|
||||
return_dict = {'fw_list': firmwares}
|
||||
# logging.debug("amdsmi_interface.py | amdsmi_get_fw_info | return_dictionary = \n" + str(json.dumps(return_dict, indent=4)))
|
||||
return return_dict
|
||||
|
||||
|
||||
def amdsmi_get_gpu_vram_usage(
|
||||
@@ -3314,6 +3312,11 @@ def amdsmi_get_gpu_memory_partition_config(processor_handle: amdsmi_wrapper.amds
|
||||
mem_caps_list.append("NPS4")
|
||||
if config.partition_caps.nps_flags.nps8_cap == 1:
|
||||
mem_caps_list.append("NPS8")
|
||||
if (config.partition_caps.nps_flags.nps1_cap == 0 and
|
||||
config.partition_caps.nps_flags.nps2_cap == 0 and
|
||||
config.partition_caps.nps_flags.nps4_cap == 0 and
|
||||
config.partition_caps.nps_flags.nps8_cap == 0):
|
||||
mem_caps_list.append("N/A")
|
||||
|
||||
return_dict = {
|
||||
"partition_caps": mem_caps_list,
|
||||
@@ -3421,6 +3424,11 @@ def amdsmi_get_gpu_accelerator_partition_profile(
|
||||
mem_caps_list.append("NPS4")
|
||||
if profile.memory_caps.nps_flags.nps8_cap == 1:
|
||||
mem_caps_list.append("NPS8")
|
||||
if (profile.memory_caps.nps_flags.nps1_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps2_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps4_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps8_cap == 0):
|
||||
mem_caps_list.append("N/A")
|
||||
partition_profile_dict = {
|
||||
"profile_type" : profile_type_ret,
|
||||
"num_partitions" : profile.num_partitions,
|
||||
@@ -3473,6 +3481,11 @@ def amdsmi_get_gpu_accelerator_partition_profile_config(processor_handle: amdsmi
|
||||
mem_caps_list.append("NPS4")
|
||||
if profile.memory_caps.nps_flags.nps8_cap == 1:
|
||||
mem_caps_list.append("NPS8")
|
||||
if (profile.memory_caps.nps_flags.nps1_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps2_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps4_cap == 0 and
|
||||
profile.memory_caps.nps_flags.nps8_cap == 0):
|
||||
mem_caps_list.append("N/A")
|
||||
|
||||
for r in range(config.num_resource_profiles):
|
||||
# logging.debug("\namdsmi_interface.py | amdsmi_get_gpu_accelerator_partition_profile_config | i = " + str(i) + "; r = " + str(r) + "; resource_idx = " + str(resource_idx))
|
||||
|
||||
@@ -344,10 +344,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
|
||||
return amd::smi::ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
if (val_str.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: SYSFS read was empty"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
if (!amd::smi::IsInteger(val_str)) {
|
||||
std::ostringstream ss;
|
||||
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
|
||||
LOG_ERROR(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: Expected integer value from monitor, but got "<< val_str
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
@@ -374,10 +395,31 @@ static rsmi_status_t get_dev_mon_value(amd::smi::MonitorTypes type,
|
||||
return amd::smi::ErrnoToRsmiStatus(ret);
|
||||
}
|
||||
|
||||
if (val_str.empty()) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: SYSFS read was empty"
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
if (!amd::smi::IsInteger(val_str)) {
|
||||
std::ostringstream ss;
|
||||
ss << "Expected integer value from monitor, but got \"" << val_str << "\"";
|
||||
LOG_ERROR(ss);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Type: " << monitorTypesToString.at(type)
|
||||
<< " | Cause: Expected integer value from monitor, but got "<< val_str
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA) << " |";
|
||||
LOG_INFO(ss);
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
@@ -806,12 +848,13 @@ rsmi_topo_numa_affinity_get(uint32_t dv_ind, int32_t *numa_node) {
|
||||
TRY
|
||||
rsmi_status_t ret;
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY(numa_node)
|
||||
|
||||
DEVICE_MUTEX
|
||||
if (!numa_node) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
std::string str_val;
|
||||
ret = get_dev_value_str(amd::smi::kDevNumaNode, dv_ind, &str_val);
|
||||
if (ret != RSMI_STATUS_SUCCESS){
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
return ret;
|
||||
}
|
||||
*numa_node = std::stoi(str_val, nullptr);
|
||||
@@ -1060,7 +1103,11 @@ rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
CHK_SUPPORT_NAME_ONLY(id)
|
||||
return get_id(dv_ind, amd::smi::kDevSubSysDevID, id);
|
||||
auto ret = get_id(dv_ind, amd::smi::kDevSubSysDevID, id);
|
||||
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
|
||||
<< ", reporting " << amd::smi::getRSMIStatusString(ret, false);
|
||||
LOG_INFO(ss);
|
||||
return ret;
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
@@ -1069,6 +1116,9 @@ rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
if (!id) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_NAME_ONLY(id)
|
||||
int ret_kfd = 0;
|
||||
uint32_t node_id;
|
||||
@@ -1143,8 +1193,11 @@ rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) {
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY(perf)
|
||||
DEVICE_MUTEX
|
||||
if (!perf) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_NAME_ONLY(perf)
|
||||
|
||||
rsmi_status_t ret = get_dev_value_str(amd::smi::kDevPerfLevel, dv_ind,
|
||||
&val_str);
|
||||
@@ -2811,17 +2864,17 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) {
|
||||
std::ostringstream ss;
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
CHK_SUPPORT_NAME_ONLY(name)
|
||||
|
||||
if (len == 0) {
|
||||
if (len == 0 || !name) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_NAME_ONLY(name)
|
||||
|
||||
DEVICE_MUTEX
|
||||
|
||||
ret = get_dev_name_from_file(dv_ind, name, len);
|
||||
|
||||
if (ret || name[0] == '\0' || !isprint(name[0]) ) {
|
||||
if (ret || name[0] == '\0' || !isprint(name[0])) {
|
||||
ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_DEVICE);
|
||||
}
|
||||
|
||||
@@ -3850,6 +3903,9 @@ rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) {
|
||||
LOG_TRACE(ss);
|
||||
|
||||
++sensor_ind; // power sysfs files have 1-based indices
|
||||
if (!cap) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_SUBVAR_ONLY(cap, sensor_ind)
|
||||
|
||||
rsmi_status_t ret;
|
||||
@@ -3870,6 +3926,9 @@ rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind,
|
||||
LOG_TRACE(ss);
|
||||
|
||||
++sensor_ind; // power sysfs files have 1-based indices
|
||||
if (max == nullptr || min == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
CHK_SUPPORT_SUBVAR_ONLY((min == nullptr || max == nullptr ?nullptr : min),
|
||||
sensor_ind)
|
||||
rsmi_status_t ret;
|
||||
@@ -3993,6 +4052,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
}
|
||||
|
||||
DEVICE_MUTEX
|
||||
*total = 0; // Initialize total to 0
|
||||
// This is needed to avoid returning garbage value in case of failure
|
||||
ret = get_dev_value_int(mem_type_file, dv_ind, total);
|
||||
|
||||
// Fallback to KFD reported memory if VRAM total is 0 or sysfs read fails
|
||||
@@ -4070,6 +4131,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type,
|
||||
}
|
||||
|
||||
DEVICE_MUTEX
|
||||
*used = 0; // Initialize used to 0
|
||||
// This is needed to avoid returning garbage value in case of failure
|
||||
ret = get_dev_value_int(mem_type_file, dv_ind, used);
|
||||
|
||||
// Fallback to KFD reported memory if no VRAM or sysfs read fails
|
||||
@@ -4652,10 +4715,8 @@ rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
|
||||
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
|
||||
LOG_TRACE(ss);
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY(unique_id)
|
||||
|
||||
DEVICE_MUTEX
|
||||
if (unique_id == nullptr) {
|
||||
if (!unique_id) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
*unique_id = std::numeric_limits<uint64_t>::max();
|
||||
|
||||
@@ -806,7 +806,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) {
|
||||
if (ret != 0 || !reg_file) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Adjusted file path also does not exist - SYSFS file ("
|
||||
<< sysfs_path
|
||||
<< sysfs_path
|
||||
<< ") for DevInfoInfoType (" << get_type_string(type)
|
||||
<< "), returning " << std::to_string(ret);
|
||||
LOG_ERROR(ss);
|
||||
@@ -865,8 +865,8 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
ret = openDebugFileStream(type, &fs);
|
||||
if (ret != 0) {
|
||||
ss << "Could not read debugInfoStr for DevInfoType ("
|
||||
<< get_type_string(type)<< "), returning "
|
||||
<< std::to_string(ret);
|
||||
<< get_type_string(type) << "), returning "
|
||||
<< std::to_string(ret);
|
||||
LOG_ERROR(ss);
|
||||
return ret;
|
||||
}
|
||||
@@ -879,7 +879,7 @@ int Device::readDebugInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
fs.close();
|
||||
|
||||
ss << "Successfully read debugInfoStr for DevInfoType ("
|
||||
<< get_type_string(type)<< "), retString= " << *retStr;
|
||||
<< get_type_string(type) << "), retString= " << *retStr;
|
||||
LOG_INFO(ss);
|
||||
|
||||
return 0;
|
||||
@@ -904,8 +904,8 @@ int Device::readDevInfoStr(DevInfoTypes type, std::string *retStr) {
|
||||
fs >> *retStr;
|
||||
fs.close();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< "Successfully read device info string for DevInfoType (" <<
|
||||
get_type_string(type) << "): " + *retStr
|
||||
<< "Successfully read device info string for DevInfoType ("
|
||||
<< get_type_string(type) << "): " + *retStr
|
||||
<< " | "
|
||||
<< (fs.is_open() ? " File stream is opened" : " File stream is closed")
|
||||
<< " | " << (fs.bad() ? "[ERROR] Bad read operation" :
|
||||
@@ -1078,7 +1078,6 @@ const char* Device::get_type_string(DevInfoTypes type) {
|
||||
}
|
||||
|
||||
return "Unknown";
|
||||
|
||||
}
|
||||
|
||||
int Device::readDevInfoBinary(DevInfoTypes type, std::size_t b_size,
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
|
||||
#include <dirent.h>
|
||||
#include <pthread.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
@@ -4537,8 +4538,24 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
|
||||
dev->set_smi_partition_id(0);
|
||||
}
|
||||
|
||||
dev->dev_log_gpu_metrics(ostrstream);
|
||||
// check if file exists, report not supported if it does not exist
|
||||
std::string file_name = "/sys/class/drm/card"
|
||||
+ std::to_string(dev->index())
|
||||
+ "/device/gpu_metrics";
|
||||
if (access(file_name.c_str(), F_OK | R_OK) != 0) {
|
||||
status_code = RSMI_STATUS_NOT_SUPPORTED;
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | ======= end ======= "
|
||||
<< " | Fail "
|
||||
<< " | Device #: " << dv_ind
|
||||
<< " | Returning = "
|
||||
<< getRSMIStatusString(status_code, false)
|
||||
<< " |";
|
||||
LOG_ERROR(ss);
|
||||
return status_code;
|
||||
}
|
||||
|
||||
dev->dev_log_gpu_metrics(ostrstream);
|
||||
const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics();
|
||||
if (error_code != rsmi_status_t::RSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
|
||||
@@ -898,24 +898,28 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand
|
||||
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | open(" << path << ") returned: " << strerror(errno) << "\n"
|
||||
<< " | drm_fd: " << std::dec << drm_fd << "\n"
|
||||
<< " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n"
|
||||
<< " | render_name: " << render_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
amdsmi_status_t status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2: " << strerror(errno)
|
||||
@@ -938,7 +942,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand
|
||||
status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol"
|
||||
@@ -950,7 +953,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand
|
||||
<< " | drmCommandWrite symbol loaded successfully";
|
||||
LOG_INFO(ss);
|
||||
|
||||
|
||||
uint64_t total = 0;
|
||||
r = rsmi_wrapper(rsmi_dev_memory_total_get, processor_handle, 0,
|
||||
RSMI_MEM_TYPE_VRAM, &total);
|
||||
@@ -964,10 +966,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand
|
||||
request.return_pointer = reinterpret_cast<unsigned long long>(&vram_used);
|
||||
request.return_size = sizeof(vram_used);
|
||||
request.query = AMDGPU_INFO_VRAM_USAGE;
|
||||
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
if (drm_write != 0) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue - drm_write failed, drm_write (AMDGPU_INFO_VRAM_USAGE): "
|
||||
@@ -978,7 +979,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_hand
|
||||
}
|
||||
|
||||
vram_info->vram_used = static_cast<uint32_t>(vram_used / (1024 * 1024));
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | vram_info->vram_total (MB): " << std::dec << vram_info->vram_total << "\n"
|
||||
@@ -1531,6 +1531,19 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
uint16_t subvendor_id = 0;
|
||||
uint16_t device_id = 0;
|
||||
uint16_t subsystem_id = 0;
|
||||
char temp_market_name[AMDSMI_MAX_STRING_LENGTH] = {0};
|
||||
smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH, temp_market_name);
|
||||
info->market_name[0] = '\0';
|
||||
info->vendor_id = std::numeric_limits<uint32_t>::max();
|
||||
info->vendor_name[0] = '\0';
|
||||
info->subvendor_id = std::numeric_limits<uint32_t>::max();
|
||||
info->device_id = std::numeric_limits<uint64_t>::max();
|
||||
info->rev_id = std::numeric_limits<uint16_t>::max();
|
||||
info->asic_serial[0] = '\0';
|
||||
info->oam_id = std::numeric_limits<uint32_t>::max();
|
||||
info->num_of_compute_units = std::numeric_limits<uint32_t>::max();
|
||||
info->target_graphics_version = std::numeric_limits<uint64_t>::max();
|
||||
info->subsystem_id = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
std::ostringstream ss;
|
||||
amd::smi::AMDSmiGPUDevice* gpu_device = nullptr;
|
||||
@@ -1539,80 +1552,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
return r;
|
||||
}
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
amdsmi_status_t status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0,
|
||||
info->market_name, AMDSMI_MAX_STRING_LENGTH);
|
||||
}
|
||||
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | open(" << path << ") returned: " << strerror(errno) << "\n"
|
||||
<< " | drm_fd: " << std::dec << drm_fd << "\n"
|
||||
<< " | render_name: " << render_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2: " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
// extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
|
||||
// void *data, unsigned long size);
|
||||
typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
|
||||
void *data, unsigned long size);
|
||||
drmCommandWrite_t drmCommandWrite = nullptr;
|
||||
|
||||
// load symbol from libdrm
|
||||
status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol"
|
||||
<< " | Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Get the device info
|
||||
memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device));
|
||||
struct drm_amdgpu_info request = {};
|
||||
memset(&request, 0, sizeof(request));
|
||||
request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
|
||||
request.return_size = sizeof(struct drm_amdgpu_info_device);
|
||||
request.query = AMDGPU_INFO_DEV_INFO;
|
||||
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
if (drm_write != 0) {
|
||||
libdrm.unload();
|
||||
close(drm_fd);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n"
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_DRM_ERROR;
|
||||
}
|
||||
// TODO(cpoag): check if this is correct, might be able to go through KGD/KFD
|
||||
info->rev_id = static_cast<uint32_t>(dev_info.pci_rev);
|
||||
libdrm.unload();
|
||||
close(drm_fd);
|
||||
|
||||
/**
|
||||
* For other sysfs related information, get from rocm-smi
|
||||
@@ -1622,7 +1561,8 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
std::string max_uint64_str = "ffffffffffffffff";
|
||||
smi_clear_char_and_reinitialize(info->asic_serial, AMDSMI_MAX_STRING_LENGTH, max_uint64_str);
|
||||
uint64_t device_uuid = 0;
|
||||
status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0, &device_uuid);
|
||||
amdsmi_status_t status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, 0,
|
||||
&device_uuid);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
ss.clear();
|
||||
ss << std::hex << std::setw(16) << std::setfill('0') << device_uuid;
|
||||
@@ -1647,31 +1587,32 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
&subsystem_id);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) info->subsystem_id = subsystem_id;
|
||||
|
||||
char temp_vendor_name[AMDSMI_MAX_STRING_LENGTH] = {0};
|
||||
status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, 0,
|
||||
info->vendor_name, AMDSMI_MAX_STRING_LENGTH);
|
||||
temp_vendor_name, AMDSMI_MAX_STRING_LENGTH);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH,
|
||||
temp_vendor_name);
|
||||
}
|
||||
|
||||
// default to 0xffff as not supported
|
||||
info->oam_id = std::numeric_limits<uint16_t>::max();
|
||||
uint16_t tmp_oam_id = 0;
|
||||
status = rsmi_wrapper(rsmi_dev_xgmi_physical_id_get, processor_handle, 0,
|
||||
&(tmp_oam_id));
|
||||
info->oam_id = tmp_oam_id;
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
info->oam_id = tmp_oam_id;
|
||||
}
|
||||
|
||||
// default to 0xffffffff as not supported
|
||||
info->num_of_compute_units = std::numeric_limits<uint32_t>::max();
|
||||
auto tmp_num_of_compute_units = uint32_t(0);
|
||||
status = rsmi_wrapper(amd::smi::rsmi_dev_number_of_computes_get, processor_handle, 0,
|
||||
&(tmp_num_of_compute_units));
|
||||
if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) {
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
info->num_of_compute_units = tmp_num_of_compute_units;
|
||||
}
|
||||
|
||||
// default to 0xffffffffffffffff as not supported
|
||||
info->target_graphics_version = std::numeric_limits<uint64_t>::max();
|
||||
auto tmp_target_gfx_version = uint64_t(0);
|
||||
status = rsmi_wrapper(rsmi_dev_target_graphics_version_get, processor_handle, 0,
|
||||
&(tmp_target_gfx_version));
|
||||
if (status == amdsmi_status_t::AMDSMI_STATUS_SUCCESS) {
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
info->target_graphics_version = tmp_target_gfx_version;
|
||||
}
|
||||
|
||||
@@ -1685,16 +1626,12 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
LOG_INFO(ss);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
info->device_id = static_cast<uint64_t>(device_id);
|
||||
} else {
|
||||
info->device_id = std::numeric_limits<uint64_t>::max();
|
||||
}
|
||||
info->rev_id = dev_info.pci_rev;
|
||||
status = rsmi_wrapper(rsmi_dev_vendor_id_get, processor_handle, 0,
|
||||
&vendor_id);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
info->vendor_id = vendor_id;
|
||||
} else {
|
||||
info->vendor_id = std::numeric_limits<uint32_t>::max();
|
||||
}
|
||||
|
||||
// If vendor name is empty and the vendor id is 0x1002, set vendor name to AMD vendor string
|
||||
@@ -1703,6 +1640,95 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i
|
||||
smi_clear_char_and_reinitialize(info->vendor_name, AMDSMI_MAX_STRING_LENGTH, amd_name);
|
||||
}
|
||||
|
||||
status = smi_amdgpu_get_market_name_from_dev_id(gpu_device, info->market_name);
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
status = rsmi_wrapper(rsmi_dev_brand_get, processor_handle, 0,
|
||||
temp_market_name, AMDSMI_MAX_STRING_LENGTH);
|
||||
if (status == AMDSMI_STATUS_SUCCESS) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_brand_get() returned: "
|
||||
<< smi_amdgpu_get_status_string(status, false) << "\n"
|
||||
<< " ; temp_market_name: " << temp_market_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
smi_clear_char_and_reinitialize(info->market_name, AMDSMI_MAX_STRING_LENGTH,
|
||||
temp_market_name);
|
||||
} else {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | rsmi_dev_brand_get() failed: "
|
||||
<< smi_amdgpu_get_status_string(status, false) << "\n";
|
||||
LOG_INFO(ss);
|
||||
}
|
||||
}
|
||||
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | open(" << path << ") returned: " << strerror(errno) << "\n"
|
||||
<< " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n"
|
||||
<< " | render_name: " << render_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2: " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
// extern int drmCommandWrite(int fd, unsigned long drmCommandIndex,
|
||||
// void *data, unsigned long size);
|
||||
typedef int (*drmCommandWrite_t)(int fd, unsigned long drmCommandIndex,
|
||||
void *data, unsigned long size);
|
||||
drmCommandWrite_t drmCommandWrite = nullptr;
|
||||
|
||||
// load symbol from libdrm
|
||||
status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol"
|
||||
<< " | Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
|
||||
// Get the device info
|
||||
memset(&dev_info, 0, sizeof(struct drm_amdgpu_info_device));
|
||||
struct drm_amdgpu_info request = {};
|
||||
memset(&request, 0, sizeof(request));
|
||||
request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
|
||||
request.return_size = sizeof(struct drm_amdgpu_info_device);
|
||||
request.query = AMDGPU_INFO_DEV_INFO;
|
||||
auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
if (drm_write != 0) {
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n"
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_DRM_ERROR;
|
||||
}
|
||||
// TODO(cpoag): check if this is correct, might be able to go through KGD/KFD
|
||||
info->rev_id = static_cast<uint32_t>(dev_info.pci_rev);
|
||||
libdrm.unload();
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | info->market_name: " << info->market_name << "\n"
|
||||
<< " | info->vendor_id (dec): " << std::dec << info->vendor_id << "\n"
|
||||
@@ -1862,24 +1888,28 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | open(" << path << ") returned: " << strerror(errno) << "\n"
|
||||
<< " | drm_fd: " << std::dec << drm_fd << "\n"
|
||||
<< " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n"
|
||||
<< " | render_name: " << render_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
amdsmi_status_t status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2: " << strerror(errno)
|
||||
@@ -1902,7 +1932,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
status = libdrm.load_symbol(reinterpret_cast<drmCommandWrite_t *>(&drmCommandWrite),
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol"
|
||||
@@ -1921,10 +1950,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
|
||||
request.return_size = sizeof(struct drm_amdgpu_info_device);
|
||||
request.query = AMDGPU_INFO_DEV_INFO;
|
||||
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
if (drm_write != 0) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Issue - drm_write failed, drm_write: " << std::dec << drm_write << "\n"
|
||||
@@ -1935,8 +1963,9 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
|
||||
info->vram_type = amd::smi::vram_type_value(dev_info.vram_type);
|
||||
info->vram_bit_width = dev_info.vram_bit_width;
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
// if vram type is greater than the max enum set it to unknown
|
||||
if (info->vram_type > AMDSMI_VRAM_TYPE__MAX) info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;
|
||||
|
||||
// set info->vram_max_bandwidth to gpu_metrics vram_max_bandwidth if it is not set
|
||||
amdsmi_gpu_metrics_t metric_info = {};
|
||||
@@ -1945,10 +1974,6 @@ amdsmi_status_t amdsmi_get_gpu_vram_info(
|
||||
info->vram_max_bandwidth = metric_info.vram_max_bandwidth;
|
||||
}
|
||||
|
||||
// if vram type is greater than the max enum set it to unknown
|
||||
if (info->vram_type > AMDSMI_VRAM_TYPE__MAX)
|
||||
info->vram_type = AMDSMI_VRAM_TYPE_UNKNOWN;
|
||||
|
||||
// map the vendor name to enum
|
||||
char brand[256] = {'\0'};
|
||||
r = rsmi_wrapper(rsmi_dev_vram_vendor_get, processor_handle, 0, brand, 255);
|
||||
@@ -2949,8 +2974,8 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h
|
||||
constexpr uint32_t kCurrentPartitionSize = 5;
|
||||
char current_partition[kCurrentPartitionSize];
|
||||
std::string current_partition_str = "N/A";
|
||||
status = amdsmi_get_gpu_compute_partition(processor_handle, current_partition,
|
||||
kCurrentPartitionSize);
|
||||
amdsmi_status_t compute_status = amdsmi_get_gpu_compute_partition(processor_handle,
|
||||
current_partition, kCurrentPartitionSize);
|
||||
ss << __PRETTY_FUNCTION__ << " | amdsmi_get_gpu_compute_partition() current_partition = |"
|
||||
<< current_partition << "|";
|
||||
LOG_DEBUG(ss);
|
||||
@@ -2965,7 +2990,8 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h
|
||||
if (accelerator_capabilities.find(current_partition_str) != std::string::npos) {
|
||||
auto it = std::find(tokens.begin(), tokens.end(), current_partition_str);
|
||||
if (it != tokens.end()) {
|
||||
profile->profile_index = static_cast<uint32_t>(std::distance(tokens.begin(), it));
|
||||
profile->profile_index = static_cast<uint32_t>(std::distance(
|
||||
tokens.begin(), it));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3056,7 +3082,7 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h
|
||||
profile->memory_caps = flags;
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | END returning " << smi_amdgpu_get_status_string(status, false) << "\n"
|
||||
<< " | END returning " << smi_amdgpu_get_status_string(compute_status, false) << "\n"
|
||||
<< " | accelerator_capabilities: " << accelerator_capabilities << "\n"
|
||||
<< " | current_partition_str: " << current_partition_str << "\n"
|
||||
<< " | std::vector<std::string> tokens: " << ss_1.str() << "\n"
|
||||
@@ -3072,7 +3098,9 @@ amdsmi_get_gpu_accelerator_partition_profile(amdsmi_processor_handle processor_h
|
||||
<< " | partition_id: " << ss_2.str();
|
||||
LOG_INFO(ss);
|
||||
|
||||
return status;
|
||||
return compute_status; // only return status from amdsmi_get_gpu_compute_partition
|
||||
// as this is the only function that can fail
|
||||
// if the device does not support partitions
|
||||
}
|
||||
|
||||
amdsmi_status_t
|
||||
@@ -3373,7 +3401,9 @@ amdsmi_status_t
|
||||
amdsmi_status_t amdsmi_get_gpu_perf_level(amdsmi_processor_handle processor_handle,
|
||||
amdsmi_dev_perf_level_t *perf) {
|
||||
AMDSMI_CHECK_INIT();
|
||||
// nullptr api supported
|
||||
if (!perf) {
|
||||
return AMDSMI_STATUS_INVAL;
|
||||
}
|
||||
|
||||
return rsmi_wrapper(rsmi_dev_perf_level_get, processor_handle, 0,
|
||||
reinterpret_cast<rsmi_dev_perf_level_t*>(perf));
|
||||
@@ -3681,6 +3711,9 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id(
|
||||
|
||||
// Validates the caller-supplied output pointer, then forwards the request to
// rsmi_topo_numa_affinity_get through rsmi_wrapper (index argument 0).
//
// processor_handle: handle passed through unchanged to rsmi_wrapper.
// numa_node:        output location handed to the wrapped call; must not be null.
//
// Returns AMDSMI_STATUS_INVAL when numa_node is null, otherwise whatever
// rsmi_wrapper returns.
amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(
    amdsmi_processor_handle processor_handle, int32_t *numa_node) {
    // Reject a null output pointer before touching the wrapped API.
    if (numa_node == nullptr) {
        return AMDSMI_STATUS_INVAL;
    }

    const auto wrapped_status =
        rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, 0, numa_node);
    return wrapped_status;
}
|
||||
@@ -3716,24 +3749,28 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios
|
||||
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex());
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | open(" << path << ") returned: " << strerror(errno) << "\n"
|
||||
<< " | drm_fd: " << std::dec << drm_fd << "\n"
|
||||
<< " | drm_fd: " << (drm_fd == nullptr ? "nullptr" : std::to_string(*drm_fd)) << "\n"
|
||||
<< " | render_name: " << render_name << "\n";
|
||||
LOG_INFO(ss);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2: " << strerror(errno)
|
||||
@@ -3757,7 +3794,6 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
libdrm.unload();
|
||||
close(drm_fd);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol"
|
||||
<< " | Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
@@ -3775,7 +3811,7 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios
|
||||
request.return_size = sizeof(drm_amdgpu_info_vbios);
|
||||
request.query = AMDGPU_INFO_VBIOS;
|
||||
request.vbios_info.type = AMDGPU_INFO_VBIOS_INFO;
|
||||
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
|
||||
if (drm_write == 0) {
|
||||
@@ -3799,7 +3835,6 @@ amdsmi_get_gpu_vbios_info(amdsmi_processor_handle processor_handle, amdsmi_vbios
|
||||
vbios_version, AMDSMI_MAX_STRING_LENGTH);
|
||||
}
|
||||
}
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | drmCommandWrite returned: " << strerror(errno) << "\n"
|
||||
@@ -4283,19 +4318,22 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2"
|
||||
@@ -4313,7 +4351,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
status = libdrm.load_symbol(
|
||||
reinterpret_cast<drmGetVersion_t *>(&drm_get_version), "drmGetVersion");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmGetVersion symbol"
|
||||
@@ -4324,7 +4361,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
status = libdrm.load_symbol(
|
||||
reinterpret_cast<drmGetVersion_t *>(&drm_free_version), "drmFreeVersion");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmFreeVersion symbol"
|
||||
@@ -4335,9 +4371,8 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
|
||||
// Get the driver date
|
||||
std::string driver_date;
|
||||
auto version = drm_get_version(drm_fd);
|
||||
auto version = drm_get_version(*drm_fd);
|
||||
if (version == nullptr) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to get driver version"
|
||||
@@ -4358,7 +4393,6 @@ amdsmi_status_t amdsmi_get_gpu_driver_info(amdsmi_processor_handle processor_han
|
||||
std::string driver_name = version->name;
|
||||
strncpy(info->driver_name, driver_name.c_str(), AMDSMI_MAX_STRING_LENGTH-1);
|
||||
drm_free_version(version);
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Driver version: " << info->driver_version << "\n"
|
||||
@@ -4402,9 +4436,9 @@ amdsmi_status_t amdsmi_get_pcie_info(amdsmi_processor_handle processor_handle, a
|
||||
} else {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open file: " << path_max_link_width
|
||||
<< " | returning AMDSMI_STATUS_API_FAILED";
|
||||
<< " | returning AMDSMI_STATUS_NOT_SUPPORTED";
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_API_FAILED;
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
info->pcie_static.max_pcie_width = (uint16_t)pcie_width;
|
||||
|
||||
@@ -4760,19 +4794,23 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
SMIGPUDEVICE_MUTEX(gpu_device->get_mutex())
|
||||
|
||||
std::string render_name = gpu_device->get_gpu_path();
|
||||
int drm_fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
drm_fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
close(drm_fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
auto drm_fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
if (!drm_fd) {
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to open " << path << ": " << strerror(errno)
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false);
|
||||
LOG_ERROR(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm;
|
||||
status = libdrm.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load libdrm.so.2"
|
||||
@@ -4790,9 +4828,7 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
status = libdrm.load_symbol(reinterpret_cast<drmGetVersion_t *>(&drm_get_version),
|
||||
"drmGetVersion");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
drm_get_version = nullptr;
|
||||
libdrm.unload();
|
||||
close(drm_fd);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmGetVersion symbol"
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
@@ -4806,7 +4842,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
drm_free_version = nullptr;
|
||||
libdrm.unload();
|
||||
close(drm_fd);
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmFreeVersion symbol"
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
@@ -4815,7 +4850,7 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
}
|
||||
|
||||
// get drm version. If it's older than 3.62.0, then say not supported and exit.
|
||||
auto drm_version = drm_get_version(drm_fd);
|
||||
auto drm_version = drm_get_version(*drm_fd);
|
||||
// minimum version that supports getting of virtualization mode
|
||||
int major_version = 3;
|
||||
int minor_version = 62;
|
||||
@@ -4840,7 +4875,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
// If not, then return not supported
|
||||
if (isDRMVersionSupported == false) {
|
||||
drm_free_version(drm_version);
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
@@ -4855,11 +4889,10 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
"drmCommandWrite");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
drm_free_version(drm_version);
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | Failed to load drmCommandWrite symbol: " << strerror(errno)
|
||||
<< " | returning AMDSMI_STATUS_DRM_ERROR";
|
||||
<< "; Returning: " << smi_amdgpu_get_status_string(status, false);
|
||||
LOG_ERROR(ss);
|
||||
return status;
|
||||
}
|
||||
@@ -4871,10 +4904,10 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
request.return_pointer = reinterpret_cast<unsigned long long>(&dev_info);
|
||||
request.return_size = sizeof(struct drm_amdgpu_info_device);
|
||||
request.query = AMDGPU_INFO_DEV_INFO;
|
||||
auto drm_write = drmCommandWrite(drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
auto drm_write = drmCommandWrite(*drm_fd, DRM_AMDGPU_INFO, &request,
|
||||
sizeof(struct drm_amdgpu_info));
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | drm_fd: " << std::dec << drm_fd << "\n"
|
||||
<< " | drm_fd: " << std::dec << *drm_fd << "\n"
|
||||
<< " | path: " << path << "\n"
|
||||
<< " | drmCommandWrite: " << drm_write << "\n"
|
||||
<< " | drmCommandWrite returned: " << strerror(errno) << "\n"
|
||||
@@ -4917,7 +4950,6 @@ amdsmi_get_gpu_virtualization_mode(amdsmi_processor_handle processor_handle,
|
||||
status = AMDSMI_STATUS_DRM_ERROR;
|
||||
}
|
||||
drm_free_version(drm_version);
|
||||
close(drm_fd);
|
||||
libdrm.unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <string.h>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "amd_smi/impl/amd_smi_drm.h"
|
||||
#include "amd_smi/impl/amd_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
@@ -59,8 +60,6 @@ std::string AMDSmiDrm::find_file_in_folder(const std::string& folder,
|
||||
|
||||
amdsmi_status_t AMDSmiDrm::init() {
|
||||
std::ostringstream ss;
|
||||
int fd = -1;
|
||||
|
||||
|
||||
amdsmi_status_t status = lib_loader_.load("libdrm.so.2");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
@@ -131,27 +130,18 @@ amdsmi_status_t AMDSmiDrm::init() {
|
||||
|
||||
// looking for /sys/class/drm/card0/../renderD*
|
||||
std::string render_name = find_file_in_folder(renderD_folder, regex);
|
||||
fd = -1;
|
||||
std::string name = "/dev/dri/" + render_name;
|
||||
if (render_name != "") {
|
||||
fd = open(name.c_str(), O_RDWR | O_CLOEXEC);
|
||||
}
|
||||
auto fd = amdsmi_RAII_FD_handler(name.c_str(), O_RDWR | O_CLOEXEC);
|
||||
|
||||
amdsmi_bdf_t bdf;
|
||||
if (fd >= 0) {
|
||||
auto version = drm_get_version(fd);
|
||||
if (strcmp("amdgpu", version->name)) { // only amdgpu
|
||||
close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
if (fd >= 0 && drm_get_device(fd, &device) != 0) {
|
||||
if (*fd >= 0) {
|
||||
auto version = drm_get_version(*fd);
|
||||
if (*fd >= 0 && drm_get_device(*fd, &device) != 0) {
|
||||
drm_free_device(&device);
|
||||
close(fd);
|
||||
fd = -1;
|
||||
}
|
||||
ss << __PRETTY_FUNCTION__ << " | "
|
||||
<< " render file name: " << name << "\n"
|
||||
<< "; fd: " << std::dec << fd << "\n"
|
||||
<< "; fd: " << std::dec << *fd << "\n"
|
||||
<< "; drm version->name: " << version->name << "\n"
|
||||
<< "; drm version->date: " << version->date << "\n"
|
||||
<< "; drm version_major.version_minor.version_patchlevel: "
|
||||
@@ -174,11 +164,12 @@ amdsmi_status_t AMDSmiDrm::init() {
|
||||
drm_free_version(version);
|
||||
}
|
||||
|
||||
drm_fds_.push_back(fd);
|
||||
drm_fds_.push_back(*fd);
|
||||
drm_paths_.push_back(render_name);
|
||||
// even if fail, still add to prevent mismatch the index
|
||||
if (fd < 0) {
|
||||
if (*fd < 0) {
|
||||
drm_bdfs_.push_back(bdf);
|
||||
drm_free_device(&device);
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -215,7 +206,6 @@ amdsmi_status_t AMDSmiDrm::init() {
|
||||
|
||||
drm_bdfs_.push_back(bdf);
|
||||
drm_free_device(&device);
|
||||
close(fd);
|
||||
}
|
||||
|
||||
// cannot find any valid fds.
|
||||
@@ -223,7 +213,6 @@ amdsmi_status_t AMDSmiDrm::init() {
|
||||
drm_bdfs_.clear();
|
||||
return AMDSMI_STATUS_INIT_ERROR;
|
||||
}
|
||||
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
|
||||
@@ -404,6 +404,7 @@ amdsmi_status_t AMDSmiSystem::cleanup() {
|
||||
// we do not need to delete the sockets/processors, clear takes care of this
|
||||
if (!processors_.empty()) {processors_.clear();}
|
||||
if (!sockets_.empty()) {sockets_.clear();}
|
||||
drm_.cleanup();
|
||||
init_flag_ &= ~AMDSMI_INIT_AMD_GPUS;
|
||||
rsmi_status_t ret = rsmi_shut_down();
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
|
||||
@@ -583,31 +583,40 @@ amdsmi_status_t smi_amdgpu_get_pcie_speed_from_pcie_type(uint16_t pcie_type, uin
|
||||
|
||||
amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice* device,
|
||||
char *market_name) {
|
||||
SMIGPUDEVICE_MUTEX(device->get_mutex())
|
||||
if (market_name == nullptr || device == nullptr) {
|
||||
return AMDSMI_STATUS_ARG_PTR_NULL;
|
||||
}
|
||||
// initialize the market_name to empty string
|
||||
std::string empty = "";
|
||||
std::strncpy(market_name, empty.c_str(), AMDSMI_MAX_STRING_LENGTH - 1);
|
||||
|
||||
std::ostringstream ss;
|
||||
std::string render_name = device->get_gpu_path();
|
||||
int fd = -1;
|
||||
std::string path = "/dev/dri/" + render_name;
|
||||
|
||||
if (render_name != "") {
|
||||
fd = open(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
} else {
|
||||
market_name[0] = '\0';
|
||||
close(fd);
|
||||
if (render_name.empty()) {
|
||||
return AMDSMI_STATUS_NOT_SUPPORTED;
|
||||
}
|
||||
|
||||
auto fd = amdsmi_RAII_FD_handler(path.c_str(), O_RDWR | O_CLOEXEC);
|
||||
ss << __PRETTY_FUNCTION__ << " | Render Name: "
|
||||
<< render_name << "; path: " << path << "; fd: " << fd;
|
||||
<< render_name << "; path: " << path << "; fd: "
|
||||
<< (fd == nullptr ? "nullptr" : std::to_string(*fd)) << "\n";
|
||||
LOG_DEBUG(ss);
|
||||
if (!fd) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Render Name: "
|
||||
<< render_name << "; path: " << path << "; fd: "
|
||||
<< (fd == nullptr ? "nullptr" : std::to_string(*fd)) << "\n"
|
||||
<< "; Returning: "
|
||||
<< smi_amdgpu_get_status_string(AMDSMI_STATUS_FILE_ERROR, false) << "\n";
|
||||
LOG_INFO(ss);
|
||||
return AMDSMI_STATUS_FILE_ERROR;
|
||||
}
|
||||
|
||||
amd::smi::AMDSmiLibraryLoader libdrm_amdgpu_;
|
||||
amdsmi_status_t status = libdrm_amdgpu_.load("libdrm_amdgpu.so");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
libdrm_amdgpu_.unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
@@ -621,60 +630,46 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice
|
||||
amdgpu_device_deinitialize_t amdgpu_device_deinitialize = nullptr;
|
||||
amdgpu_get_marketing_name_t amdgpu_get_marketing_name = nullptr;
|
||||
|
||||
status = libdrm_amdgpu_.load_symbol(
|
||||
reinterpret_cast<amdgpu_device_initialize_t *>(&amdgpu_device_initialize),
|
||||
"amdgpu_device_initialize");
|
||||
status = libdrm_amdgpu_.load_symbol(reinterpret_cast<void**>(&amdgpu_device_deinitialize),
|
||||
"amdgpu_device_deinitialize");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
libdrm_amdgpu_.unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
status = libdrm_amdgpu_.load_symbol(reinterpret_cast<void**>(&amdgpu_device_initialize),
|
||||
"amdgpu_device_initialize");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
libdrm_amdgpu_.unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
amdgpu_device_handle device_handle = nullptr;
|
||||
uint32_t major_version, minor_version;
|
||||
int ret = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle);
|
||||
int ret = amdgpu_device_initialize(*fd, &major_version, &minor_version, &device_handle);
|
||||
if (ret != 0) {
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
amdgpu_device_deinitialize(device_handle);
|
||||
libdrm_amdgpu_.unload();
|
||||
return AMDSMI_STATUS_DRM_ERROR;
|
||||
}
|
||||
|
||||
status = libdrm_amdgpu_.load_symbol(
|
||||
reinterpret_cast<amdgpu_get_marketing_name_t *>(
|
||||
&amdgpu_get_marketing_name), "amdgpu_get_marketing_name");
|
||||
status = libdrm_amdgpu_.load_symbol(reinterpret_cast<void**>(&amdgpu_get_marketing_name),
|
||||
"amdgpu_get_marketing_name");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
amdgpu_device_deinitialize(device_handle);
|
||||
libdrm_amdgpu_.unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
status = libdrm_amdgpu_.load_symbol(reinterpret_cast<amdgpu_device_deinitialize_t *>(
|
||||
&amdgpu_device_deinitialize), "amdgpu_device_deinitialize");
|
||||
if (status != AMDSMI_STATUS_SUCCESS) {
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
return status;
|
||||
}
|
||||
|
||||
ret = amdgpu_device_initialize(fd, &major_version, &minor_version, &device_handle);
|
||||
if (ret != 0) {
|
||||
std::string empty = "";
|
||||
std::strncpy(market_name, empty.c_str(), AMDSMI_MAX_STRING_LENGTH - 1);
|
||||
amdgpu_device_deinitialize(device_handle);
|
||||
close(fd);
|
||||
return AMDSMI_STATUS_DRM_ERROR;
|
||||
}
|
||||
|
||||
// Get the marketing name using libdrm's API
|
||||
const char *name = amdgpu_get_marketing_name(device_handle);
|
||||
if (name != nullptr) {
|
||||
std::strncpy(market_name, name, AMDSMI_MAX_STRING_LENGTH - 1);
|
||||
market_name[AMDSMI_MAX_STRING_LENGTH - 1] = '\0';
|
||||
amdgpu_device_deinitialize(device_handle);
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
libdrm_amdgpu_.unload();
|
||||
ss << __PRETTY_FUNCTION__ << " | path: " << path << "\n"
|
||||
<< " | fd: "<< std::dec << fd << "\n"
|
||||
<< " | fd: "<< std::dec << *fd << "\n"
|
||||
<< " | Marketing Name: " << market_name << "\n"
|
||||
<< " | Returning: "
|
||||
<< smi_amdgpu_get_status_string(AMDSMI_STATUS_SUCCESS, false) << "\n";
|
||||
@@ -683,10 +678,9 @@ amdsmi_status_t smi_amdgpu_get_market_name_from_dev_id(amd::smi::AMDSmiGPUDevice
|
||||
}
|
||||
|
||||
amdgpu_device_deinitialize(device_handle);
|
||||
close(fd);
|
||||
libdrm_amdgpu_.AMDSmiLibraryLoader::unload();
|
||||
libdrm_amdgpu_.unload();
|
||||
ss << __PRETTY_FUNCTION__ << " | path: " << path << "\n"
|
||||
<< " | fd: "<< std::dec << fd << "\n"
|
||||
<< " | fd: "<< std::dec << *fd << "\n"
|
||||
<< " | Marketing Name: " << market_name << "\n"
|
||||
<< " | Returning: "
|
||||
<< smi_amdgpu_get_status_string(AMDSMI_STATUS_DRM_ERROR, false) << "\n";
|
||||
@@ -804,7 +798,6 @@ amdsmi_status_t smi_amdgpu_get_device_index(amdsmi_processor_handle processor_ha
|
||||
<< "Returning device_index: " << *device_index << "\nSocket #: " << i
|
||||
<< "; Device #: " << j << "; current_device_index #: " << current_device_index
|
||||
<< "\n";
|
||||
// std::cout << ss.str();
|
||||
LOG_DEBUG(ss);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -913,8 +906,6 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
|
||||
LOG_DEBUG(ss);
|
||||
|
||||
for (uint32_t j = 0; j < device_count; j++) {
|
||||
// std::cout << "current_device_index: " << current_device_index
|
||||
// << " device_index: " << device_index << std::endl;
|
||||
if (current_device_index == device_index) {
|
||||
*processor_handle = processor_handles[j];
|
||||
ss << __PRETTY_FUNCTION__ << " | AMDSMI_STATUS_SUCCESS"
|
||||
@@ -924,7 +915,6 @@ amdsmi_status_t smi_amdgpu_get_processor_handle_by_index(
|
||||
<< "; processor_handle: " << *processor_handle
|
||||
<< "; processor_handles[j]: " << processor_handles[j]
|
||||
<< "\n";
|
||||
// std::cout << ss.str();
|
||||
LOG_DEBUG(ss);
|
||||
return AMDSMI_STATUS_SUCCESS;
|
||||
}
|
||||
@@ -959,3 +949,58 @@ void amdsmi_wait_for_user_input(void) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::shared_ptr<int> amdsmi_RAII_FD_handler(const std::string& path, int flags) {
|
||||
static std::mutex fd_mutex;
|
||||
static std::map<std::string, std::weak_ptr<int>> open_files;
|
||||
static std::ostringstream ss;
|
||||
|
||||
std::lock_guard<std::mutex> lock(fd_mutex);
|
||||
|
||||
// Clean up expired entries from the cache
|
||||
for (auto it = open_files.begin(); it != open_files.end();) {
|
||||
if (it->second.expired()) {
|
||||
it = open_files.erase(it);
|
||||
} else {
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
// Try to reuse an existing open FD
|
||||
auto it = open_files.find(path);
|
||||
if (it != open_files.end()) {
|
||||
if (auto existing_fd = it->second.lock()) {
|
||||
ss <<__PRETTY_FUNCTION__ << " | Reusing FD for path: " << path;
|
||||
LOG_INFO(ss);
|
||||
return existing_fd;
|
||||
}
|
||||
}
|
||||
|
||||
// Open a new file descriptor
|
||||
int fd = open(path.c_str(), flags);
|
||||
if (fd < 0) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Failed to open file: " << path
|
||||
<< " | Error: " << strerror(errno);
|
||||
LOG_INFO(ss);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
ss << __PRETTY_FUNCTION__ << " | Opened FD: " << std::to_string(fd)
|
||||
<< " for path: " << path;
|
||||
LOG_INFO(ss);
|
||||
|
||||
// Create a shared_ptr with a custom deleter to close the FD
|
||||
auto fd_ptr = std::shared_ptr<int>(new int(fd), [path](int* fd) {
|
||||
if (fd && *fd >= 0) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Closing FD: " << std::to_string(*fd)
|
||||
<< " | Path: " << path << std::endl;
|
||||
LOG_INFO(ss);
|
||||
close(*fd);
|
||||
delete fd;
|
||||
}
|
||||
});
|
||||
|
||||
// Store weak_ptr in cache for reuse
|
||||
open_files[path] = fd_ptr;
|
||||
return fd_ptr;
|
||||
}
|
||||
|
||||
+28
-15
@@ -273,6 +273,7 @@ static void checkPartitionIdChanges(amdsmi_processor_handle* const processor_han
|
||||
"\"sudo rmmod amdgpu && sudo rmmod ast && sudo modprobe amdgpu\")."
|
||||
"\n\tCPX may not enumerate properly.\n";
|
||||
}
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
break;
|
||||
}
|
||||
amdsmi_kfd_info_t kfd_info;
|
||||
@@ -432,7 +433,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< computePartitionString(updatePartition)
|
||||
<< " ===============" << std::endl;
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
|
||||
auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition);
|
||||
IF_VERB(STANDARD) {
|
||||
@@ -463,6 +464,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< "\n\t Device might be in a static partition mode. "
|
||||
<< "With inability to change partition modes."
|
||||
<< std::endl;
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -491,7 +493,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
static_cast<amdsmi_compute_partition_type_t>(
|
||||
mapStringToSMIComputePartitionTypes.at(
|
||||
std::string(orig_char_computePartition)));
|
||||
// waitForUserInput(); // watch for any errors on going back to original partition
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition
|
||||
auto ret_set = amdsmi_set_gpu_compute_partition(processor_handles_[dv_ind], updatePartition);
|
||||
EXPECT_TRUE(ret_set == AMDSMI_STATUS_SETTING_UNAVAILABLE
|
||||
|| ret_set== AMDSMI_STATUS_NO_PERM
|
||||
@@ -510,6 +512,8 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
|
||||
// TEST 2: Set/Get Compute Partition (new functionality)
|
||||
initial_num_devices = num_monitor_devs();
|
||||
amdsmi_accelerator_partition_type_t primary_partition_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
|
||||
uint32_t primary_index = 0;
|
||||
for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) {
|
||||
if (dv_ind != 0) {
|
||||
std::cout << "\n";
|
||||
@@ -518,11 +522,11 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::cout << "\n";
|
||||
std::cout << "\t**======================================================================\n";
|
||||
std::cout << "\t**Test #2: Get/Set Compute Partition (new functionality) ===============\n";
|
||||
std::cout << "\t**DEVICE: #" << std::setw(2) << std::setfill('0') << dv_ind
|
||||
std::cout << "\t**DEVICE: #" << std::dec << std::setw(2) << std::setfill('0') << dv_ind
|
||||
<< " ==========================================================\n";
|
||||
std::cout << "\t**======================================================================\n";
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
PrintDeviceHeader(processor_handles_[dv_ind]);
|
||||
amdsmi_accelerator_partition_profile_t profile = {};
|
||||
uint32_t partition_id[8] = {0, 0, 0, 0, 0, 0, 0, 0};
|
||||
@@ -563,6 +567,12 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
profile_type_str = "CPX";
|
||||
}
|
||||
|
||||
// save the primary partition type
|
||||
if (profile.profile_type != AMDSMI_ACCELERATOR_PARTITION_INVALID) {
|
||||
primary_partition_type = profile.profile_type;
|
||||
primary_index = dv_ind;
|
||||
}
|
||||
|
||||
std::string partition_id_str = "";
|
||||
for (int i = 0; i < 8; i++) {
|
||||
partition_id_str += std::to_string(partition_id[i]);
|
||||
@@ -570,7 +580,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
partition_id_str += ", ";
|
||||
}
|
||||
|
||||
switch (profile.profile_type) {
|
||||
switch (primary_partition_type) {
|
||||
case AMDSMI_ACCELERATOR_PARTITION_SPX:
|
||||
EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS);
|
||||
break;
|
||||
@@ -586,7 +596,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
case AMDSMI_ACCELERATOR_PARTITION_CPX: {
|
||||
uint16_t num_xcd;
|
||||
uint32_t max_xcps = 0;
|
||||
ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd);
|
||||
ret = amdsmi_get_gpu_xcd_counter(processor_handles_[primary_index], &num_xcd);
|
||||
if (ret == AMDSMI_STATUS_SUCCESS) {
|
||||
max_xcps = static_cast<uint32_t>(num_xcd);
|
||||
}
|
||||
@@ -640,7 +650,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
AcceleratorProfileConfig original_profile_config = {};
|
||||
original_profile_config
|
||||
= getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose);
|
||||
// waitForUserInput(); // watch for any errors
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**=========================================================\n";
|
||||
@@ -762,7 +772,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< profile_config.profiles[config].profile_index << ")"
|
||||
<< " ===============" << std::endl;
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
|
||||
auto ret_set = amdsmi_set_gpu_accelerator_partition_profile(
|
||||
processor_handles_[dv_ind],
|
||||
@@ -789,6 +799,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< "\n\t Device might be in a static partition mode. "
|
||||
<< "With inability to change partition modes."
|
||||
<< std::endl;
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
break;
|
||||
}
|
||||
if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
@@ -872,6 +883,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile: "
|
||||
<< "Not supported on this machine, skipping remaining tests." << std::endl;
|
||||
}
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -911,7 +923,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n";
|
||||
std::cout << "\t**======================================================================\n";
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors on going back to original partition
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition
|
||||
|
||||
// ---------------------------------------------------------//
|
||||
// TEST 3: Check fluctuating # of devices & partition IDs //
|
||||
@@ -925,12 +937,12 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::cout << "\n";
|
||||
std::cout << "\t**======================================================================\n";
|
||||
std::cout << "\t**Test #3: Check fluctuating # of devices & partition IDs ==============\n";
|
||||
std::cout << "\t**DEVICE: #" << std::setw(2) << std::setfill('0') << dv_ind
|
||||
std::cout << "\t**DEVICE: #" << std::dec << std::setw(2) << std::setfill('0') << dv_ind
|
||||
<< " ========================================================\n";
|
||||
std::cout << "\t**======================================================================\n";
|
||||
}
|
||||
// Leaving for debug purposes
|
||||
// waitForUserInput(); // watch for any errors on going back to original partition
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition
|
||||
uint32_t device_index = 0;
|
||||
amdsmi_processor_handle p_handle = {};
|
||||
uint32_t current_num_devices = 0;
|
||||
@@ -1013,6 +1025,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
<< "\n\t Device might be in a static partition mode. "
|
||||
<< "With inability to change partition modes."
|
||||
<< std::endl;
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
break;
|
||||
}
|
||||
|
||||
@@ -1039,7 +1052,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
EXPECT_NE(updatePartition, mapStringToSMIComputePartitionTypes.at(
|
||||
std::string(current_char_computePartition)));
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors on going back to original partition
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition
|
||||
}
|
||||
|
||||
uint32_t device_index3 = 0;
|
||||
@@ -1055,7 +1068,7 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
std::cout << "\t**ABOUT TO GO BACK TO ORIGINAL PARTITION ("
|
||||
<< orig_char_computePartition << ")\n";
|
||||
}
|
||||
// waitForUserInput(); // watch for any errors on going back to original partition
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors on going back to original partition
|
||||
auto ret_set = amdsmi_set_gpu_compute_partition(p_handle3, updatePartition);
|
||||
checkPartitionIdChanges(processor_handles_, dv_ind, std::string(orig_char_computePartition),
|
||||
isVerbose, true);
|
||||
@@ -1076,8 +1089,8 @@ void TestComputePartitionReadWrite::Run(void) {
|
||||
}
|
||||
}
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Get/Set Test #3 (dev_ind: "
|
||||
<< dv_ind << "): Check fluctuating # of devices & partition IDs ===============\n";
|
||||
std::cout << "\t**Get/Set Test #3 (dev_ind: " << std::dec
|
||||
<< dv_ind << "): Check fluctuating # of devices & partition IDs ===============\n";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -120,16 +120,17 @@ void TestErrCntRead::Run(void) {
|
||||
|
||||
err = amdsmi_get_gpu_ecc_count(processor_handles_[i], static_cast<amdsmi_gpu_block_t>(b), &ec);
|
||||
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED || err == AMDSMI_STATUS_FILE_ERROR) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Error Count for " <<
|
||||
GetBlockNameStr(static_cast<amdsmi_gpu_block_t>(b)) <<
|
||||
": Not supported for this device" << std::endl;
|
||||
": Not supported for this device or error accessing file" << std::endl;
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = amdsmi_get_gpu_ecc_count(processor_handles_[i], static_cast<amdsmi_gpu_block_t>(b),
|
||||
nullptr);
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
ASSERT_TRUE(err == AMDSMI_STATUS_NOT_SUPPORTED
|
||||
|| err == AMDSMI_STATUS_FILE_ERROR);
|
||||
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
|
||||
@@ -31,6 +31,7 @@
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "frequencies_read_write.h"
|
||||
#include "../test_common.h"
|
||||
|
||||
@@ -71,6 +72,19 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
amdsmi_frequencies_t f;
|
||||
uint32_t freq_bitmask;
|
||||
amdsmi_clk_type_t amdsmi_clk;
|
||||
const std::map<amdsmi_clk_type_t, std::string> clk_type_map = {
|
||||
{AMDSMI_CLK_TYPE_SYS, "SYS"},
|
||||
{AMDSMI_CLK_TYPE_GFX, "GFX"},
|
||||
{AMDSMI_CLK_TYPE_DF, "DF"},
|
||||
{AMDSMI_CLK_TYPE_DCEF, "DCEF"},
|
||||
{AMDSMI_CLK_TYPE_SOC, "SOC"},
|
||||
{AMDSMI_CLK_TYPE_MEM, "MEM"},
|
||||
{AMDSMI_CLK_TYPE_PCIE, "PCIE"},
|
||||
{AMDSMI_CLK_TYPE_VCLK0, "VCLK0"},
|
||||
{AMDSMI_CLK_TYPE_VCLK1, "VCLK1"},
|
||||
{AMDSMI_CLK_TYPE_DCLK0, "DCLK0"},
|
||||
{AMDSMI_CLK_TYPE_DCLK1, "DCLK1"},
|
||||
};
|
||||
|
||||
TestBase::Run();
|
||||
if (setup_failed_) {
|
||||
@@ -86,11 +100,18 @@ void TestFrequenciesReadWrite::Run(void) {
|
||||
|
||||
auto freq_read = [&]() -> bool {
|
||||
// Skip AMDSMI_CLK_TYPE_PCIE, which is not supported in rocm-smi.
|
||||
std::cout << amdsmi_clk << std::endl;
|
||||
if (amdsmi_clk == AMDSMI_CLK_TYPE_PCIE)
|
||||
return false;
|
||||
if (auto it = clk_type_map.find(amdsmi_clk); it != clk_type_map.end()) {
|
||||
if (amdsmi_clk == AMDSMI_CLK_TYPE_PCIE) {
|
||||
return false; // Quietly skip PCIE clock
|
||||
// Cannot read/write to PCIE clock in driver
|
||||
}
|
||||
std::cout << "amdsmi_get_clk_freq(" << it->second << ", f)";
|
||||
}
|
||||
|
||||
ret = amdsmi_get_clk_freq(processor_handles_[dv_ind], amdsmi_clk, &f);
|
||||
std::cout << ret << std::endl;
|
||||
if (auto it = clk_type_map.find(amdsmi_clk); it != clk_type_map.end()) {
|
||||
std::cout << ": " << smi_amdgpu_get_status_string(ret, false) << std::endl;
|
||||
}
|
||||
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED ||
|
||||
ret == AMDSMI_STATUS_NOT_YET_IMPLEMENTED) {
|
||||
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "gpu_busy_read.h"
|
||||
#include "../test_common.h"
|
||||
@@ -63,9 +63,36 @@ void TestGPUBusyRead::Close() {
|
||||
|
||||
|
||||
void TestGPUBusyRead::Run(void) {
|
||||
amdsmi_status_t err;
|
||||
uint32_t val_ui32;
|
||||
|
||||
TestBase::Run();
|
||||
if (setup_failed_) {
|
||||
std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
for (uint32_t x = 0; x < num_iterations(); ++x) {
|
||||
for (uint32_t i = 0; i < num_monitor_devs(); ++i) {
|
||||
PrintDeviceHeader(processor_handles_[i]);
|
||||
|
||||
err = amdsmi_get_gpu_busy_percent(processor_handles_[i], &val_ui32);
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
if (err == AMDSMI_STATUS_FILE_ERROR || err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**GPU Busy Percent: Not supported on this machine"
|
||||
<< std::endl;
|
||||
}
|
||||
ASSERT_TRUE(err == AMDSMI_STATUS_FILE_ERROR || err == AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
}
|
||||
} else {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**GPU Busy Percent (Percent Idle):" << std::dec <<
|
||||
val_ui32 << " (" << 100 - val_ui32 << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ void TestGpuMetricsRead::Run(void) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" <<
|
||||
"Not supported on this machine" << std::endl;
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -22,15 +22,16 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "mem_util_read.h"
|
||||
#include "../test_common.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
|
||||
TestMemUtilRead::TestMemUtilRead() : TestBase() {
|
||||
set_title("Memory Utilization Read Test");
|
||||
@@ -81,12 +82,14 @@ void TestMemUtilRead::Run(void) {
|
||||
}
|
||||
|
||||
auto err_chk = [&](const char *str) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t** " << str << std::endl;
|
||||
}
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
if (err == AMDSMI_STATUS_FILE_ERROR) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t** " << str << ": Not supported on this machine"
|
||||
<< std::endl;
|
||||
}
|
||||
if (err == AMDSMI_STATUS_FILE_ERROR ||
|
||||
err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
ASSERT_TRUE(err == AMDSMI_STATUS_NOT_SUPPORTED
|
||||
|| err == AMDSMI_STATUS_FILE_ERROR);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
}
|
||||
@@ -101,23 +104,32 @@ void TestMemUtilRead::Run(void) {
|
||||
mem_type <= AMDSMI_MEM_TYPE_LAST; ++mem_type) {
|
||||
err = amdsmi_get_gpu_memory_total(processor_handles_[i],
|
||||
static_cast<amdsmi_memory_type_t>(mem_type), &total);
|
||||
err_chk("amdsmi_get_gpu_memory_total()");
|
||||
smi_amdgpu_get_status_string(err, false);
|
||||
std::string mem_type_str =
|
||||
kDevMemoryTypeNameMap.at(static_cast<amdsmi_memory_type_t>(mem_type));
|
||||
std::string input_str =
|
||||
"amdsmi_get_gpu_memory_total(" + mem_type_str + "): "
|
||||
+ smi_amdgpu_get_status_string(err, false);
|
||||
err_chk(input_str.c_str());
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_memory_usage(processor_handles_[i],
|
||||
static_cast<amdsmi_memory_type_t>(mem_type), &usage);
|
||||
err_chk("amdsmi_get_gpu_memory_usage()");
|
||||
input_str =
|
||||
"amdsmi_get_gpu_memory_usage(" + mem_type_str + "): "
|
||||
+ smi_amdgpu_get_status_string(err, false);
|
||||
err_chk(input_str.c_str());
|
||||
if (err != AMDSMI_STATUS_SUCCESS) {
|
||||
return;
|
||||
continue;
|
||||
}
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" <<
|
||||
kDevMemoryTypeNameMap.at(static_cast<amdsmi_memory_type_t>(mem_type))
|
||||
<< " Calculated Utilization: " <<
|
||||
(static_cast<float>(usage)*100)/static_cast<float>(total) << "% ("<< usage <<
|
||||
(static_cast<float>(usage)*100)/static_cast<float>(total) << "% (" << usage <<
|
||||
"/" << total << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,6 +124,8 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
std::cout << "\t**=========================================================\n";
|
||||
}
|
||||
auto initial_num_devices = num_monitor_devs();
|
||||
amdsmi_accelerator_partition_type_t primary_partition_type = AMDSMI_ACCELERATOR_PARTITION_INVALID;
|
||||
uint32_t primary_index = 0;
|
||||
for (uint32_t dv_ind = 0; dv_ind < initial_num_devices; ++dv_ind) {
|
||||
if (dv_ind != 0) {
|
||||
std::cout << "\n";
|
||||
@@ -168,6 +170,12 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
profile_type_str = "CPX";
|
||||
}
|
||||
|
||||
// save the primary partition type
|
||||
if (profile.profile_type != AMDSMI_ACCELERATOR_PARTITION_INVALID) {
|
||||
primary_partition_type = profile.profile_type;
|
||||
primary_index = dv_ind;
|
||||
}
|
||||
|
||||
std::string partition_id_str = "";
|
||||
for (int i = 0; i < 8; i++) {
|
||||
partition_id_str += std::to_string(partition_id[i]);
|
||||
@@ -175,7 +183,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
partition_id_str += ", ";
|
||||
}
|
||||
|
||||
switch (profile.profile_type) {
|
||||
switch (primary_partition_type) {
|
||||
case AMDSMI_ACCELERATOR_PARTITION_SPX:
|
||||
EXPECT_LT(partition_id[i], MAX_SPX_PARTITIONS);
|
||||
break;
|
||||
@@ -191,7 +199,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
case AMDSMI_ACCELERATOR_PARTITION_CPX: {
|
||||
uint16_t num_xcd;
|
||||
uint32_t max_xcps = 0;
|
||||
ret = amdsmi_get_gpu_xcd_counter(processor_handles_[dv_ind], &num_xcd);
|
||||
ret = amdsmi_get_gpu_xcd_counter(processor_handles_[primary_index], &num_xcd);
|
||||
if (ret == AMDSMI_STATUS_SUCCESS) {
|
||||
max_xcps = static_cast<uint32_t>(num_xcd);
|
||||
}
|
||||
@@ -245,7 +253,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
AcceleratorProfileConfig original_profile_config =
|
||||
getAvailableProfileConfigs(dv_ind, profile, profile_config, isVerbose);
|
||||
orig_dev_config[dv_ind] = original_profile_config;
|
||||
// waitForUserInput(); // watch for any errors
|
||||
// amdsmi_wait_for_user_input(); // watch for any errors
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**=========================================================\n";
|
||||
@@ -321,7 +329,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
|| ret == AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config: "
|
||||
std::cout << "\t**" << "amdsmi_get_gpu_accelerator_partition_profile_config(): "
|
||||
<< "Not supported on this machine" << std::endl;
|
||||
}
|
||||
continue;
|
||||
@@ -329,6 +337,11 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
}
|
||||
|
||||
// Run memory partition tests
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**=========================================================\n";
|
||||
std::cout << "\t**Test: Memory Partition Sets =============================\n";
|
||||
std::cout << "\t**=========================================================\n";
|
||||
}
|
||||
uint32_t current_num_devices = 0;
|
||||
smi_amdgpu_get_device_count(¤t_num_devices);
|
||||
|
||||
@@ -352,7 +365,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
processor_handles_[dv_ind], orig_memory_partition, k255Len);
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << ": "
|
||||
std::cout << "\t**" << "amdsmi_get_gpu_memory_partition(): "
|
||||
<< "Not supported on this machine" << std::endl;
|
||||
}
|
||||
continue;
|
||||
@@ -563,7 +576,7 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
}
|
||||
if (ret_set == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << ": "
|
||||
std::cout << "\t**" << "amdsmi_set_gpu_memory_partition_mode(): "
|
||||
<< "Not supported on this machine" << std::endl;
|
||||
}
|
||||
break;
|
||||
@@ -618,7 +631,8 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
|
||||
ret = amdsmi_get_gpu_memory_partition_config(processor_handles_[dv_ind],
|
||||
¤t_memory_config);
|
||||
CHK_ERR_ASRT(ret)
|
||||
ASSERT_TRUE((ret == AMDSMI_STATUS_NOT_SUPPORTED) ||
|
||||
(ret == AMDSMI_STATUS_SUCCESS));
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**"
|
||||
<< "amdsmi_get_gpu_memory_partition_config(processor_handles_[" << dv_ind
|
||||
@@ -629,6 +643,13 @@ void TestMemoryPartitionReadWrite::Run(void) {
|
||||
<< memoryPartitionString(current_memory_config.mp_mode)
|
||||
<< std::endl;
|
||||
}
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**" << "amdsmi_get_gpu_memory_partition_config(): "
|
||||
<< "Not supported on this machine... trying on other devices" << std::endl;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
new_memory_partition
|
||||
= mapStringToRSMIMemoryPartitionTypes.at(orig_memory_partition);
|
||||
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "perf_level_read.h"
|
||||
#include "../test_common.h"
|
||||
@@ -76,10 +76,15 @@ void TestPerfLevelRead::Run(void) {
|
||||
PrintDeviceHeader(processor_handles_[i]);
|
||||
|
||||
err = amdsmi_get_gpu_perf_level(processor_handles_[i], &pfl);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Performance Level:" << std::dec << (uint32_t)pfl <<
|
||||
std::endl;
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Performance Level: Not Supported" << std::endl;
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Performance Level:" << std::dec << (uint32_t)pfl
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
// Verify api support checking functionality is working
|
||||
err = amdsmi_get_gpu_perf_level(processor_handles_[i], nullptr);
|
||||
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "perf_level_read_write.h"
|
||||
#include "../test_common.h"
|
||||
@@ -79,11 +79,17 @@ void TestPerfLevelReadWrite::Run(void) {
|
||||
PrintDeviceHeader(processor_handles_[dv_ind]);
|
||||
|
||||
ret = amdsmi_get_gpu_perf_level(processor_handles_[dv_ind], &orig_pfl);
|
||||
CHK_ERR_ASRT(ret)
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**amdsmi_get_gpu_perf_level(): Not supported on this machine" << std::endl;
|
||||
}
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
continue;
|
||||
}
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Original Perf Level:" <<
|
||||
GetPerfLevelStr(orig_pfl) << std::endl;
|
||||
std::cout << "\t**Original Perf Level:"
|
||||
<< GetPerfLevelStr(orig_pfl) << std::endl;
|
||||
}
|
||||
|
||||
uint32_t pfl_i = static_cast<uint32_t>(AMDSMI_DEV_PERF_LEVEL_FIRST);
|
||||
|
||||
@@ -22,13 +22,13 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <bitset>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "power_cap_read_write.h"
|
||||
#include "../test_common.h"
|
||||
@@ -83,18 +83,18 @@ void TestPowerCapReadWrite::SetCheckPowerCap(std::string msg, uint32_t dv_ind, u
|
||||
start = clock();
|
||||
ret = amdsmi_set_power_cap(processor_handles_[dv_ind], 0, new_cap);
|
||||
end = clock();
|
||||
cpu_time_used = ((double) (end - start)) * 1000000UL / CLOCKS_PER_SEC;
|
||||
cpu_time_used = (static_cast<double>(end - start)) * 1000000UL / CLOCKS_PER_SEC;
|
||||
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t** Not supported on this machine" << std::endl;
|
||||
std::cout << "\t**amdsmi_set_power_cap(): Not supported on this machine" << std::endl;
|
||||
}
|
||||
return;
|
||||
}
|
||||
ASSERT_EQ(ret, ret_expected);
|
||||
if (ret == AMDSMI_STATUS_INVAL) {
|
||||
new_cap = curr_cap;
|
||||
std::cout << "\t** Expected invalid result" << std::endl;
|
||||
std::cout << "\t**amdsmi_set_power_cap(): Expected invalid result" << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -134,11 +134,16 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
PrintDeviceHeader(processor_handles_[dv_ind]);
|
||||
|
||||
amdsmi_power_cap_info_t info;
|
||||
ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info);
|
||||
CHK_ERR_ASRT(ret)
|
||||
// Verify api support checking functionality is working
|
||||
ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, nullptr);
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_INVAL);
|
||||
|
||||
ret = amdsmi_get_power_cap_info(processor_handles_[dv_ind], 0, &info);
|
||||
if (ret == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**amdsmi_get_power_cap_info(): Not supported on this machine" << std::endl;
|
||||
ASSERT_EQ(ret, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
continue;
|
||||
}
|
||||
min_cap = info.min_power_cap;
|
||||
max_cap = info.max_power_cap;
|
||||
default_cap = info.default_power_cap;
|
||||
@@ -148,15 +153,16 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "[Before Set] Default Power Cap: " << default_cap << " uW" << std::endl;
|
||||
std::cout << "[Before Set] Current Power Cap: " << curr_cap << " uW" << std::endl;
|
||||
std::cout << "[Before Set] Power Cap Range [max to min]: " << max_cap << " uW to " << min_cap <<
|
||||
" uW" << std::endl;
|
||||
std::cout << "[Before Set] Power Cap Range [max to min]: "
|
||||
<< max_cap << " uW to " << min_cap << " uW" << std::endl;
|
||||
std::cout << "[Before Set] Setting new cap to " << new_cap << "..." << std::endl;
|
||||
}
|
||||
|
||||
// Check if power cap is within the range
|
||||
// skip the test otherwise
|
||||
if (new_cap < min_cap || new_cap > max_cap) {
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
continue;
|
||||
}
|
||||
ret = AMDSMI_STATUS_SUCCESS;
|
||||
@@ -166,17 +172,18 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
}
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
|
||||
if (min_cap > 0)
|
||||
{
|
||||
if (min_cap > 0) {
|
||||
new_cap = min_cap;
|
||||
ret = AMDSMI_STATUS_SUCCESS;
|
||||
SetCheckPowerCap("Setting to Min Power Cap", dv_ind, curr_cap, new_cap, ret);
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
|
||||
new_cap = uint64_t(min_cap - 1);
|
||||
@@ -185,7 +192,8 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
if (ret != AMDSMI_STATUS_INVAL) {
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -195,13 +203,13 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
if (ret != AMDSMI_STATUS_INVAL) {
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for "
|
||||
<< dv_ind << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "\tPower cap requested is less than or equal to 0, skipping test for " << dv_ind << std::endl;
|
||||
} else {
|
||||
std::cout << "\tPower cap requested is less than or equal to 0, skipping test for device #"
|
||||
<< dv_ind << std::endl;
|
||||
}
|
||||
|
||||
new_cap = max_cap;
|
||||
@@ -209,7 +217,8 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
SetCheckPowerCap("Setting to Max Power Cap", dv_ind, curr_cap, new_cap, ret);
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
|
||||
new_cap = uint64_t(max_cap + 1);
|
||||
@@ -218,7 +227,8 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
if (ret != AMDSMI_STATUS_INVAL) {
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -228,7 +238,8 @@ void TestPowerCapReadWrite::Run(void) {
|
||||
if (ret != AMDSMI_STATUS_INVAL) {
|
||||
IF_VERB(STANDARD) {
|
||||
if (!new_cap)
|
||||
std::cout << "\t** Power cap requested (" << new_cap << " uW) is failed to set for " << dv_ind << std::endl;
|
||||
std::cout << "\t** Power cap requested (" << new_cap
|
||||
<< " uW) is failed to set for " << dv_ind << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -22,11 +22,11 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "power_read.h"
|
||||
#include "../test_common.h"
|
||||
@@ -77,6 +77,11 @@ void TestPowerRead::Run(void) {
|
||||
|
||||
amdsmi_power_cap_info_t info;
|
||||
err = amdsmi_get_power_cap_info(processor_handles_[i], 0, &info);
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**Power Cap not supported on this device." << std::endl;
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
continue;
|
||||
}
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Current Power Cap: " << info.power_cap << "uW" <<std::endl;
|
||||
@@ -87,7 +92,7 @@ void TestPowerRead::Run(void) {
|
||||
std::cout << "\t**Power Cap Range: " << info.min_power_cap << " to " <<
|
||||
info.max_power_cap << " uW" << std::endl;
|
||||
}
|
||||
// TODO: Add current_socket_power tests
|
||||
// TODO(amdsmi_team): Add current_socket_power tests
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,12 +22,12 @@
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stddef.h>
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <limits>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "sys_info_read.h"
|
||||
#include "../test_common.h"
|
||||
@@ -118,16 +118,22 @@ void TestSysInfoRead::Run(void) {
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
|
||||
|
||||
err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], &val_i32);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32;
|
||||
std::cout << " (" << std::dec << val_i32 << ")" << std::endl;
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
std::cout << "\t**amdsmi_get_gpu_topo_numa_affinity(): Not supported on this machine"
|
||||
<< std::endl;
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32;
|
||||
std::cout << " (" << std::dec << val_i32 << ")" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
// Verify api support checking functionality is working
|
||||
err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], nullptr);
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_INVAL);
|
||||
|
||||
|
||||
// vendor_id, unique_id, target_gfx_version
|
||||
amdsmi_asic_info_t asic_info = {};
|
||||
err = amdsmi_get_gpu_asic_info(processor_handles_[i], &asic_info);
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
@@ -27,7 +28,7 @@
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include <gtest/gtest.h>
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
#include "test_common.h"
|
||||
#include "test_base.h"
|
||||
|
||||
@@ -114,6 +115,7 @@ static void RunGenericTest(TestBase *test) {
|
||||
// RunGenericTest(&<test_obj>);
|
||||
// }
|
||||
// Read-only suite: builds a TestVersionRead object and hands it to
// RunGenericTest().
TEST(amdsmitstReadOnly, TestVersionRead) {
  // Disabled call, kept for reference (presumably a manual pause for
  // debugging -- confirm before re-enabling):
  // amdsmi_wait_for_user_input();
  TestVersionRead version_read_test;
  RunGenericTest(&version_read_test);
}
|
||||
|
||||
@@ -21,8 +21,8 @@
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <cassert>
|
||||
#include <limits>
|
||||
|
||||
#include "amd_smi/amdsmi.h"
|
||||
#include "amd_smi/impl/amd_smi_utils.h"
|
||||
@@ -171,7 +171,6 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
|
||||
amdsmi_status_t err;
|
||||
uint16_t val_ui16;
|
||||
uint32_t val_ui32;
|
||||
amdsmi_asic_info_t info;
|
||||
|
||||
err = smi_amdgpu_get_device_count(&val_ui32);
|
||||
CHK_ERR_ASRT(err)
|
||||
@@ -189,16 +188,16 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
|
||||
std::cout << "\t**Device handle: " << dv_ind << std::endl;
|
||||
}
|
||||
err = amdsmi_get_gpu_id(dv_ind, &val_ui16);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_revision(dv_ind, &val_ui16);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Revision ID: 0x" << std::hex <<
|
||||
val_ui16 << std::endl;
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device ID: N/A" << std::endl;
|
||||
}
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
amdsmi_board_info_t board_info;
|
||||
@@ -206,30 +205,82 @@ void TestBase::PrintDeviceHeader(amdsmi_processor_handle dv_ind) {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device name: " << board_info.product_name << std::endl;
|
||||
|
||||
err = amdsmi_get_gpu_asic_info(dv_ind, &info);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Vendor ID: 0x" << std::hex <<
|
||||
info.vendor_id << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
amdsmi_asic_info_t asic_info;
|
||||
err = amdsmi_get_gpu_asic_info(dv_ind, &asic_info);
|
||||
CHK_ERR_ASRT(err)
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**ASIC info: " << smi_amdgpu_get_status_string(err, false) << std::endl;
|
||||
}
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else if (err == AMDSMI_STATUS_FILE_ERROR) { // File error can happen for partition switches,
|
||||
// if SMI is not re-initialized
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**ASIC info: " << smi_amdgpu_get_status_string(err, false) << std::endl;
|
||||
}
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_FILE_ERROR);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
}
|
||||
|
||||
// Print everything we can get from the ASIC info
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Market name: " << asic_info.market_name << std::endl;
|
||||
std::cout << "\t**ASIC serial: 0x" << std::hex << asic_info.asic_serial << std::endl;
|
||||
std::cout << "\t**Target GFX Version: gfx" << asic_info.target_graphics_version << std::endl;
|
||||
std::cout << "\t**Device ID: 0x" << std::hex << std::setfill('0') << std::setw(4)
|
||||
<< asic_info.device_id << std::endl;
|
||||
if (checkIfMaxValue(asic_info.num_of_compute_units)) {
|
||||
std::cout << "\t**Num of Compute Units: N/A" << std::endl;
|
||||
} else {
|
||||
std::cout << "\t**Num of Compute Units: " << std::dec << asic_info.num_of_compute_units
|
||||
<< std::endl;
|
||||
}
|
||||
if (checkIfMaxValue(asic_info.oam_id)) {
|
||||
std::cout << "\t**OAM ID: N/A" << std::endl;
|
||||
} else {
|
||||
std::cout << "\t**OAM ID: " << std::dec << asic_info.oam_id << std::endl;
|
||||
}
|
||||
std::cout << "\t**Revision ID: 0x" << std::hex << std::setfill('0') << std::setw(2)
|
||||
<< asic_info.rev_id << std::endl;
|
||||
if (checkIfMaxValue(asic_info.subvendor_id)) {
|
||||
std::cout << "\t**Subvendor ID: N/A" << std::endl;
|
||||
} else {
|
||||
std::cout << "\t**Subvendor ID: 0x" << std::hex << std::setfill('0') << std::setw(4)
|
||||
<< asic_info.subvendor_id << std::endl;
|
||||
}
|
||||
std::cout << "\t**Vendor ID: 0x" << std::hex << std::setfill('0') << std::setw(4)
|
||||
<< asic_info.vendor_id << std::endl;
|
||||
std::cout << "\t**Vendor name: " << asic_info.vendor_name
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_revision(dv_ind, &val_ui16);
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Revision ID: N/A" << std::endl;
|
||||
}
|
||||
ASSERT_EQ(err, AMDSMI_STATUS_NOT_SUPPORTED);
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Device Revision ID: 0x" << std::hex << std::setfill('0') << std::setw(2)
|
||||
<< val_ui16 << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
err = amdsmi_get_gpu_subsystem_id(dv_ind, &val_ui16);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem ID: 0x" << std::hex << val_ui16 << std::endl;
|
||||
std::cout << "\t**Subsystem Vendor ID: 0x" << std::hex
|
||||
<< info.subvendor_id << std::endl;
|
||||
if (err == AMDSMI_STATUS_NOT_SUPPORTED) {
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem ID: N/A" << std::endl;
|
||||
}
|
||||
} else {
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**Subsystem ID: 0x" << std::hex << std::setfill('0') << std::setw(4)
|
||||
<< val_ui16 << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << std::setbase(10);
|
||||
@@ -349,22 +400,6 @@ TestBase::AcceleratorProfileConfig TestBase::getAvailableProfileConfigs(
|
||||
return profile_config;
|
||||
}
|
||||
|
||||
// Prompts on stdout and consumes characters from stdin one at a time until
// either a newline is read or stdin reports EOF. The prompt is re-printed
// before every character read, and each character read is echoed back.
void TestBase::waitForUserInput() {
  bool finished = false;
  while (!finished) {
    std::cout << "\n\t**Press any key to continue**" << std::endl;
    const int raw_input = std::cin.get();
    if (raw_input == EOF) {
      // Nothing more can ever be read; stop waiting instead of spinning.
      std::cout << "EOF detected. Exiting." << std::endl;
      finished = true;
    } else {
      const char typed = static_cast<char>(raw_input);
      std::cout << "User entered: " << typed << std::endl;
      // Only a newline (i.e. the Enter key) ends the wait.
      finished = (typed == '\n');
    }
  }
}
|
||||
|
||||
uint32_t TestBase::promptNumDevicesToTest(uint32_t current_num_devices) {
|
||||
uint32_t return_value = 0;
|
||||
std::cout << "**How many devices would you like to test? (0 to skip): ";
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <limits>
|
||||
#include "amd_smi/amdsmi.h"
|
||||
|
||||
// The max devices can be monitored
|
||||
@@ -133,12 +134,21 @@ class TestBase {
|
||||
amdsmi_accelerator_partition_profile_t current_profile,
|
||||
amdsmi_accelerator_partition_profile_config_t config,
|
||||
bool isVerbose);
|
||||
void waitForUserInput();
|
||||
|
||||
uint32_t promptNumDevicesToTest(uint32_t current_num_devices);
|
||||
|
||||
std::string getResourceType(amdsmi_accelerator_partition_resource_type_t resource_type);
|
||||
|
||||
/**
 * @brief Tell whether @p value equals the largest value representable by T.
 *
 * The comparison is made against std::numeric_limits<T>::max(), so the
 * threshold follows the width of the argument's type (e.g. 0xFFFF for a
 * 16-bit unsigned field, 0xFFFFFFFF for a 32-bit unsigned field).
 *
 * @tparam T     Type of the value being checked; std::numeric_limits<T> must
 *               be meaningful for it.
 * @param  value Value to test.
 * @return true when @p value == std::numeric_limits<T>::max(), else false.
 */
template <typename T>
bool checkIfMaxValue(T value) {
  return value == std::numeric_limits<T>::max();
}
|
||||
|
||||
protected:
|
||||
void MakeHeaderStr(const char *inStr, std::string *outStr) const;
|
||||
void PrintDeviceHeader(amdsmi_processor_handle dv_ind);
|
||||
@@ -163,9 +173,16 @@ class TestBase {
|
||||
|
||||
// Macros to be used within TestBase classes
|
||||
#define CHK_ERR_ASRT(RET) { \
|
||||
if (dont_fail() && ((RET) != AMDSMI_STATUS_SUCCESS)) { \
|
||||
if ((RET) != AMDSMI_STATUS_SUCCESS) { \
|
||||
std::cout << std::endl << "\t===> TEST FAILURE." << std::endl; \
|
||||
DISPLAY_AMDSMI_ERR(RET); \
|
||||
const char *err_str; \
|
||||
std::cout << "\t===> ERROR: AMDSMI call returned " << (RET) << std::endl; \
|
||||
amdsmi_status_code_to_string((RET), &err_str); \
|
||||
std::cout << "\t===> (" << err_str << ")" << std::endl; \
|
||||
std::cout << "\t===> at " << __FILE__ << ":" << std::dec << __LINE__ << \
|
||||
std::endl; \
|
||||
} \
|
||||
if (dont_fail() && ((RET) != AMDSMI_STATUS_SUCCESS)) { \
|
||||
std::cout << \
|
||||
"\t===> Abort is over-ridden due to dont_fail command line option." \
|
||||
<< std::endl; \
|
||||
|
||||
Ссылка в новой задаче
Block a user