bdfid fix for partition & xgmi nodes

* Updates:
    - [API] After discovering all AMD GPUs, we now properly
      map the correct bdf (xgmi nodes). This is especially
      important for partition changes, i.e. secondary nodes.
    - [API] While adding new secondary nodes we now have
      better grouping, due to re-sorting based on the
      kfd properties list and matching to the primary unique id
    - [API] All secondary nodes are now added through
      AddToDeviceList with the correct bdf (location id),
      provided by kfd
    - [API] Modified AddToDeviceList(..., uint64_t bdfid):
      it now takes an optional parameter, bdfid. This allows
      working around primary pcie cards with xgmi nodes
    - [API] Utils - cpplint minor fixes
    - [Example] Replaced all endl references with newline, fixed
      spacing, and fixed some values incorrectly displayed as hex
      (they need decimal representation)
    - [API] kfd node functions - now print full path of file
      for trace logs
    - [Tests] power_read.cc: Added a generic power test to
      confirm that specific return values are guaranteed

Change-Id: I143474e8d64c4915a966e789be6bcea4fa7f4472
Signed-off-by: Charis Poag <Charis.Poag@amd.com>


[ROCm/amdsmi commit: 6f1afd2678]
Этот коммит содержится в:
Charis Poag
2023-10-12 10:54:46 -05:00
коммит произвёл Galantsev, Dmitrii
родитель f6c46e97ee
Коммит d1450bbbcc
6 изменённых файлов: 137 добавлений и 33 удалений
+1 -1
Просмотреть файл
@@ -128,7 +128,7 @@ class RocmSMI {
std::map<std::pair<uint32_t, uint32_t>, std::shared_ptr<IOLink>>
io_link_map_;
std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;
void AddToDeviceList(std::string dev_name);
void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0);
void GetEnvVariables(void);
std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);
+12 -11
Просмотреть файл
@@ -58,8 +58,8 @@
#define PRINT_RSMI_ERR(RET) { \
if (RET != RSMI_STATUS_SUCCESS) { \
std::cout << "[ERROR] RSMI call returned " << (RET) \
<< " at line " << __LINE__ << std::endl; \
std::cout << amd::smi::getRSMIStatusString(RET) << std::endl; \
<< " at line " << __LINE__ << "\n"; \
std::cout << amd::smi::getRSMIStatusString(RET) << "\n"; \
} \
}
@@ -718,7 +718,7 @@ int main() {
rsmi_num_monitor_devices(&num_monitor_devs);
for (uint32_t i = 0; i < num_monitor_devs; ++i) {
std::cout << "\t**Device #: " << std::dec << i << std::endl;
std::cout << "\t**Device #: " << std::dec << i << "\n";
ret = rsmi_dev_id_get(i, &val_ui16);
CHK_RSMI_RET_I(ret)
std::cout << "\t**Device ID: 0x" << std::hex << val_ui16 << "\n";
@@ -765,8 +765,9 @@ int main() {
uint64_t max_bandwidth = 0;
ret = rsmi_minmax_bandwidth_get(0, i, &min_bandwidth, &max_bandwidth);
CHK_RSMI_NOT_SUPPORTED_OR_UNEXPECTED_DATA_RET(ret)
std::cout << "\nMinimum Bandwidth: " << min_bandwidth
<< "\nMaximum Bandwidth: " << max_bandwidth;
std::cout << "\n\t**\tMinimum Bandwidth: " << std::dec << min_bandwidth
<< "\n\t**\tMaximum Bandwidth: " << std::dec
<< max_bandwidth << "\n";
} else {
std::cout << "Not Supported\n";
}
@@ -813,7 +814,7 @@ int main() {
ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE,
rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << val_i64/1000 << "C" << "\n";
std::cout << std::dec << val_i64/1000 << " C" << "\n";
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
@@ -821,7 +822,7 @@ int main() {
ret = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_JUNCTION,
rsmi_temperature_metric_t::RSMI_TEMP_CURRENT, &val_i64);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << (val_i64 / 1000) << "C" << std::endl;
std::cout << std::dec << (val_i64 / 1000) << " C" << "\n";
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
@@ -869,14 +870,14 @@ int main() {
std::cout << "\t**Average Power Usage: ";
ret = rsmi_dev_power_ave_get(i, 0, &val_ui64);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl;
std::cout << convert_mw_to_w(val_ui64) << " W" << "\n";
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "\t**Current Socket Power Usage: ";
ret = rsmi_dev_current_socket_power_get(i, &val_ui64);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << convert_mw_to_w(val_ui64) << " W" << std::endl;
std::cout << convert_mw_to_w(val_ui64) << " W" << "\n";
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
@@ -884,7 +885,7 @@ int main() {
ret = rsmi_dev_power_get(i, &val_ui64, &power_type);
if (ret == RSMI_STATUS_SUCCESS) {
std::cout << "[" << amd::smi::power_type_string(power_type) << "] "
<< convert_mw_to_w(val_ui64) << " W" << std::endl;
<< convert_mw_to_w(val_ui64) << " W" << "\n";
}
CHK_RSMI_NOT_SUPPORTED_RET(ret)
std::cout << "\t=======" << "\n";
@@ -897,7 +898,7 @@ int main() {
return 0;
}
for (uint32_t i = 0; i< num_monitor_devs; ++i) {
for (uint32_t i = 0; i < num_monitor_devs; ++i) {
ret = test_set_overdrive(i);
CHK_AND_PRINT_RSMI_ERR_RET(ret)
+10
Просмотреть файл
@@ -890,9 +890,12 @@ int KFDNode::get_used_memory(uint64_t* used) {
int read_node_properties(uint32_t node, std::string property_name,
uint64_t *val) {
std::ostringstream ss;
std::string propertiesFullPath = "/sys/class/kfd/kfd/topology/nodes/"
+ std::to_string(node) + "/properties";
int retVal = EINVAL;
if (property_name.empty() || val == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | File: " << propertiesFullPath
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", property_name is empty or *val is nullptr "
<< " | return = " << std::to_string(retVal)
@@ -905,6 +908,7 @@ int read_node_properties(uint32_t node, std::string property_name,
if (KFDNodeSupported(node)) {
retVal = myNode->get_property_value(property_name, val);
ss << __PRETTY_FUNCTION__
<< " | File: " << propertiesFullPath
<< " | Successfully read node #" << std::to_string(node)
<< " for property_name = " << property_name
<< " | Data (" << property_name << ") * val = "
@@ -915,6 +919,7 @@ int read_node_properties(uint32_t node, std::string property_name,
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | File: " << propertiesFullPath
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
@@ -927,9 +932,12 @@ int read_node_properties(uint32_t node, std::string property_name,
// /sys/class/kfd/kfd/topology/nodes/*/gpu_id
int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
std::ostringstream ss;
std::string gpu_id_FullPath = "/sys/class/kfd/kfd/topology/nodes/"
+ std::to_string(node) + "/gpu_id";
int retVal = EINVAL;
if (gpu_id == nullptr) {
ss << __PRETTY_FUNCTION__
<< " | File: " << gpu_id_FullPath
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", gpu_id is a nullptr "
<< " | return = " << std::to_string(retVal)
@@ -942,6 +950,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
if (KFDNodeSupported(node)) {
retVal = ReadKFDGpuId(node, gpu_id);
ss << __PRETTY_FUNCTION__
<< " | File: " << gpu_id_FullPath
<< " | Successfully read node #" << std::to_string(node)
<< " for gpu_id"
<< " | Data (gpu_id) *gpu_id = "
@@ -952,6 +961,7 @@ int get_gpu_id(uint32_t node, uint64_t *gpu_id) {
} else {
retVal = 1;
ss << __PRETTY_FUNCTION__
<< " | File: " << gpu_id_FullPath
<< " | Issue: Could not read node #" << std::to_string(node)
<< ", KFD node was an unsupported node."
<< " | return = " << std::to_string(retVal)
+109 -18
Просмотреть файл
@@ -312,6 +312,7 @@ RocmSMI::Initialize(uint64_t flags) {
auto i = 0;
uint32_t ret;
int i_ret;
std::ostringstream ss;
LOG_ALWAYS("=============== ROCM SMI initialize ================");
ROCmLogging::Logger::getInstance()->enableAllLogLevels();
@@ -355,9 +356,32 @@ RocmSMI::Initialize(uint64_t flags) {
if (ConstructBDFID(device->path(), &bdfid) != 0) {
std::cerr << "Failed to construct BDFID." << std::endl;
ret = 1;
} else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) {
// handles secondary partitions - compute partition feature nodes
ss << __PRETTY_FUNCTION__
<< " | [before] device->path() = " << device->path()
<< "\n | bdfid = " << bdfid
<< "\n | device->bdfid() = " << device->bdfid()
<< "\n | (xgmi node) setting to setting "
<< "device->set_bdfid(device->bdfid())";
LOG_TRACE(ss);
device->set_bdfid(device->bdfid());
} else {
// legacy & pcie card updates
ss << __PRETTY_FUNCTION__
<< " | [before] device->path() = " << device->path()
<< "\n | bdfid = " << bdfid
<< "\n | device->bdfid() = " << device->bdfid()
<< "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)";
LOG_TRACE(ss);
device->set_bdfid(bdfid);
}
ss << __PRETTY_FUNCTION__
<< " | [after] device->path() = " << device->path()
<< "\n | bdfid = " << bdfid
<< "\n | device->bdfid() = " << device->bdfid()
<< "\n | final update: device->bdfid() holds correct device bdf";
LOG_TRACE(ss);
}
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
@@ -386,7 +410,6 @@ RocmSMI::Initialize(uint64_t flags) {
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
std::ostringstream ss;
auto dev_iter = devices_.begin();
while (dev_iter != devices_.end()) {
uint64_t bdfid = (*dev_iter)->bdfid();
@@ -665,8 +688,8 @@ RocmSMI::FindMonitor(std::string monitor_path) {
return m;
}
void
RocmSMI::AddToDeviceList(std::string dev_name) {
void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
@@ -684,10 +707,15 @@ RocmSMI::AddToDeviceList(std::string dev_name) {
dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
dev->set_card_index(card_indx);
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
if (bdfid != 0) {
dev->set_bdfid(bdfid);
}
devices_.push_back(dev);
ss << __PRETTY_FUNCTION__ << " | Adding to device list dev_name = "
<< dev_name << " | path = " << dev_path
ss << __PRETTY_FUNCTION__
<< " | Adding to device list dev_name = " << dev_name
<< " | path = " << dev_path
<< " | bdfid = " << bdfid
<< " | card index = " << std::to_string(card_indx) << " | ";
LOG_DEBUG(ss);
}
@@ -768,19 +796,24 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
uint32_t s_node_id = 0;
uint64_t s_gpu_id = 0;
uint64_t s_unique_id = 0;
uint64_t s_location_id = 0;
};
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id}
// allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id,
// location_id}
std::multimap<uint64_t, systemNode> allSystemNodes;
uint32_t node_id = 0;
while (true) {
uint64_t gpu_id = 0, unique_id = 0;
uint64_t gpu_id = 0, unique_id = 0, location_id = 0;
int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
if (ret_gpu_id == 0 || ret_unique_id == 0) {
int ret_loc_id =
read_node_properties(node_id, "location_id", &location_id);
if (ret_gpu_id == 0 || ret_unique_id == 0 || ret_loc_id == 0) {
systemNode myNode;
myNode.s_node_id = node_id;
myNode.s_gpu_id = gpu_id;
myNode.s_unique_id = unique_id;
myNode.s_location_id = location_id;
if (gpu_id != 0) { // only add gpu nodes, 0 = CPU
allSystemNodes.emplace(unique_id, myNode);
}
@@ -795,6 +828,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "; location_id = " << std::to_string(i.second.s_location_id)
<< "], ";
}
ss << "}";
@@ -807,6 +841,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
path += "/card";
path += std::to_string(cardId);
uint64_t primary_unique_id = 0;
uint64_t device_uuid = 0;
bool doesDeviceSupportPartitions = false;
// get current partition
int kSize = 256;
char computePartition[kSize];
std::string strCompPartition = "UNKNOWN";
uint32_t numMonDevices = 0;
rsmi_num_monitor_devices(&numMonDevices);
// each identified gpu card node is a primary node for
// potential matching unique ids
@@ -814,7 +856,25 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
(init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
std::string d_name = "card";
d_name += std::to_string(cardId);
AddToDeviceList(d_name);
uint32_t numMonDevices = 0;
rsmi_num_monitor_devices(&numMonDevices);
if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
== RSMI_STATUS_SUCCESS) {
strCompPartition = computePartition;
doesDeviceSupportPartitions = true;
}
rsmi_status_t ret_unique_id =
rsmi_dev_unique_id_get(cardAdded, &device_uuid);
auto temp_numb_nodes = allSystemNodes.count(device_uuid);
auto primaryBdfId =
allSystemNodes.lower_bound(device_uuid)->second.s_location_id;
if (doesDeviceSupportPartitions && temp_numb_nodes > 1
&& ret_unique_id == RSMI_STATUS_SUCCESS) {
// helps identify xgmi nodes (secondary nodes) easier
AddToDeviceList(d_name, primaryBdfId);
} else {
AddToDeviceList(d_name, UINT64_MAX);
}
ss << __PRETTY_FUNCTION__
<< " | Ordered system nodes seen in lookup = {";
@@ -822,12 +882,14 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
<< "; gpu_id = " << std::to_string(i.second.s_gpu_id)
<< "; unique_id = " << std::to_string(i.second.s_unique_id)
<< "; location_id = " << std::to_string(i.second.s_location_id)
<< "], ";
}
ss << "}";
LOG_DEBUG(ss);
uint64_t temp_primary_unique_id = 0;
uint64_t primary_location_id = 0;
if (allSystemNodes.empty()) {
cardAdded++;
ss << __PRETTY_FUNCTION__
@@ -837,16 +899,11 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
}
// get current partition
const int kSize = 256;
char computePartition[kSize];
std::string strCompPartition = "UNKNOWN";
uint32_t numMonDevices = 0;
rsmi_num_monitor_devices(&numMonDevices);
if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
== RSMI_STATUS_SUCCESS) {
strCompPartition = computePartition;
}
uint64_t device_uuid = 0;
if (rsmi_dev_unique_id_get(cardAdded, &device_uuid)
!= RSMI_STATUS_SUCCESS) {
cardAdded++;
@@ -860,7 +917,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
temp_primary_unique_id =
allSystemNodes.find(device_uuid)->second.s_unique_id;
auto temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id);
temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id);
ss << __PRETTY_FUNCTION__
<< " | device/node id (cardId) = " << std::to_string(cardId)
@@ -892,12 +949,46 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
LOG_DEBUG(ss);
while (numb_nodes > 1) {
std::string secNode = "card";
secNode += std::to_string(cardId); // add the primary node id
AddToDeviceList(secNode);
secNode += std::to_string(cardId); // maps the primary node card to
// secondary - allows get/sets
auto it = allSystemNodes.lower_bound(device_uuid);
auto it_end = allSystemNodes.upper_bound(device_uuid);
if (numb_nodes == temp_numb_nodes) {
auto removalNodeId = it->second.s_node_id;
auto removalGpuId = it->second.s_gpu_id;
auto removalUniqueId = it->second.s_unique_id;
auto removalLocId = it->second.s_location_id;
auto nodesErased = 1;
primary_location_id = removalLocId;
allSystemNodes.erase(it++);
ss << __PRETTY_FUNCTION__
<< "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING "
<< std::to_string(nodesErased) << " node -> [node_id = "
<< std::to_string(removalNodeId)
<< "; gpu_id = " << std::to_string(removalGpuId)
<< "; unique_id = " << std::to_string(removalUniqueId)
<< "; location_id = " << std::to_string(removalLocId)
<< "]";
LOG_DEBUG(ss);
}
if (it == it_end) {
break;
}
auto myBdfId = it->second.s_location_id;
AddToDeviceList(secNode, myBdfId);
ss << __PRETTY_FUNCTION__
<< "\nSECONDARY --> After adding new node; ERASING -> [node_id = "
<< std::to_string(it->second.s_node_id)
<< "; gpu_id = " << std::to_string(it->second.s_gpu_id)
<< "; unique_id = " << std::to_string(it->second.s_unique_id)
<< "; location_id = " << std::to_string(it->second.s_location_id)
<< "]";
LOG_DEBUG(ss);
allSystemNodes.erase(it++);
numb_nodes--;
cardAdded++;
}
// remove already added nodes associated with current card
// remove any remaining nodes associated with current card
auto erasedNodes = allSystemNodes.erase(primary_unique_id);
ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
<< std::to_string(primary_unique_id) << " erased "
+3 -3
Просмотреть файл
@@ -520,7 +520,7 @@ std::vector<std::string> readEntireFile(std::string path) {
void displayAppTmpFilesContent() {
std::vector<std::string> tmpFiles = getListOfAppTmpFiles();
if (!tmpFiles.empty()) {
for (auto &x: tmpFiles) {
for (auto &x : tmpFiles) {
std::string out = readFile(x);
std::cout << __PRETTY_FUNCTION__ << " | Temporary file: " << x
<< "; Contained content: " << out << std::endl;
@@ -539,7 +539,7 @@ std::string debugVectorContent(std::vector<std::string> v) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << *it;
auto temp_it = it;
if(++temp_it != v.end()) {
if (++temp_it != v.end()) {
ss << ", ";
}
}
@@ -557,7 +557,7 @@ std::string displayAllDevicePaths(std::vector<std::shared_ptr<Device>> v) {
for (auto it=v.begin(); it < v.end(); it++) {
ss << (*it)->path();
auto temp_it = it;
if(++temp_it != v.end()) {
if (++temp_it != v.end()) {
ss << ", ";
}
}
+2
Просмотреть файл
@@ -167,6 +167,8 @@ void TestPowerRead::Run(void) {
err = rsmi_dev_power_get(i, &val_ui64, &type);
ASSERT_TRUE(err == RSMI_STATUS_SUCCESS
|| err == RSMI_STATUS_NOT_SUPPORTED);
ASSERT_TRUE(type == RSMI_AVERAGE_POWER || type == RSMI_CURRENT_POWER
|| type == RSMI_INVALID_POWER);
if (err == RSMI_STATUS_NOT_SUPPORTED) {
std::cout <<