diff --git a/projects/rocm-smi-lib/CHANGELOG.md b/projects/rocm-smi-lib/CHANGELOG.md
index 9fcb207e86..130ac2f059 100644
--- a/projects/rocm-smi-lib/CHANGELOG.md
+++ b/projects/rocm-smi-lib/CHANGELOG.md
@@ -4,7 +4,7 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/]
***All information listed below is for reference and subject to change.***
-## amd_smi_lib for ROCm 6.5.0
+## rocm_smi_lib for ROCm 6.5.0
### Added
@@ -42,6 +42,86 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/]
- N/A
+## rocm_smi_lib for ROCm 6.4.1
+
+### Added
+
+- N/A
+
+### Changed
+
+- N/A
+
+### Removed
+
+- N/A
+
+### Optimized
+
+- N/A
+
+### Resolved issues
+
+- **Fixed partition enumeration - partitions now refer to the correct DRM Render and Card paths**
+ Previously, partitions incorrectly reflected the primary node (1st GPU) and showed the DRM Render Minor as renderD128. Partition nodes mirrored renderD128's information, which was incorrect. See the "Previous Outputs in CPX" example below.
+
+ Device enumeration was updated to correctly map DRM Render Minor paths. See the "Corrected Outputs in CPX" example below.
+
+ These changes impact what information is readable/writable for the partition nodes.
+
+ Example: Previous Outputs in CPX
+ ```shell
+ $ rocm-smi
+
+ ============================================ ROCm System Management Interface ============================================
+ ====================================================== Concise Info ======================================================
+ Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+ (DID, GUID) (Junction) (Socket) (Mem, Compute, ID)
+ ==========================================================================================================================
+ 0 2 0x74a1, 18421 45.0°C 152.0W NPS1, CPX, 0 133Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 1 3 0x74a1, 48116 45.0°C 152.0W NPS1, CPX, 1 133Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 2 4 0x74a1, 65524 45.0°C 152.0W NPS1, CPX, 2 138Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 3 5 0x74a1, 1013 45.0°C 152.0W NPS1, CPX, 3 138Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 4 6 0x74a1, 30708 45.0°C 152.0W NPS1, CPX, 4 138Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 5 7 0x74a1, 35829 45.0°C 152.0W NPS1, CPX, 5 153Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 6 8 0x74a1, 53237 45.0°C 152.0W NPS1, CPX, 6 153Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 7 9 0x74a1, 13300 45.0°C 152.0W NPS1, CPX, 7 153Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 8 10 0x74a1, 64360 44.0°C 158.0W NPS1, CPX, 0 144Mhz 900Mhz 0% auto 750.0W 0% 0%
+ ...
+ ==========================================================================================================================
+ ================================================== End of ROCm SMI Log ===================================================
+ ```
+ Example: Corrected Outputs in CPX
+ ```shell
+ $ rocm-smi
+
+ ============================================ ROCm System Management Interface ============================================
+ ====================================================== Concise Info ======================================================
+ Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU%
+ (DID, GUID) (Junction) (Socket) (Mem, Compute, ID)
+ ==========================================================================================================================
+ 0 2 0x74a1, 18421 44.0°C 151.0W NPS1, CPX, 0 132Mhz 900Mhz 0% auto 750.0W 0% 0%
+ 1 3 N/A, 48116 N/A N/A N/A, N/A, 1 N/A N/A 0% n/a N/A 2% N/A
+ 2 4 N/A, 65524 N/A N/A N/A, N/A, 2 N/A N/A 0% n/a N/A 2% N/A
+ 3 5 N/A, 1013 N/A N/A N/A, N/A, 3 N/A N/A 0% n/a N/A 2% N/A
+ 4 6 N/A, 30708 N/A N/A N/A, N/A, 4 N/A N/A 0% n/a N/A 2% N/A
+ 5 7 N/A, 35829 N/A N/A N/A, N/A, 5 N/A N/A 0% n/a N/A 2% N/A
+ 6 8 N/A, 53237 N/A N/A N/A, N/A, 6 N/A N/A 0% n/a N/A 2% N/A
+ 7 9 N/A, 13300 N/A N/A N/A, N/A, 7 N/A N/A 0% n/a N/A 2% N/A
+ 8 10 0x74a1, 64360 44.0°C 158.0W NPS1, CPX, 0 132Mhz 900Mhz 0% auto 750.0W 0% 0%
+ ...
+ ==========================================================================================================================
+ ================================================== End of ROCm SMI Log ===================================================
+ ```
+
+### Upcoming changes
+
+- N/A
+
+### Known issues
+
+- N/A
+
## rocm_smi_lib for ROCm 6.4
### Added
diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
index bb3c4cd49c..d751037a52 100755
--- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
+++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h
@@ -1297,6 +1297,34 @@ typedef union id {
};
} rsmi_func_id_value_t;
+/**
+ * @struct rsmi_device_identifiers_t
+ * @brief Structure to hold various identifiers for a GPU device.
+ *
+ * @details This structure contains fields that uniquely identify a GPU device,
+ * including its card index, DRM render minor, PCI Bus/Device/Function ID (BDFID),
+ * KFD GPU ID, partition ID, and SMI device ID.
+ */
+typedef struct {
+ //! The card index of the device.
+ uint32_t card_index;
+ //! The DRM render minor number of the device.
+ uint32_t drm_render_minor;
+
+ //! The PCI Bus/Device/Function identifier (BDFID) of the device.
+ uint64_t bdfid;
+
+ //! The KFD (Kernel Fusion Driver) GPU ID of the device.
+ uint64_t kfd_gpu_id;
+
+ //! The partition ID of the device.
+ uint32_t partition_id;
+
+ //! The SMI (System Management Interface) device ID.
+ uint32_t smi_device_id;
+
+ uint32_t reserved[10];  //!< Reserved; left untouched by Device::get_smi_device_identifiers().
+} rsmi_device_identifiers_t;
/*****************************************************************************/
/** @defgroup InitShutAdmin Initialization and Shutdown
@@ -1824,6 +1852,35 @@ rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid);
*/
rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id);
+/**
+ * @brief Retrieves the device identifiers for a specific GPU device.
+ *
+ * @details This function retrieves various identifiers for a GPU device, such as
+ * the card index, DRM render minor, BDFID, KFD GPU ID, partition ID, and SMI device ID.
+ * The identifiers are written to the provided `rsmi_device_identifiers_t` structure.
+ *
+ * @param[in] dv_ind a device index.
+ *
+ * @param[out] identifiers A pointer to a structure of type `rsmi_device_identifiers_t`
+ * where the device identifiers will be stored. The structure
+ * contains fields such as:
+ * - `card_index`: The card index of the device.
+ * - `drm_render_minor`: The DRM render minor number.
+ * - `bdfid`: The Bus/Device/Function PCI identifier.
+ * - `kfd_gpu_id`: The KFD GPU ID.
+ * - `partition_id`: The partition ID of the device (0 if it cannot be read).
+ * - `smi_device_id`: The SMI device ID.
+ *
+ * @retval ::RSMI_STATUS_SUCCESS The call was successful, and the device identifiers were retrieved.
+ * @retval ::RSMI_STATUS_NOT_SUPPORTED The installed software or hardware does not support this function
+ * with the given arguments.
+ * @retval ::RSMI_STATUS_INVALID_ARGS The provided arguments are invalid.
+ *
+ * @note Ensure that the `identifiers` pointer is valid and points to a properly allocated structure
+ * before calling this function.
+ */
+rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind,
+ rsmi_device_identifiers_t *identifiers);
/** @} */ // end of IDQuer
diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
index ea2abb1cd0..a891a6694a 100755
--- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
+++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h
@@ -248,6 +248,8 @@ class Device {
void set_smi_device_id(uint32_t i) { m_device_id = i; }
void set_smi_partition_id(uint32_t i) { m_partition_id = i; }
static const char* get_type_string(DevInfoTypes type);
+ rsmi_status_t get_smi_device_identifiers(uint32_t device_id,
+ rsmi_device_identifiers_t *device_identifiers);
private:
std::shared_ptr monitor_;
diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h
index 0a66ea227c..1d639d7bd9 100755
--- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h
+++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h
@@ -131,6 +131,15 @@ class RocmSMI {
io_link_map_;
std::map dev_ind_to_node_ind_map_;
void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0);
+ typedef struct {  // input record for AddToDeviceList2(); max()/"" defaults mark unset fields
+ uint32_t card_index = std::numeric_limits::max();  // N of "cardN"; derived via GetCard() when dev_name is empty
+ std::string dev_name = "";  // e.g. "card<N>"; when empty, AddToDeviceList2() derives it from drm_render_minor
+ std::string drm_render_path = "";  // kPathDRMRoot + "/renderD<minor>" once derived
+ std::string drm_card_path = "";  // kPathDRMRoot + "/card<N>" once derived
+ uint32_t drm_render_minor = std::numeric_limits::max();  // used to build the renderD<minor> path
+ uint64_t bdfid = std::numeric_limits::max();  // copied onto the Device unless it is 0
+ } rsmi_device_enumeration_t;
+ rsmi_status_t AddToDeviceList2(rsmi_device_enumeration_t device);
void GetEnvVariables(void);
std::shared_ptr FindMonitor(std::string monitor_path);
diff --git a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
index ada3b8e98e..7b0beac06c 100755
--- a/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
+++ b/projects/rocm-smi-lib/python_smi_tools/rocm_smi.py
@@ -321,7 +321,7 @@ def getDRMDeviceId(device, silent=False):
dv_id = c_short()
ret = rocmsmi.rsmi_dev_id_get(device, byref(dv_id))
device_id_ret = "N/A"
- if rsmi_ret_ok(ret, device, 'get_device_id', silent):
+ if rsmi_ret_ok(ret, device, 'get_device_id', silent=True):
device_id_ret = hex(dv_id.value)
return device_id_ret
@@ -336,7 +336,7 @@ def getRev(device, silent=False):
dv_rev = c_short()
ret = rocmsmi.rsmi_dev_revision_get(device, byref(dv_rev))
revision_ret = "N/A"
- if rsmi_ret_ok(ret, device, 'get_device_rev', silent=silent):
+ if rsmi_ret_ok(ret, device, 'get_device_rev', silent=True):
revision_ret = padHexValue(hex(dv_rev.value), 2)
return revision_ret
@@ -350,7 +350,7 @@ def getSubsystemId(device, silent=False):
model = c_short()
ret = rocmsmi.rsmi_dev_subsystem_id_get(device, byref(model))
device_model = "N/A"
- if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent=silent):
+ if rsmi_ret_ok(ret, device, 'get_subsystem_name', silent=True):
device_model = model.value
# padHexValue is used for applications that expect 4-digit card models
device_model = padHexValue(hex(device_model), 4)
@@ -1986,7 +1986,7 @@ def showAllConcise(deviceList):
(retCode, fanLevel, fanSpeed) = getFanSpeed(device, silent)
fan = str(fanSpeed) + '%'
if getPerfLevel(device, silent) != -1:
- perf = getPerfLevel(device, silent)
+ perf = str(getPerfLevel(device, silent)).lower()
else:
perf = 'N/A'
if getMaxPower(device, silent) != -1:
@@ -2007,7 +2007,7 @@ def showAllConcise(deviceList):
str(getGUID(device)),
temp_val, powerVal,
combined_partition_data,
- sclk, mclk, fan, str(perf).lower(),
+ sclk, mclk, fan, perf,
str(pwrCap),
allocated_mem_percent['combined'],
str(gpu_busy)]
@@ -2514,7 +2514,7 @@ def showMemUse(deviceList):
printLog(device, 'GPU Memory Allocated (VRAM%)',
int(allocated_mem_percent['value']))
ret = rocmsmi.rsmi_dev_memory_busy_percent_get(device, byref(memoryUse))
- if rsmi_ret_ok(ret, device, '% memory use'):
+ if rsmi_ret_ok(ret, device, '% memory use', silent=True):
printLog(device, 'GPU Memory Read/Write Activity (%)', memoryUse.value)
util_counters = getCoarseGrainUtil(device, "Memory Activity")
if util_counters != -1:
diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc
index f9ff48cdfb..d77b995284 100755
--- a/projects/rocm-smi-lib/src/rocm_smi.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi.cc
@@ -5402,8 +5402,10 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
std::string strCompPartition = "UNKNOWN";
const uint32_t PARTITION_LEN = 10;
char compute_partition[PARTITION_LEN];
+ compute_partition[0] = '\0';
rsmi_status_t ret = rsmi_dev_compute_partition_get(dv_ind, compute_partition, PARTITION_LEN);
if (ret == RSMI_STATUS_SUCCESS) {
+ strCompPartition.clear();
strCompPartition = compute_partition;
}
uint64_t pci_id = UINT64_MAX;
@@ -5412,6 +5414,17 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
if (ret == RSMI_STATUS_SUCCESS) {
*partition_id = static_cast((pci_id >> 28) & 0xf);
}
+ std::ostringstream bdf_sstream;
+ bdf_sstream << std::hex << std::setfill('0') << std::setw(4)
+ << ((pci_id >> 32) & 0xFFFFFFFF) << ":";
+ bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 8) & 0xFF) << ":";
+ bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << ((pci_id >> 3) & 0x1F) << ".";
+ bdf_sstream << std::hex << std::setfill('0') << +(pci_id & 0x7);
+ bdf_sstream << "\n[Option 1] Partition ID ((pci_id >> 28) & 0xf): " << std::dec
+ << static_cast((pci_id >> 28) & 0xf);
+ bdf_sstream << "\n[Option 2] Partition ID (pci_id & 0x7): " << std::dec
+ << static_cast(pci_id & 0x7);
+ // std::cout << __PRETTY_FUNCTION__ << " BDF: " << bdf_sstream.str() << std::endl;
/**
* Fall back is required due to driver changes within KFD.
@@ -5428,19 +5441,24 @@ rsmi_dev_partition_id_get(uint32_t dv_ind, uint32_t *partition_id) {
* bits [7:3] = Device
* bits [2:0] = Function (partition id maybe in bits [2:0]) <-- Fallback for non SPX modes
*/
+
+ // If the partition_id is still not set (bits [31:28]), we will use the fallback
+ // in function bits. We will use bits [2:0] as the partition ID.
if (*partition_id != UINT32_MAX && *partition_id == 0 &&
- (strCompPartition == "DPX" || strCompPartition == "TPX"
- || strCompPartition == "CPX" || strCompPartition == "QPX")) {
+ static_cast(pci_id & 0x7) != 0) {
*partition_id = static_cast(pci_id & 0x7);
}
ss << __PRETTY_FUNCTION__
<< " | ======= end ======= "
<< " | Success"
<< " | Device #: " << dv_ind
+ << " | Compute Partition: " << strCompPartition
<< " | Type: partition_id"
- << " | Data: " << *partition_id
+ << " | Data: " << static_cast(*partition_id)
<< " | Returning = "
- << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |";
+ << getRSMIStatusString(RSMI_STATUS_SUCCESS) << " |"
+ << "\n BDF: " << bdf_sstream.str() << std::endl;
+ // std::cout << ss.str() << std::endl;
LOG_INFO(ss);
return ret;
CATCH
@@ -6307,6 +6325,21 @@ rsmi_dev_metrics_log_get(uint32_t dv_ind)
CATCH
}
+rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind,  // public entry point: fills *smi_device_identifiers for device dv_ind
+ rsmi_device_identifiers_t *smi_device_identifiers) {
+ TRY
+ std::ostringstream ss;
+ ss << __PRETTY_FUNCTION__ << "| ======= start =======";
+ LOG_TRACE(ss);
+ GET_DEV_FROM_INDX  // macro defined elsewhere; presumably introduces `dev` used below - confirm
+ if (smi_device_identifiers == nullptr) {  // null output pointer is rejected (checked only after the device lookup)
+ return RSMI_STATUS_INVALID_ARGS;
+ }
+ rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;  // NOTE(review): this initial value is never observed
+ return ret = dev->get_smi_device_identifiers(dv_ind, smi_device_identifiers);  // status comes straight from the Device method
+ CATCH
+}
+
// UNDOCUMENTED FUNCTIONS
// This functions are not declared in rocm_smi.h. They are either not fully
diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc
index 5f6d9c98dd..62ced138e1 100755
--- a/projects/rocm-smi-lib/src/rocm_smi_device.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc
@@ -1663,6 +1663,57 @@ std::string Device::readBootPartitionState(
return boot_state;
}
+rsmi_status_t Device::get_smi_device_identifiers(uint32_t device_id,  // device_id is an index into RocmSMI::devices(), not a property of *this
+ rsmi_device_identifiers_t *device_identifiers) {
+ bool found_device = false;
+ std::ostringstream ss;
+ rsmi_status_t ret = RSMI_STATUS_NOT_SUPPORTED;  // returned as-is when device_id matches no index in the list
+ if (device_identifiers == nullptr) {  // null output pointer -> RSMI_STATUS_INVALID_ARGS
+ return RSMI_STATUS_INVALID_ARGS;
+ }
+
+ amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
+ auto devices = smi.devices();  // NOTE(review): plain `auto` copies the container if devices() returns a reference - confirm
+ ss << __PRETTY_FUNCTION__ << " | device_id = " << device_id
+ << "; devices.size() = " << devices.size();
+ // std::cout << ss.str() << "\n";
+ LOG_DEBUG(ss);
+
+ for (uint32_t i = 0; i < devices.size(); i++) {
+ if (i != device_id) {  // only the entry at position device_id is reported
+ continue;
+ }
+
+ device_identifiers->card_index = devices[i]->index();
+ device_identifiers->drm_render_minor = devices[i]->drm_render_minor();
+ device_identifiers->bdfid = devices[i]->bdfid();
+ device_identifiers->kfd_gpu_id = devices[i]->kfd_gpu_id();
+ uint32_t temp_partition_id = 0;
+ rsmi_status_t ret = rsmi_dev_partition_id_get(  // NOTE(review): shadows the outer `ret`; this status never reaches the caller
+ i, &temp_partition_id);
+ if (ret != RSMI_STATUS_SUCCESS) {
+ temp_partition_id = 0;  // partition ID falls back to 0 when the query fails
+ }
+ device_identifiers->partition_id = temp_partition_id;
+ device_identifiers->smi_device_id = i;  // equals device_id at this point
+ found_device = true;
+ ss << __PRETTY_FUNCTION__ << " | Found device: "
+ << "card_index = " << device_identifiers->card_index
+ << "; drm_render_minor = " << device_identifiers->drm_render_minor
+ << "; bdfid = " << std::hex << "0x" << device_identifiers->bdfid
+ << "; kfd_gpu_id = " << std::dec << device_identifiers->kfd_gpu_id
+ << "; partition_id = " << device_identifiers->partition_id
+ << "; smi_device_id = " << device_identifiers->smi_device_id;
+ // std::cout << ss.str() << "\n";
+ LOG_DEBUG(ss);
+ break;
+ }
+ if (found_device) {  // success only when the index was found in the list
+ ret = RSMI_STATUS_SUCCESS;
+ }
+ return ret;
+}
+
#undef RET_IF_NONZERO
} // namespace smi
diff --git a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc
index ae6e2b75f5..ed0ff27a42 100755
--- a/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi_gpu_metrics.cc
@@ -4573,8 +4573,13 @@ rsmi_dev_gpu_metrics_info_get(uint32_t dv_ind, rsmi_gpu_metrics_t* smu) {
dev->set_smi_device_id(dv_ind);
uint32_t partition_id = 0;
- rsmi_dev_partition_id_get(dv_ind, &partition_id);
- dev->set_smi_partition_id(partition_id);
+ auto ret = rsmi_dev_partition_id_get(dv_ind, &partition_id);
+ if (ret == RSMI_STATUS_SUCCESS) {
+ dev->set_smi_partition_id(partition_id);
+ } else {
+ dev->set_smi_partition_id(0);
+ }
+
dev->dev_log_gpu_metrics(ostrstream);
const auto [error_code, external_metrics] = dev->dev_copy_internal_to_external_metrics();
diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc
index 0185feda61..be5fc72553 100755
--- a/projects/rocm-smi-lib/src/rocm_smi_main.cc
+++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc
@@ -73,8 +73,6 @@ static const char *kPathDRMRoot = "/sys/class/drm";
static const char *kPathHWMonRoot = "/sys/class/hwmon";
static const char *kPathPowerRoot = "/sys/kernel/debug/dri";
-static const char *kDeviceNamePrefix = "card";
-
static const char *kAMDMonitorTypes[] = {"radeon", "amdgpu", ""};
namespace amd {
@@ -127,6 +125,44 @@ static uint32_t GetDrmRenderMinor(const std::string s) {
return static_cast(drm_minor);
}
+// Find the DRM card number for the device at sysfs path `s` by scanning
+// "<s>/device/drm" (e.g. "/sys/class/drm/renderDX/device/drm") for an entry
+// named cardN and parsing N from that name.
+// Returns N, or 0xFFFFFFFF when the directory cannot be opened/closed or holds no cardN.
+static uint32_t GetCard(const std::string s) {
+ std::ostringstream ss;
+ std::string drm_path = s;
+ int card_num = -1;  // stays -1 (returned as 0xFFFFFFFF) if no "card*" entry is seen
+ const std::string card_file_prefix = "card";
+ const uint64_t prefix_size = card_file_prefix.size();
+ drm_path += "/device/drm";
+
+ auto card_dir = opendir(drm_path.c_str());
+ if (card_dir == nullptr)  // directory missing or unreadable
+ return static_cast(-1);
+
+ auto dentry = readdir(card_dir);
+
+ while (dentry != nullptr) {
+ std::string card_file = dentry->d_name;
+ if (!card_file.compare(0, prefix_size, card_file_prefix)) {  // entry name starts with "card"
+ card_num = stoi(card_file.substr(prefix_size));  // NOTE(review): std::stoi throws if the suffix is not numeric
+ if (card_num)  // NOTE(review): a "card0" match does not stop the scan; only a non-zero number breaks - intended?
+ break;
+ }
+ dentry = readdir(card_dir);
+ }
+
+ if (closedir(card_dir)) {  // a close failure discards any card number already found
+ return static_cast(-1);
+ }
+
+ ss << __PRETTY_FUNCTION__ << " | Discovered card = "
+ << std::to_string(card_num) << " | For drm_path = " << drm_path << " | ";
+ LOG_DEBUG(ss);
+ return static_cast(card_num);
+}
+
// Determine if provided string is a bdfid pci path directory of the form
// XXXX:XX:XX.X,
// domain:bus:device.function
@@ -190,12 +226,13 @@ static bool bdfid_from_path(const std::string in_name, uint64_t *bdfid) {
// 0 = successful bdfid found
// 1 = not a good bdfid found
-static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
+[[maybe_unused]] static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
+ std::ostringstream ss;
assert(bdfid != nullptr);
const unsigned int MAX_BDF_LENGTH = 512;
char tpath[MAX_BDF_LENGTH] = {'\0'};
ssize_t ret;
- memset(tpath,0,MAX_BDF_LENGTH);
+ memset(tpath, 0, MAX_BDF_LENGTH);
ret = readlink(path.c_str(), tpath, MAX_BDF_LENGTH);
@@ -203,6 +240,12 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
assert(ret < MAX_BDF_LENGTH);
if (ret <= 0 || ret >= MAX_BDF_LENGTH) {
+ ss << __PRETTY_FUNCTION__ << " | readlink failed for path = "
+ << path << " | ret = " << ret
+ << " | errno = " << errno
+ << " | error = " << strerror(errno);
+ // std::cout << ss.str() << std::endl;
+ LOG_ERROR(ss);
return 1;
}
@@ -220,11 +263,19 @@ static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
tmp = tpath_str.substr(slash_i + 1, end_i - slash_i);
if (bdfid_from_path(tmp, bdfid)) {
+ ss << __PRETTY_FUNCTION__ << " | Found bdfid = "
+ << print_int_as_hex(*bdfid, true, 8) << " | from path = "
+ << path << " | tmp = " << tmp << std::endl;
+ LOG_INFO(ss);
return 0;
}
end_i = slash_i - 1;
}
-
+ ss << __PRETTY_FUNCTION__ << " | No valid bdfid found in path = "
+ << path << " | tpath = " << tpath
+ << " | errno = " << errno
+ << " | error = " << strerror(errno) << std::endl;
+ LOG_ERROR(ss);
return 1;
}
@@ -273,41 +324,8 @@ RocmSMI::Initialize(uint64_t flags) {
"DiscoverAmdgpuDevices() failed.");
}
- uint64_t bdfid;
- for (auto & device : devices_) {
- if (ConstructBDFID(device->path(), &bdfid) != 0) {
- std::cerr << "Failed to construct BDFID." << std::endl;
- ret = 1;
- } else if (device->bdfid() != UINT64_MAX && device->bdfid() != bdfid) {
- // handles secondary partitions - compute partition feature nodes
- ss << __PRETTY_FUNCTION__
- << " | [before] device->path() = " << device->path()
- << "\n | bdfid = " << bdfid
- << "\n | device->bdfid() = " << device->bdfid()
- << " (" << print_int_as_hex(device->bdfid()) << ")"
- << "\n | (xgmi node) setting to setting "
- << "device->set_bdfid(device->bdfid())";
- LOG_TRACE(ss);
- device->set_bdfid(device->bdfid());
- } else {
- // legacy & pcie card updates
- ss << __PRETTY_FUNCTION__
- << " | [before] device->path() = " << device->path()
- << "\n | bdfid = " << bdfid
- << "\n | device->bdfid() = " << device->bdfid()
- << " (" << print_int_as_hex(device->bdfid()) << ")"
- << "\n | (legacy/pcie card) setting device->set_bdfid(bdfid)";
- LOG_TRACE(ss);
- device->set_bdfid(bdfid);
- }
- ss << __PRETTY_FUNCTION__
- << " | [after] device->path() = " << device->path()
- << "\n | bdfid = " << bdfid
- << "\n | device->bdfid() = " << device->bdfid()
- << " (" << print_int_as_hex(device->bdfid()) << ")"
- << "\n | final update: device->bdfid() holds correct device bdf";
- LOG_TRACE(ss);
- }
+ ss << __PRETTY_FUNCTION__ << " | about to sort by BDF..." << std::endl;
+ LOG_DEBUG(ss);
std::shared_ptr dev;
// Sort index based on the BDF, collect BDF id firstly.
@@ -402,6 +420,7 @@ RocmSMI::Initialize(uint64_t flags) {
// displayAppTmpFilesContent();
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);
ss << __PRETTY_FUNCTION__ << " | current device paths = " << amdGPUDeviceList;
+ // std::cout << ss.str() << std::endl;
LOG_DEBUG(ss);
}
@@ -643,9 +662,11 @@ RocmSMI::FindMonitor(std::string monitor_path) {
}
void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
+ static const int BYTE = 8;
std::ostringstream ss;
ss << __PRETTY_FUNCTION__ << " | ======= start =======";
LOG_TRACE(ss);
+
auto dev_path = std::string(kPathDRMRoot);
dev_path += "/";
dev_path += dev_name;
@@ -657,7 +678,8 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
const std::string& d_name = dev_name;
uint32_t card_indx = GetDeviceIndex(d_name);
- dev->set_drm_render_minor(GetDrmRenderMinor(dev_path));
+ uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path);
+ dev->set_drm_render_minor(drmRenderMinor);
dev->set_card_index(card_indx);
GetSupportedEventGroups(card_indx, dev->supported_event_groups());
if (bdfid != 0) {
@@ -666,16 +688,120 @@ void RocmSMI::AddToDeviceList(std::string dev_name, uint64_t bdfid) {
devices_.push_back(dev);
ss << __PRETTY_FUNCTION__
- << " | Adding to device list dev_name = " << dev_name
- << " | path = " << dev_path
- << " | bdfid = " << bdfid
- << " | card index = " << std::to_string(card_indx) << " | ";
+ << " | Adding to device list dev_name = " << dev_name << "\n"
+ << " | path = " << dev_path << "\n"
+ << " | dName = " << d_name << "\n"
+ << " | bdfid = " << (bdfid == UINT64_MAX ?
+ "N/A" : print_int_as_hex(bdfid, true, 2*BYTE)) << "\n"
+ << " | card index = " << std::to_string(card_indx) << "\n"
+ << " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n"
+ << " | supported_event_groups = " << dev->supported_event_groups() << "\n";
+ // std::cout << ss.str() << std::endl;
LOG_DEBUG(ss);
}
+// AddToDeviceList2 builds a Device from `device` and appends it to devices_.
+// [precondition] a. Iterate through KFD to find all accessible devices.
+// [precondition] b. Provide BDFID of the device & the device path (card or render path)
+// 1. Fields of rsmi_device_enumeration_t (passed by value: updates stay local):
+// [optional; derived when dev_name is empty] rsmi_device_enumeration_t->card_index
+// [optional; derived as "card<N>" via GetCard()
+// when left empty] rsmi_device_enumeration_t->dev_name
+// [optional; derived when dev_name is empty] rsmi_device_enumeration_t->drm_render_path
+// [optional; derived when dev_name is empty] rsmi_device_enumeration_t->drm_card_path
+// [read to build the renderD<minor> path when dev_name is empty] rsmi_device_enumeration_t->drm_render_minor
+// [Required; stored on the Device unless it is 0] rsmi_device_enumeration_t->bdfid
+rsmi_status_t RocmSMI::AddToDeviceList2(RocmSMI::rsmi_device_enumeration_t device) {
+ static const int BYTE = 8;
+ std::ostringstream ss;
+
+ ss << __PRETTY_FUNCTION__ << " | ======= start ======="
+ << "\n | card index = [" << std::to_string(device.card_index) << "]\n"
+ << " | dev_name = [" << device.dev_name << "]\n"
+ << " | drm_render_path = [" << device.drm_render_path << "]\n"
+ << " | drm_card_path = [" << device.drm_card_path << "]\n"
+ << " | drm_render_minor = [" << std::to_string(device.drm_render_minor)
+ << "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ?
+ "N/A" : print_int_as_hex(device.bdfid, true, 4*BYTE)) << "]\n"  // NOTE(review): 4*BYTE here, 8*BYTE in the closing log below
+ << " | bdfid (str) = ["
+ << std::hex << std::setfill('0') << std::setw(4)
+ << ((device.bdfid >> 32) & static_cast(0xFFFFFFFF)) << ":"
+ << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8)
+ & static_cast(0xFF)) << ":"
+ << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3)
+ & static_cast(0x1F)) << "."
+ << std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid
+ & static_cast(0x7)) << "]\n";
+ // std::cout << ss.str() << std::endl;
+ LOG_TRACE(ss);
+ auto dev_path = std::string(kPathDRMRoot);
+
+ if (device.dev_name.empty()) {  // no name supplied: derive names and paths from the render minor
+ ss << __PRETTY_FUNCTION__ << " | dev_name is empty";
+ // std::cout << ss.str() << std::endl;
+ LOG_DEBUG(ss);
+
+ dev_path += "/";
+ dev_path += ("renderD" + std::to_string(device.drm_render_minor));
+ uint32_t card_num = GetCard(dev_path);  // NOTE(review): GetCard's 0xFFFFFFFF failure value is not checked
+ device.dev_name = "card" + std::to_string(card_num);
+ device.drm_render_path = dev_path;
+ device.drm_card_path = std::string(kPathDRMRoot) + "/card" +
+ std::to_string(card_num);
+ device.card_index = card_num;
+ }
+
+ auto dev = std::make_shared(dev_path, &env_vars_);  // NOTE(review): if dev_name was non-empty, dev_path is still only kPathDRMRoot - confirm
+
+ std::shared_ptr m = FindMonitor(dev_path + "/device/hwmon");
+ dev->set_monitor(m);
+
+ const std::string& d_name = device.dev_name;
+ uint32_t card_indx = GetDeviceIndex(d_name);
+ uint32_t drmRenderMinor = GetDrmRenderMinor(dev_path);  // re-read from dev_path rather than taken from device.drm_render_minor
+ dev->set_drm_render_minor(drmRenderMinor);
+ dev->set_card_index(card_indx);
+ GetSupportedEventGroups(card_indx, dev->supported_event_groups());
+ if (device.bdfid != 0) {  // note: the struct's max() default also passes this test
+ dev->set_bdfid(device.bdfid);
+ }
+
+ devices_.push_back(dev);
+ ss << __PRETTY_FUNCTION__
+ << " | Adding to device list dev_name = " << device.dev_name << "\n"
+ << " | path = " << dev_path << "\n"
+ << " | dName = " << d_name << "\n"
+ << " | bdfid = " << (device.bdfid == UINT64_MAX ?
+ "N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "\n"
+ << " | card index = " << std::to_string(card_indx) << "\n"
+ << " | drmRenderMinor = " << std::to_string(drmRenderMinor) << "\n"
+ << " | supported_event_groups = " << dev->supported_event_groups() << "\n";
+ ss << " | ======= rsmi_device_enumeration_t details =======\n"
+ << " | card index = [" << std::to_string(device.card_index) << "]\n"
+ << " | dev_name = [" << device.dev_name << "]\n"
+ << " | drm_render_path = [" << device.drm_render_path << "]\n"
+ << " | drm_card_path = [" << device.drm_card_path << "]\n"
+ << " | drm_render_minor = [" << std::to_string(device.drm_render_minor)
+ << "]\n | bdfid (value) = [" << (device.bdfid == UINT64_MAX ?
+ "N/A" : print_int_as_hex(device.bdfid, true, 8*BYTE)) << "]\n"
+ << " | bdfid (str) = ["
+ << std::hex << std::setfill('0') << std::setw(4)
+ << ((device.bdfid >> 32) & static_cast(0xFFFFFFFF)) << ":"
+ << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 8)
+ & static_cast(0xFF)) << ":"
+ << std::hex << std::setfill('0') << std::setw(2) << ((device.bdfid >> 3)
+ & static_cast(0x1F)) << "."
+ << std::hex << std::setfill('0') << std::setw(1) << +(device.bdfid
+ & static_cast(0x7)) << "]\n"
+ << " | END";
+ // std::cout << ss.str() << std::endl;
+ LOG_DEBUG(ss);
+ return RSMI_STATUS_SUCCESS;  // no failure path: this function always reports success
+}
+
static const uint32_t kAmdGpuId = 0x1002;
-static bool isAMDGPU(std::string dev_path) {
+[[maybe_unused]] static bool isAMDGPU(std::string dev_path) {
bool isAmdGpu = false;
std::ostringstream ss;
std::string vend_path = dev_path + "/device/vendor";
@@ -711,44 +837,73 @@ static bool isAMDGPU(std::string dev_path) {
return isAmdGpu;
}
+uint32_t GetLargestNodeNumber(const std::string& path = "/sys/class/kfd/kfd/topology/nodes/") {  // NOTE(review): not `static`, unlike the sibling helpers
+ std::ostringstream ss;
+ uint32_t largest_node_number = 0;  // result stays 0 when no all-digit entry is found
+
+ // Scan `path` for the largest all-digit entry name; UINT32_MAX signals an open/close failure
+ DIR* dir = opendir(path.c_str());
+ if (!dir) {
+ // Return UINT32_MAX on error
+ ss << __PRETTY_FUNCTION__ << " | Failed to open directory: " << path
+ << " | errno = " << errno
+ << " | error = " << strerror(errno);
+ // std::cout << ss.str() << std::endl;
+ LOG_ERROR(ss);
+ return UINT32_MAX;
+ }
+
+ struct dirent* entry;
+ while ((entry = readdir(dir)) != nullptr) {
+ // Skip "." and ".." (in fact, any entry whose name starts with '.')
+ if (entry->d_name[0] == '.') {
+ continue;
+ }
+
+ // Only names made up entirely of digits are treated as node numbers
+ std::string dir_name(entry->d_name);
+ if (std::all_of(dir_name.begin(), dir_name.end(), ::isdigit)) {  // NOTE(review): ::isdigit is applied to plain char values
+ uint32_t node_number = static_cast(std::stoul(dir_name));  // NOTE(review): std::stoul throws std::out_of_range on very long digit strings
+ largest_node_number = std::max(largest_node_number, node_number);
+ }
+ }
+
+ if (closedir(dir)) {
+ // Return UINT32_MAX on error
+ ss << __PRETTY_FUNCTION__ << " | Failed to close directory: " << path
+ << " | errno = " << errno
+ << " | error = " << strerror(errno);
+ // std::cout << ss.str() << std::endl;
+ LOG_ERROR(ss);
+ return UINT32_MAX;
+ }
+
+ return largest_node_number;
+}
+
 uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
 std::string err_msg;
- uint32_t count = 0;
- int32_t cardId = 0;
- int32_t max_cardId = -1;
 std::ostringstream ss;
 // If this gets called more than once, clear previous findings.
 devices_.clear();
 monitors_.clear();
- auto drm_dir = opendir(kPathDRMRoot);
- if (drm_dir == nullptr) {
- err_msg = "Failed to open drm root directory ";
- err_msg += kPathDRMRoot;
- err_msg += ".";
- perror(err_msg.c_str());
+ uint32_t max_nodes = GetLargestNodeNumber();  // UINT32_MAX means the lookup failed
+ ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
+ << std::to_string(max_nodes) << " kfd nodes";
+ // std::cout << ss.str() << std::endl;
+ LOG_DEBUG(ss);  // NOTE(review): emitted before the UINT32_MAX check below, so a failure is first logged as a node count
+ if (max_nodes == UINT32_MAX) {
+ ss << __PRETTY_FUNCTION__ << " | Failed to get largest node number";
+ // std::cout << ss.str() << std::endl;
+ LOG_ERROR(ss);
 return 1;
 }
-
- auto dentry = readdir(drm_dir);
-
- while (dentry != nullptr) {
- if (memcmp(dentry->d_name, kDeviceNamePrefix, strlen(kDeviceNamePrefix))
- == 0) {
- if ((strcmp(dentry->d_name, ".") == 0) ||
- (strcmp(dentry->d_name, "..") == 0))
- continue;
- sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId);
- if (cardId > max_cardId)
- max_cardId = cardId;
- count++;
- }
- dentry = readdir(drm_dir);
- }
- ss << __PRETTY_FUNCTION__ << " | Discovered a potential of "
- << std::to_string(count) << " cards" << " | ";
- LOG_DEBUG(ss);
+ // Probe every KFD topology node id from 0 through max_nodes,
+ // reading each node's properties found
+ // under /sys/class/kfd/kfd/topology/nodes/
+ // and collecting the GPU nodes (gpu_id != 0) in the systemNodes vector
 struct systemNode {
 uint32_t s_node_id = 0;
@@ -761,24 +916,44 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
 uint8_t s_device = 0;
 uint8_t s_function = 0;
 uint8_t s_partition_id = 0;
+ uint32_t s_drm_render_minor = 0;
 uint64_t padding = 0; // padding added in case new changes in future
 };
- // allSystemNodes[key = unique_id] => {node_id, gpu_id, unique_id,
- // location_id, bdf, domain, bus, device,
- // partition_id}
 std::multimap allSystemNodes;
+ std::set gpuNodeIdsFound;
+ std::vector systemNodes;
 uint32_t node_id = 0;
 static const int BYTE = 8;
- while (true) {
- uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0;
+ while (node_id <= max_nodes) {
+ ss << __PRETTY_FUNCTION__ << " | node_id = " << std::to_string(node_id);
+ // std::cout << ss.str() << std::endl;
+ LOG_DEBUG(ss);
+ uint64_t gpu_id = 0, unique_id = 0, location_id = 0, domain = 0, render_d = 0;
 int ret_gpu_id = get_gpu_id(node_id, &gpu_id);
 int ret_unique_id = read_node_properties(node_id, "unique_id", &unique_id);
 int ret_loc_id =
 read_node_properties(node_id, "location_id", &location_id);
- int ret_domain =
- read_node_properties(node_id, "domain", &domain);
- if (ret_gpu_id == 0 &&
- ~(ret_unique_id != 0 || ret_loc_id != 0 || ret_unique_id != 0)) {
+ int ret_domain = read_node_properties(node_id, "domain", &domain);
+ int ret_renderd = read_node_properties(node_id, "drm_render_minor", &render_d);
+ bool isANode = (ret_gpu_id == 0 &&
+ (ret_domain == 0 && ret_loc_id == 0 && ret_renderd == 0));
+ ss << __PRETTY_FUNCTION__ << " | isAGpuNode: "
+ << (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): "
+ << (is_vm_guest() ? "TRUE" : "FALSE")
+ << "\nret_gpu_id: " << ret_gpu_id
+ << "; ret_domain: " << ret_domain
+ << "; ret_loc_id: " << ret_loc_id
+ << "; ret_unique_id: " << ret_unique_id
+ << "\nret_renderd: " << ret_renderd
+ << "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n"
+ << "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n"
+ << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n"
+ << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n"
+ << "; domain = " << print_unsigned_hex_and_int(domain) << "\n"
+ << "; drm_render_minor = " << print_unsigned_hex_and_int(render_d)
+ << "]\n";
+ LOG_DEBUG(ss);
+ if (isANode || (is_vm_guest() && ret_gpu_id == 0)) {  // VM guests: ret_gpu_id == 0 alone is enough
 // Do not try to build a node if one of these fields
 // do not exist in KFD (0 as values okay)
 systemNode myNode;
@@ -788,292 +963,66 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) {
 myNode.s_location_id = location_id;
 myNode.s_domain = domain & 0xFFFFFFFF;
 myNode.s_bdf = (myNode.s_domain << 32) | (myNode.s_location_id);
- myNode.s_location_id = myNode.s_bdf;
- myNode.s_bdf |= ((domain & 0xFFFFFFFF) << 32);
- myNode.s_location_id = myNode.s_bdf;
- myNode.s_domain = myNode.s_location_id >> 32;
 myNode.s_bus = ((myNode.s_location_id >> 8) & 0xFF);
 myNode.s_device = ((myNode.s_location_id >> 3) & 0x1F);
 myNode.s_function = myNode.s_location_id & 0x7;
 myNode.s_partition_id = ((myNode.s_location_id >> 28) & 0xF);
+ myNode.s_drm_render_minor = static_cast((ret_renderd == 0) ? render_d : 0);  // falls back to 0 if the drm_render_minor read returned non-zero
 if (gpu_id != 0) { // only add gpu nodes, 0 = CPU
- allSystemNodes.emplace(unique_id, myNode);
+ auto ret = gpuNodeIdsFound.insert(node_id);
+ if (ret.second != false) {
+ // log the node details only when node_id was newly inserted into the set
+ ss << __PRETTY_FUNCTION__ << " | isAGpuNode: "
+ << (isANode ? "TRUE" : "FALSE") << "; is_vm_guest(): "
+ << (is_vm_guest() ? "TRUE" : "FALSE")
+ << "\nret_gpu_id: " << ret_gpu_id
+ << "; ret_domain: " << ret_domain
+ << "; ret_loc_id: " << ret_loc_id
+ << "; ret_unique_id: " << ret_unique_id
+ << "\n[node_id = " << print_unsigned_hex_and_int(node_id) << "\n"
+ << "; gpu_id = " << print_unsigned_hex_and_int(gpu_id) << "\n"
+ << "; unique_id = " << print_unsigned_hex_and_int(unique_id) << "\n"
+ << "; location_id = " << print_unsigned_hex_and_int(location_id) << "\n"
+ << "; domain = " << print_unsigned_hex_and_int(domain) << "\n"
+ << "; bus = " << print_unsigned_hex_and_int(myNode.s_bus) << "\n"
+ << "; device = " << print_unsigned_hex_and_int(myNode.s_device) << "\n"
+ << "; function = " << print_unsigned_hex_and_int(myNode.s_function) << "\n"
+ << "; partition_id = " << print_unsigned_hex_and_int(myNode.s_partition_id) << "\n"
+ << "; bdf = " << print_unsigned_hex_and_int(myNode.s_bdf) << "\n"
+ << "; drm_render_minor = " << print_unsigned_hex_and_int(myNode.s_drm_render_minor)
+ << "]\n";
+ LOG_DEBUG(ss);
+ }
+ systemNodes.push_back(myNode);
 }
- } else {
- break;
 }
 node_id++;
 }
 ss << __PRETTY_FUNCTION__ << " | Ordered system nodes found = {";
- for (auto i : allSystemNodes) {
- ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
- << "; gpu_id = " << std::to_string(i.second.s_gpu_id)
- << "; unique_id = " << std::to_string(i.second.s_unique_id)
- << "; location_id = " << std::to_string(i.second.s_location_id)
- << "; bdf = " << print_int_as_hex(i.second.s_bdf)
- << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
- << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
- << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE)
- << "; function = " << std::to_string(i.second.s_function)
- << "; partition_id = " << std::to_string(i.second.s_partition_id)
- << "], ";
+
+ for (auto i : systemNodes) {
+ ss << "\n[node_id = " << std::to_string(i.s_node_id) << "\n"
+ << "; gpu_id = " << std::to_string(i.s_gpu_id) << "\n"
+ << "; unique_id = " << std::to_string(i.s_unique_id) << "\n"
+ << "; location_id = " << std::to_string(i.s_location_id) << "\n"
+ << "; bdf = " << print_int_as_hex(i.s_bdf) << "\n"
+ << "; domain = " << print_int_as_hex(i.s_domain, true, 2*BYTE) << "\n"
+ << "; bus = " << print_int_as_hex(i.s_bus, true, BYTE) << "\n"
+ << "; device = " << print_int_as_hex(i.s_device, true, BYTE) << "\n"
+ << "; function = " << std::to_string(i.s_function) << "\n"
+ << "; partition_id = " << std::to_string(i.s_partition_id) << "\n"
+ << "; drm_render_minor = " << std::to_string(i.s_drm_render_minor)
+ << "], \n";
+ rsmi_device_enumeration_t rsmi_device;
+ rsmi_device.dev_name = "";
+ rsmi_device.bdfid = i.s_bdf;
+ rsmi_device.drm_render_minor = i.s_drm_render_minor;
+ AddToDeviceList2(rsmi_device);
 }
 ss << "}";
+ // std::cout << ss.str() << std::endl;
 LOG_DEBUG(ss);
-
- uint32_t cardAdded = 0;
- // Discover all root cards & gpu partitions associated with each
- for (uint32_t cardId = 0; cardId <= max_cardId; cardId++) {
- std::string path = kPathDRMRoot;
- path += "/card";
- path += std::to_string(cardId);
- uint64_t primary_unique_id = 0;
- uint64_t device_uuid = 0;
- bool doesDeviceSupportPartitions = false;
- // get current partition
- int kSize = 256;
- char computePartition[kSize];
- std::string strCompPartition = "UNKNOWN";
- uint32_t numMonDevices = 0;
- rsmi_num_monitor_devices(&numMonDevices);
-
- // each identified gpu card node is a primary node for
- // potential matching unique ids
- if (isAMDGPU(path) ||
- (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) {
- std::string d_name = "card";
- d_name += std::to_string(cardId);
- uint32_t numMonDevices = 0;
- rsmi_num_monitor_devices(&numMonDevices);
- if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
- == RSMI_STATUS_SUCCESS) {
- strCompPartition = computePartition;
- doesDeviceSupportPartitions = true;
- }
- rsmi_status_t ret_unique_id =
- rsmi_dev_unique_id_get(cardAdded, &device_uuid);
- auto temp_numb_nodes = allSystemNodes.count(device_uuid);
- auto primaryBdfId =
- allSystemNodes.lower_bound(device_uuid)->second.s_location_id;
- auto i = allSystemNodes.lower_bound(device_uuid);
- if (doesDeviceSupportPartitions && temp_numb_nodes > 1
- && ret_unique_id == RSMI_STATUS_SUCCESS) {
- // helps identify xgmi nodes (secondary nodes) easier
- ss << __PRETTY_FUNCTION__ << " | secondary node add ; "
- << " BDF = " << std::to_string(primaryBdfId)
- << " (" << print_int_as_hex(primaryBdfId) << ")";
- LOG_DEBUG(ss);
- if (doesDeviceSupportPartitions && strCompPartition != "SPX"
- && i->second.s_partition_id == 0) {
- i->second.s_partition_id = i->second.s_function;
- ss << __PRETTY_FUNCTION__ << " | (secondary node add) fall back - "
- << "detected !SPX && partition_id == 0"
- << "; function = " << std::to_string(i->second.s_function)
- << "; partition_id = " << std::to_string(i->second.s_partition_id);
- LOG_DEBUG(ss);
- }
- ss << __PRETTY_FUNCTION__
- << " | (secondary node add) B4 AddToDeviceList() -->"
- << "\n[node_id = " << std::to_string(i->second.s_node_id)
- << "; gpu_id = " << std::to_string(i->second.s_gpu_id)
- << "; unique_id = " << std::to_string(i->second.s_unique_id)
- << "; location_id = " << std::to_string(i->second.s_location_id)
- << "; bdf = " << print_int_as_hex(i->second.s_bdf)
- << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
- << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
- << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
- << "; function = " << std::to_string(i->second.s_function)
- << "; partition_id = " << std::to_string(i->second.s_partition_id)
- << "], ";
- LOG_DEBUG(ss);
- AddToDeviceList(d_name, primaryBdfId);
- } else {
- ss << __PRETTY_FUNCTION__ << " | primary node add ; "
- << " BDF = " << std::to_string(UINT64_MAX);
- if (doesDeviceSupportPartitions && strCompPartition != "SPX"
- && i->second.s_partition_id == 0) {
- i->second.s_partition_id = i->second.s_function;
- ss << __PRETTY_FUNCTION__ << " | (primary node add) fall back - "
- << "detected !SPX && partition_id == 0"
- << "; function = " << std::to_string(i->second.s_function)
- << "; partition_id = " << std::to_string(i->second.s_partition_id);
- LOG_DEBUG(ss);
- }
- LOG_DEBUG(ss);
- ss << __PRETTY_FUNCTION__
- << " | (primary node add) After AddToDeviceList() -->"
- << "\n[node_id = " << std::to_string(i->second.s_node_id)
- << "; gpu_id = " << std::to_string(i->second.s_gpu_id)
- << "; unique_id = " << std::to_string(i->second.s_unique_id)
- << "; location_id = " << std::to_string(i->second.s_location_id)
- << "; bdf = " << print_int_as_hex(i->second.s_bdf)
- << "; domain = " << print_int_as_hex(i->second.s_domain, true, 2*BYTE)
- << "; bus = " << print_int_as_hex(i->second.s_bus, true, BYTE)
- << "; device = " << print_int_as_hex(i->second.s_device, true, BYTE)
- << "; function = " << std::to_string(i->second.s_function)
- << "; partition_id = " << std::to_string(i->second.s_partition_id)
- << "], ";
- LOG_DEBUG(ss);
- AddToDeviceList(d_name, UINT64_MAX);
- }
-
- ss << __PRETTY_FUNCTION__
- << " | Ordered system nodes seen in lookup = {";
- for (auto i : allSystemNodes) {
- ss << "\n[node_id = " << std::to_string(i.second.s_node_id)
- << "; gpu_id = " << std::to_string(i.second.s_gpu_id)
- << "; unique_id = " << std::to_string(i.second.s_unique_id)
- << "; location_id = " << std::to_string(i.second.s_location_id)
- << "; bdf = " << print_int_as_hex(i.second.s_bdf)
- << "; domain = " << print_int_as_hex(i.second.s_domain, true, 2*BYTE)
- << "; bus = " << print_int_as_hex(i.second.s_bus, true, BYTE)
- << "; device = " << print_int_as_hex(i.second.s_device, true, BYTE)
- << "; function = " << std::to_string(i.second.s_function)
- << "; partition_id = " << std::to_string(i.second.s_partition_id)
- << "], ";
- }
- ss << "}";
- LOG_DEBUG(ss);
-
- uint64_t temp_primary_unique_id = 0;
- uint64_t primary_location_id = 0;
- if (allSystemNodes.empty()) {
- cardAdded++;
- ss << __PRETTY_FUNCTION__
- << " | allSystemNodes.empty() = true, continue...";
- LOG_DEBUG(ss);
- continue;
- }
-
- // get current partition
- rsmi_num_monitor_devices(&numMonDevices);
- if (rsmi_dev_compute_partition_get(cardAdded, computePartition, kSize)
- == RSMI_STATUS_SUCCESS) {
- strCompPartition = computePartition;
- }
- if (rsmi_dev_unique_id_get(cardAdded, &device_uuid)
- != RSMI_STATUS_SUCCESS) {
- cardAdded++;
- allSystemNodes.erase(device_uuid);
- ss << __PRETTY_FUNCTION__
- << " | rsmi_dev_unique_id_get(cardId, &device_uuid)"
- << " was not successful, continue.. ";
- LOG_DEBUG(ss);
- continue;
- }
-
- temp_primary_unique_id =
- allSystemNodes.find(device_uuid)->second.s_unique_id;
- temp_numb_nodes = allSystemNodes.count(temp_primary_unique_id);
-
- ss << __PRETTY_FUNCTION__
- << " | device/node id (cardId) = " << std::to_string(cardId)
- << " | card id (cardAdded) = " << std::to_string(cardAdded)
- << " | numMonDevices = " << std::to_string(numMonDevices)
- << " | compute partition = " << strCompPartition
- << " | temp_primary_unique_id = "
- << std::to_string(temp_primary_unique_id)
- << " | Num of nodes matching temp_primary_unique_id = "
- << temp_numb_nodes
- << " | device_uuid (hex/uint) = "
- << print_unsigned_hex_and_int(device_uuid)
- << " | device_uuid (uint64_t) = " << device_uuid;
- LOG_DEBUG(ss);
-
- if (temp_primary_unique_id != 0) {
- primary_unique_id = temp_primary_unique_id;
- } else {
- cardAdded++;
- // remove already added nodes associated with current card
- auto erasedNodes = allSystemNodes.erase(0);
- continue;
- }
-
- auto numb_nodes = allSystemNodes.count(primary_unique_id);
- ss << __PRETTY_FUNCTION__ << " | REFRESH - primary_unique_id = "
- << std::to_string(primary_unique_id) << " has "
- << std::to_string(numb_nodes) << " known gpu nodes";
- LOG_DEBUG(ss);
- while (numb_nodes > 1) {
- std::string secNode = "card";
- secNode += std::to_string(cardId); // maps the primary node card to
- // secondary - allows get/sets
- auto it = allSystemNodes.lower_bound(device_uuid);
- auto it_end = allSystemNodes.upper_bound(device_uuid);
- if (numb_nodes == temp_numb_nodes) {
- auto removalNodeId = it->second.s_node_id;
- auto removalGpuId = it->second.s_gpu_id;
- auto removalUniqueId = it->second.s_unique_id;
- auto removalLocId = it->second.s_location_id;
- auto removaldomain = it->second.s_domain;
- auto nodesErased = 1;
- primary_location_id = removalLocId;
- allSystemNodes.erase(it++);
- ss << __PRETTY_FUNCTION__
- << "\nPRIMARY --> num_nodes == temp_numb_nodes; ERASING "
- << std::to_string(nodesErased) << " node -> [node_id = "
- << std::to_string(removalNodeId)
- << "; gpu_id = " << std::to_string(removalGpuId)
- << "; unique_id = " << std::to_string(removalUniqueId)
- << "; location_id = " << std::to_string(removalLocId)
- << "; removaldomain = " << std::to_string(removaldomain)
- << "]";
- LOG_DEBUG(ss);
- }
- if (it == it_end) {
- break;
- }
- auto myBdfId = it->second.s_location_id;
- ss << __PRETTY_FUNCTION__ << " | secondary node add #2; "
- << " BDF = " << std::to_string(myBdfId)
- << " (" << print_int_as_hex(myBdfId) << ")";
- LOG_DEBUG(ss);
- if (doesDeviceSupportPartitions && strCompPartition != "SPX"
- && it->second.s_partition_id == 0) {
- it->second.s_partition_id = it->second.s_function;
- ss << __PRETTY_FUNCTION__ << " | (secondary node add #2) fall back - "
- << "detected !SPX && partition_id == 0"
- << "; function = " << std::to_string(it->second.s_function)
- << "; partition_id = " << std::to_string(it->second.s_partition_id);
- LOG_DEBUG(ss);
- }
- ss << __PRETTY_FUNCTION__
- << " | (secondary node add #2) B4 AddToDeviceList() -->"
- << "\n[node_id = " << std::to_string(it->second.s_node_id)
- << "; gpu_id = " << std::to_string(it->second.s_gpu_id)
- << "; unique_id = " << std::to_string(it->second.s_unique_id)
- << "; location_id = " << std::to_string(it->second.s_location_id)
- << "; bdf = " << print_int_as_hex(it->second.s_bdf)
- << "; domain = " << print_int_as_hex(it->second.s_domain, true, 2*BYTE)
- << "; bus = " << print_int_as_hex(it->second.s_bus, true, BYTE)
- << "; device = " << print_int_as_hex(it->second.s_device, true, BYTE)
- << "; function = " << std::to_string(it->second.s_function)
- << "; partition_id = " << std::to_string(it->second.s_partition_id)
- << "], ";
- LOG_DEBUG(ss);
- AddToDeviceList(secNode, myBdfId);
- allSystemNodes.erase(it++);
- numb_nodes--;
- cardAdded++;
- }
- // remove any remaining nodes associated with current card
- auto erasedNodes = allSystemNodes.erase(primary_unique_id);
- ss << __PRETTY_FUNCTION__ << " | After finding primary_unique_id = "
- << std::to_string(primary_unique_id) << " erased "
- << std::to_string(erasedNodes) << " nodes";
- LOG_DEBUG(ss);
- cardAdded++;
- }
- }
-
- if (closedir(drm_dir)) {
- err_msg = "Failed to close drm root directory ";
- err_msg += kPathDRMRoot;
- err_msg += ".";
- perror(err_msg.c_str());
- return 1;
- }
 return 0;
 }