[SWDEV-518325/SWDEV-518320/SWDEV-443309] Fix Partition Enumeration

* Changes:
  - Updates to DRM renderD* / card* pathing for partition devices
  - Now use KFD to discover AMD devices and populate accordingly
    Device MUST have an accessible KFD node (via cgroups)
  - Updated several ROCm SMI CLI outputs to handle SYSFS files
    which are not accessible on partition nodes
  - Added a new method to help get card/drm info
    (rsmi_dev_device_identifiers_get) from ROCm SMI

Change-Id: If844f27ffc595942272abe9c8167ed90a0b0e225
Signed-off-by: Charis Poag <Charis.Poag@amd.com>
Этот коммит содержится в:
Charis Poag
2025-04-13 22:38:31 -05:00
коммит произвёл Arif, Maisam
родитель 2630bf0a8c
Коммит a0df877fdf
9 изменённых файлов: 554 добавлений и 368 удалений
+57
Просмотреть файл
@@ -1297,6 +1297,34 @@ typedef union id {
};
} rsmi_func_id_value_t;
/**
* @struct rsmi_device_identifiers_t
* @brief Structure to hold various identifiers for a GPU device.
*
* @details This structure contains fields that uniquely identify a GPU device,
* including its card index, DRM render minor, PCI Bus/Device/Function ID (BDFID),
* KFD GPU ID, partition ID, and SMI device ID.
*/
typedef struct {
//! The card index of the device.
uint32_t card_index;
//! The DRM render minor number of the device.
uint32_t drm_render_minor;
//! The PCI Bus/Device/Function identifier (BDFID) of the device.
uint64_t bdfid;
//! The KFD (Kernel Fusion Driver) GPU ID of the device.
uint64_t kfd_gpu_id;
//! The partition ID of the device.
uint32_t partition_id;
//! The SMI (System Management Interface) device ID.
uint32_t smi_device_id;
//! Reserved space (10 x uint32_t); presumably held for future extensions -- confirm.
uint32_t reserved[10];
} rsmi_device_identifiers_t;
/*****************************************************************************/
/** @defgroup InitShutAdmin Initialization and Shutdown
@@ -1824,6 +1852,35 @@ rsmi_status_t rsmi_dev_guid_get(uint32_t dv_ind, uint64_t *guid);
*/
rsmi_status_t rsmi_dev_node_id_get(uint32_t dv_ind, uint32_t *node_id);
/**
* @brief Retrieves the device identifiers for a specific GPU device.
*
* @details This function retrieves various identifiers for a GPU device, such as
* the card index, DRM render minor, BDFID, KFD GPU ID, partition ID, and SMI device ID.
* The identifiers are written to the provided `rsmi_device_identifiers_t` structure.
*
* @param[in] dv_ind The index of the device whose identifiers are requested.
*
* @param[out] identifiers A pointer to a structure of type `rsmi_device_identifiers_t`
* where the device identifiers will be stored. The structure
* contains fields such as:
* - `card_index`: The card index of the device.
* - `drm_render_minor`: The DRM render minor number.
* - `bdfid`: The Bus/Device/Function PCI identifier.
* - `kfd_gpu_id`: The KFD GPU ID.
* - `partition_id`: The partition ID of the device.
* - `smi_device_id`: The SMI device ID.
*
* @retval ::RSMI_STATUS_SUCCESS The call was successful, and the device
* identifiers were retrieved.
* @retval ::RSMI_STATUS_NOT_SUPPORTED The installed software or hardware does
* not support this function with the given arguments.
* @retval ::RSMI_STATUS_INVALID_ARGS The provided arguments are invalid.
*
* @note Ensure that the `identifiers` pointer is valid and points to a properly
* allocated structure before calling this function.
*/
rsmi_status_t rsmi_dev_device_identifiers_get(uint32_t dv_ind,
rsmi_device_identifiers_t *identifiers);
/** @} */ // end of IDQuer
+2
Просмотреть файл
@@ -248,6 +248,8 @@ class Device {
// Stores the given SMI device ID in m_device_id.
void set_smi_device_id(uint32_t i) {
  m_device_id = i;
}
// Stores the given SMI partition ID in m_partition_id.
void set_smi_partition_id(uint32_t i) {
  m_partition_id = i;
}
// Returns a C string for the given DevInfoTypes value. Static, so it can be
// called without a Device instance. Declaration only; the definition (and the
// result for unrecognized values) is not visible here.
static const char* get_type_string(DevInfoTypes type);
// Declaration only; the definition is not visible here.
// NOTE(review): judging by the signature, writes the identifiers for
// `device_id` through `device_identifiers` and reports the outcome as an
// rsmi_status_t -- confirm against the implementation.
rsmi_status_t get_smi_device_identifiers(uint32_t device_id,
rsmi_device_identifiers_t *device_identifiers);
private:
std::shared_ptr<Monitor> monitor_;
+9
Просмотреть файл
@@ -131,6 +131,15 @@ class RocmSMI {
io_link_map_;
std::map<uint32_t, uint32_t> dev_ind_to_node_ind_map_;
// Declaration only; the definition is not visible here. `bdfid` is 0 when the
// caller omits it.
// NOTE(review): judging by the name, registers `dev_name` in the device list --
// confirm against the implementation.
void AddToDeviceList(std::string dev_name, uint64_t bdfid = 0);
// Per-device record with the card index, device name, DRM render/card paths,
// DRM render minor and BDFID of one device.
//
// Numeric fields default to the maximum value of their type and the strings
// default to empty (presumably meaning "not yet discovered" -- confirm against
// the code that fills the record in).
//
// Declared as a named struct rather than `typedef struct { ... } name;`: an
// unnamed class that gets its name from a typedef may not have default member
// initializers (C++20 [dcl.typedef]; clang -Wnon-c-typedef-for-linkage).
struct rsmi_device_enumeration_t {
  uint32_t card_index = std::numeric_limits<uint32_t>::max();
  std::string dev_name;         // default-constructed: empty
  std::string drm_render_path;  // default-constructed: empty
  std::string drm_card_path;    // default-constructed: empty
  uint32_t drm_render_minor = std::numeric_limits<uint32_t>::max();
  uint64_t bdfid = std::numeric_limits<uint64_t>::max();
};
// Declaration only; the definition is not visible here. Takes an
// rsmi_device_enumeration_t by value and returns an rsmi_status_t.
// NOTE(review): presumably the status-returning counterpart of
// AddToDeviceList -- confirm against the implementation.
rsmi_status_t AddToDeviceList2(rsmi_device_enumeration_t device);
// Declaration only; takes no arguments and returns nothing.
// NOTE(review): judging by the name, reads settings from environment
// variables -- which ones, and where they are stored, is not visible here.
void GetEnvVariables(void);
// Declaration only. Returns a std::shared_ptr<Monitor> for the given
// `monitor_path` (taken by value).
// NOTE(review): lookup rules and the result when no monitor matches are not
// visible here -- confirm against the implementation.
std::shared_ptr<Monitor> FindMonitor(std::string monitor_path);