2
0

Merge amd-staging into amd-master 20231116

This merge skips Ibdaf031be9d916020b4049544dbd725858c7711d as that
change introduces a bug in gpu-metrics

Change-Id: Ied8447affd5ed3c847734d75517b04c073dc44b4
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
Este cometimento está contido em:
Galantsev, Dmitrii
2023-11-16 11:17:50 -06:00
ascendente 8aa036ae08 e5627d2bf1
cometimento 9d456edcd6
6 ficheiros modificados com 77 adições e 21 eliminações
+2 -2
Ver ficheiro
@@ -35,7 +35,7 @@ find_program (GIT NAMES git)
## Setup the package version based on git tags.
set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver")
get_package_version_number("6.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
get_package_version_number("6.1.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
message("Package version: ${PKG_VERSION_STR}")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${VERSION_MAJOR}")
set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${VERSION_MINOR}")
@@ -72,7 +72,7 @@ endif()
## Compiler flags
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti -std=c++17")
"${CMAKE_CXX_FLAGS} -Wall -Wextra -fno-rtti")
if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64")
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -m64 -msse -msse2")
+29 -12
Ver ficheiro
@@ -363,16 +363,16 @@ typedef rsmi_clk_type_t rsmi_clk_type;
*/
typedef enum {
RSMI_COMPUTE_PARTITION_INVALID = 0,
RSMI_COMPUTE_PARTITION_CPX = 1, //!< Core mode (CPX)- Per-chip XCC with
//!< shared memory
RSMI_COMPUTE_PARTITION_SPX = 2, //!< Single GPU mode (SPX)- All XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_DPX = 3, //!< Dual GPU mode (DPX)- Half XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_TPX = 4, //!< Triple GPU mode (TPX)- One-third XCCs
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_QPX = 5, //!< Quad GPU mode (QPX)- Quarter XCCs
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_CPX, //!< Core mode (CPX)- Per-chip XCC with
//!< shared memory
RSMI_COMPUTE_PARTITION_SPX, //!< Single GPU mode (SPX)- All XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_DPX, //!< Dual GPU mode (DPX)- Half XCCs work
//!< together with shared memory
RSMI_COMPUTE_PARTITION_TPX, //!< Triple GPU mode (TPX)- One-third XCCs
//!< work together with shared memory
RSMI_COMPUTE_PARTITION_QPX, //!< Quad GPU mode (QPX)- Quarter XCCs
//!< work together with shared memory
} rsmi_compute_partition_type_t;
/// \cond Ignore in docs.
typedef rsmi_compute_partition_type_t rsmi_compute_partition_type;
@@ -680,8 +680,8 @@ typedef enum {
*/
typedef enum _RSMI_IO_LINK_TYPE {
RSMI_IOLINK_TYPE_UNDEFINED = 0, //!< unknown type.
RSMI_IOLINK_TYPE_PCIEXPRESS = 1, //!< PCI Express
RSMI_IOLINK_TYPE_XGMI = 2, //!< XGMI
RSMI_IOLINK_TYPE_PCIEXPRESS, //!< PCI Express
RSMI_IOLINK_TYPE_XGMI, //!< XGMI
RSMI_IOLINK_TYPE_NUMIOLINKTYPES, //!< Number of IO Link types
RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF //!< Max of IO Link types
} RSMI_IO_LINK_TYPE;
@@ -1503,6 +1503,23 @@ rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id);
*/
rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *id);
/**
* @brief Get the XGMI physical id associated with the device
*
* @details Given a device index @p dv_ind and a pointer to a uint16_t @p id,
* this function will write the XGMI physical id to the value pointed to by
* @p id.
*
* @param[in] dv_ind a device index
*
* @param[inout] id a pointer to uint16_t to which the XGMI physical id
* will be written
*
* @retval ::RSMI_STATUS_SUCCESS is returned upon successful call.
*
*/
rsmi_status_t rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id);
/** @} */ // end of IDQuer
/*****************************************************************************/
+1
Ver ficheiro
@@ -103,6 +103,7 @@ enum DevInfoTypes {
kDevOverDriveLevel,
kDevMemOverDriveLevel,
kDevDevID,
kDevXGMIPhysicalID,
kDevDevRevID,
kDevDevProdName,
kDevDevProdNum,
+15 -3
Ver ficheiro
@@ -836,6 +836,21 @@ rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) {
return ret;
}
// Retrieves the XGMI physical id for device index dv_ind by calling get_id()
// with amd::smi::kDevXGMIPhysicalID, passing id as the output pointer.
// A trace log entry is emitted on entry and again on exit; the exit entry
// includes the status string for the code being returned.
// Returns the status produced by get_id() (unless a macro below returns
// earlier -- see the review note).
rsmi_status_t
rsmi_dev_xgmi_physical_id_get(uint32_t dv_ind, uint16_t *id) {
std::ostringstream ss;
rsmi_status_t ret;
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
// NOTE(review): CHK_SUPPORT_NAME_ONLY is a macro defined elsewhere; presumably
// it validates id and may return early -- confirm against its definition.
CHK_SUPPORT_NAME_ONLY(id)
ret = get_id(dv_ind, amd::smi::kDevXGMIPhysicalID, id);
// NOTE(review): ss is reused for the exit message; whether LOG_TRACE cleared
// the earlier "start" text is not visible from this block -- confirm.
ss << __PRETTY_FUNCTION__ << " | ======= end ======="
<< ", reporting " << amd::smi::getRSMIStatusString(ret);
LOG_TRACE(ss);
return ret;
}
rsmi_status_t
rsmi_dev_revision_get(uint32_t dv_ind, uint16_t *revision) {
std::ostringstream outss;
@@ -2776,9 +2791,6 @@ rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) {
ss << __PRETTY_FUNCTION__ << "| ======= start =======";
LOG_TRACE(ss);
DEVICE_MUTEX
if (odv == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
CHK_SUPPORT_NAME_ONLY(odv)
rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv);
+5
Ver ficheiro
@@ -82,6 +82,7 @@ static const char *kDevPerfLevelFName = "power_dpm_force_performance_level";
static const char *kDevDevProdNameFName = "product_name";
static const char *kDevDevProdNumFName = "product_number";
static const char *kDevDevIDFName = "device";
// Attribute name string paired with kDevXGMIPhysicalID in kDevAttribNameMap.
static const char *kDevXGMIPhysicalIDFName = "xgmi_physical_id";
static const char *kDevDevRevIDFName = "revision";
static const char *kDevVendorIDFName = "vendor";
static const char *kDevSubSysDevIDFName = "subsystem_device";
@@ -238,6 +239,7 @@ static const std::map<DevInfoTypes, const char *> kDevAttribNameMap = {
{kDevDevProdName, kDevDevProdNameFName},
{kDevDevProdNum, kDevDevProdNumFName},
{kDevDevID, kDevDevIDFName},
{kDevXGMIPhysicalID, kDevXGMIPhysicalIDFName},
{kDevDevRevID, kDevDevRevIDFName},
{kDevVendorID, kDevVendorIDFName},
{kDevSubSysDevID, kDevSubSysDevIDFName},
@@ -379,6 +381,7 @@ static const std::map<const char *, dev_depends_t> kDevFuncDependsMap = {
// Functions with only mandatory dependencies
{"rsmi_dev_vram_vendor_get", {{kDevVramVendorFName}, {}}},
{"rsmi_dev_id_get", {{kDevDevIDFName}, {}}},
{"rsmi_dev_xgmi_physical_id_get", {{kDevXGMIPhysicalIDFName}, {}}},
{"rsmi_dev_revision_get", {{kDevDevRevIDFName}, {}}},
{"rsmi_dev_vendor_id_get", {{kDevVendorIDFName}, {}}},
{"rsmi_dev_name_get", {{kDevVendorIDFName,
@@ -956,6 +959,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) {
case kDevSubSysVendorID:
case kDevVendorID:
case kDevErrCntFeatures:
case kDevXGMIPhysicalID:
ret = readDevInfoStr(type, &tempStr);
RET_IF_NONZERO(ret);
@@ -1102,6 +1106,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) {
case kDevComputePartition:
case kDevMemoryPartition:
case kDevNumaNode:
case kDevXGMIPhysicalID:
return readDevInfoStr(type, val);
break;
+25 -4
Ver ficheiro
@@ -53,6 +53,7 @@
#include <functional>
#include <iostream>
#include <memory>
#include <algorithm>
#include <set>
#include <sstream>
#include <string>
@@ -85,6 +86,7 @@ amd::smi::RocmSMI::devInfoTypesStrings = {
{amd::smi::kDevOverDriveLevel, amdSMI + "kDevOverDriveLevel"},
{amd::smi::kDevMemOverDriveLevel, amdSMI + "kDevMemOverDriveLevel"},
{amd::smi::kDevDevID, amdSMI + "kDevDevID"},
{amd::smi::kDevXGMIPhysicalID, amdSMI + "kDevXGMIPhysicalID"},
{amd::smi::kDevDevRevID, amdSMI + "kDevDevRevID"},
{amd::smi::kDevDevProdName, amdSMI + "kDevDevProdName"},
{amd::smi::kDevDevProdNum, amdSMI + "kDevDevProdNum"},
@@ -383,9 +385,28 @@ RocmSMI::Initialize(uint64_t flags) {
<< "\n | final update: device->bdfid() holds correct device bdf";
LOG_TRACE(ss);
}
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library (amdgpu node discovery).");
std::shared_ptr<amd::smi::Device> dev;
// Sort index based on the BDF, collect BDF id firstly.
std::vector<std::pair<uint64_t, std::shared_ptr<amd::smi::Device>>> dv_to_id;
dv_to_id.reserve(devices_.size());
for (uint32_t dv_ind = 0; dv_ind < devices_.size(); ++dv_ind) {
dev = devices_[dv_ind];
uint64_t bdfid = dev->bdfid();
dv_to_id.push_back({bdfid, dev});
}
ss << __PRETTY_FUNCTION__ << " Sort index based on BDF.";
LOG_DEBUG(ss);
// Stable sort to keep the order if bdf is equal.
std::stable_sort(dv_to_id.begin(), dv_to_id.end(), []
(const std::pair<uint64_t, std::shared_ptr<amd::smi::Device>>& p1,
const std::pair<uint64_t, std::shared_ptr<amd::smi::Device>>& p2) {
return p1.first < p2.first;
});
devices_.clear();
for (uint32_t dv_ind = 0; dv_ind < dv_to_id.size(); ++dv_ind) {
devices_.push_back(dv_to_id[dv_ind].second);
}
std::map<uint64_t, std::shared_ptr<KFDNode>> tmp_map;
@@ -406,7 +427,6 @@ RocmSMI::Initialize(uint64_t flags) {
for (it = io_link_map_tmp.begin(); it != io_link_map_tmp.end(); it++)
io_link_map_[it->first] = it->second;
std::shared_ptr<amd::smi::Device> dev;
// Remove any drm nodes that don't have a corresponding readable kfd node.
// kfd nodes will not be added if their properties file is not readable.
@@ -451,6 +471,7 @@ RocmSMI::Initialize(uint64_t flags) {
if (ROCmLogging::Logger::getInstance()->isLoggerEnabled()) {
logSystemDetails();
}
// Leaving below to help debug temp file issues
// displayAppTmpFilesContent();
std::string amdGPUDeviceList = displayAllDevicePaths(devices_);