From 9021ef96dc92b70eacce0f660740fa57c8793dd7 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 28 Aug 2023 08:21:20 -0500 Subject: [PATCH] Support PCIe vendor name Add the support for PCIe vendor name. Change-Id: Ibc1d289a08731e4c5a14f992f3b0d31b51482396 --- include/amd_smi/amdsmi.h | 7 +-- py-interface/amdsmi_wrapper.py | 5 +- rocm_smi/example/rocm_smi_example.cc | 5 ++ rocm_smi/include/rocm_smi/rocm_smi.h | 37 ++++++++++++++ rocm_smi/include/rocm_smi/rocm_smi_device.h | 7 ++- rocm_smi/include/rocm_smi/rocm_smi_utils.h | 1 + rocm_smi/src/rocm_smi.cc | 51 +++++++++++++++++++ rocm_smi/src/rocm_smi_device.cc | 22 +++++++- rocm_smi/src/rocm_smi_main.cc | 3 +- rocm_smi/src/rocm_smi_utils.cc | 24 +++++++++ src/amd_smi/amd_smi.cc | 6 ++- .../amd_smi_test/functional/sys_info_read.cc | 9 ++-- 12 files changed, 161 insertions(+), 16 deletions(-) diff --git a/include/amd_smi/amdsmi.h b/include/amd_smi/amdsmi.h index 8ef2bb8a52..8506c86696 100644 --- a/include/amd_smi/amdsmi.h +++ b/include/amd_smi/amdsmi.h @@ -391,11 +391,12 @@ typedef struct { typedef struct { char market_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t vendor_id; //< Use 32 bit to be compatible with other platform. + char vendor_name[AMDSMI_MAX_STRING_LENGTH]; uint32_t subvendor_id; //< The subsystem vendor id uint64_t device_id; //< The device id of a GPU uint32_t rev_id; char asic_serial[AMDSMI_NORMAL_STRING_LENGTH]; - uint32_t reserved[3]; + uint32_t reserved[19]; } amdsmi_asic_info_t; typedef struct { @@ -1650,7 +1651,7 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id(amdsmi_processor_handle processor_handle, /** * @brief Get the NUMA node associated with a device * - * @details Given a processor handle @p processor_handle and a pointer to a uint32_t @p + * @details Given a processor handle @p processor_handle and a pointer to an int32_t @p * numa_node, this function will retrieve the NUMA node value associated * with device @p processor_handle and store the value at location pointed to by * @p numa_node. 
@@ -1666,7 +1667,7 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id(amdsmi_processor_handle processor_handle, * * @return ::amdsmi_status_t | ::AMDSMI_STATUS_SUCCESS on success, non-zero on fail */ -amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(amdsmi_processor_handle processor_handle, uint32_t *numa_node); +amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(amdsmi_processor_handle processor_handle, int32_t *numa_node); /** * @brief Get PCIe traffic information. It is not supported on virtual machine guest diff --git a/py-interface/amdsmi_wrapper.py b/py-interface/amdsmi_wrapper.py index 9e69a70f80..6f00998c6c 100644 --- a/py-interface/amdsmi_wrapper.py +++ b/py-interface/amdsmi_wrapper.py @@ -667,11 +667,12 @@ struct_c__SA_amdsmi_asic_info_t._pack_ = 1 # source:False struct_c__SA_amdsmi_asic_info_t._fields_ = [ ('market_name', ctypes.c_char * 64), ('vendor_id', ctypes.c_uint32), + ('vendor_name', ctypes.c_char * 64), ('subvendor_id', ctypes.c_uint32), ('device_id', ctypes.c_uint64), ('rev_id', ctypes.c_uint32), ('asic_serial', ctypes.c_char * 32), - ('reserved', ctypes.c_uint32 * 3), + ('reserved', ctypes.c_uint32 * 19), ] amdsmi_asic_info_t = struct_c__SA_amdsmi_asic_info_t @@ -1434,7 +1435,7 @@ amdsmi_get_gpu_bdf_id.restype = amdsmi_status_t amdsmi_get_gpu_bdf_id.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint64)] amdsmi_get_gpu_topo_numa_affinity = _libraries['libamd_smi.so'].amdsmi_get_gpu_topo_numa_affinity amdsmi_get_gpu_topo_numa_affinity.restype = amdsmi_status_t -amdsmi_get_gpu_topo_numa_affinity.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint32)] +amdsmi_get_gpu_topo_numa_affinity.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_int32)] amdsmi_get_gpu_pci_throughput = _libraries['libamd_smi.so'].amdsmi_get_gpu_pci_throughput amdsmi_get_gpu_pci_throughput.restype = amdsmi_status_t amdsmi_get_gpu_pci_throughput.argtypes = [amdsmi_processor_handle, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), 
ctypes.POINTER(ctypes.c_uint64)] diff --git a/rocm_smi/example/rocm_smi_example.cc b/rocm_smi/example/rocm_smi_example.cc index 0e78debb91..9d1aea0d90 100755 --- a/rocm_smi/example/rocm_smi_example.cc +++ b/rocm_smi/example/rocm_smi_example.cc @@ -722,6 +722,11 @@ int main() { CHK_RSMI_RET_I(ret) std::cout << "\t**Dev.Rev.ID: 0x" << std::hex << val_ui16 << std::endl; + char pcie_vendor_name[256]; + ret = rsmi_dev_pcie_vendor_name_get(i, pcie_vendor_name, 256); + CHK_RSMI_RET_I(ret) + std::cout << "\t**PCIe vendor name: " << pcie_vendor_name << std::endl; + char current_compute_partition[256]; current_compute_partition[0] = '\0'; ret = rsmi_dev_compute_partition_get(i, current_compute_partition, 256); diff --git a/rocm_smi/include/rocm_smi/rocm_smi.h b/rocm_smi/include/rocm_smi/rocm_smi.h index 29103219de..1d95ed00ac 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi.h +++ b/rocm_smi/include/rocm_smi/rocm_smi.h @@ -1155,6 +1155,43 @@ rsmi_status_t rsmi_dev_sku_get(uint32_t dv_ind, uint16_t *sku); */ rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id); +/** + * @brief Get the name string for a given PCIe vendor ID + * + * @details Given a device index @p dv_ind, a pointer to a caller provided + * char buffer @p name, and a length of this buffer @p len, this function will + * write the name of the PCIe vendor (up to @p len characters) to buffer @p name. + * + * If the integer ID associated with the PCIe vendor is not found in one of the + * system files containing device name information (e.g. + * /usr/share/misc/pci.ids), then this function will return RSMI_STATUS_NOT_FOUND. + * Updating the system name files can be accomplished with + * "sudo update-pciids". 
+ * + * @param[in] dv_ind a device index + * + * @param[inout] name a pointer to a caller provided char buffer to which the + * name will be written. + * If this parameter is nullptr, this function will return + * ::RSMI_STATUS_INVALID_ARGS if the function is supported with the provided + * arguments and ::RSMI_STATUS_NOT_SUPPORTED if it is not supported with the + * provided arguments. + * + * @param[in] len the length of the caller provided buffer @p name. + * + * @retval ::RSMI_STATUS_SUCCESS call was successful + * @retval ::RSMI_STATUS_NOT_SUPPORTED installed software or hardware does not + * support this function with the given arguments + * @retval ::RSMI_STATUS_INVALID_ARGS the provided arguments are not valid + * @retval ::RSMI_STATUS_NOT_FOUND the vendor name was not found + * @retval ::RSMI_STATUS_INSUFFICIENT_SIZE is returned if @p len bytes is not + * large enough to hold the entire name. In this case, only @p len bytes will + * be written. + * + */ +rsmi_status_t rsmi_dev_pcie_vendor_name_get(uint32_t dv_ind, char *name, + size_t len); + /** * @brief Get the name string of a gpu device. 
* diff --git a/rocm_smi/include/rocm_smi/rocm_smi_device.h b/rocm_smi/include/rocm_smi/rocm_smi_device.h index a1b2809457..43c1728809 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_device.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_device.h @@ -167,7 +167,12 @@ enum DevInfoTypes { kDevGpuReset, kDevAvailableComputePartition, kDevComputePartition, - kDevMemoryPartition + kDevMemoryPartition, + + // The information read from pci core sysfs + kDevPCieTypeStart = 1000, + kDevPCieVendorID = kDevPCieTypeStart, + kDevPCieTypeEND = 2000, }; typedef struct { diff --git a/rocm_smi/include/rocm_smi/rocm_smi_utils.h b/rocm_smi/include/rocm_smi/rocm_smi_utils.h index 697fcb3723..04e49b1d41 100755 --- a/rocm_smi/include/rocm_smi/rocm_smi_utils.h +++ b/rocm_smi/include/rocm_smi/rocm_smi_utils.h @@ -103,6 +103,7 @@ std::tuple kDevAttribNameMap = { {kDevDevID, kDevDevIDFName}, {kDevDevRevID, kDevDevRevIDFName}, {kDevVendorID, kDevVendorIDFName}, + {kDevPCieVendorID, kDevPCieVendorIDFName}, {kDevSubSysDevID, kDevSubSysDevIDFName}, {kDevSubSysVendorID, kDevSubSysVendorIDFName}, {kDevGPUMClk, kDevGPUMClkFName}, @@ -589,6 +593,20 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { sysfs_path += "/device/"; sysfs_path += kDevAttribNameMap.at(type); + // For the file under PCI sysfs + if (type >= kDevPCieTypeStart && type <= kDevPCieTypeEND) { + sysfs_path = "/sys/bus/pci/devices/"; + std::string bdf_str; + if (getBDFWithDomain(bdfid_, bdf_str) != RSMI_STATUS_SUCCESS) { + ss << "Fail to craft the bdf string"; + LOG_ERROR(ss); + return 1; + } + sysfs_path += bdf_str; + sysfs_path += "/"; + sysfs_path += kDevAttribNameMap.at(type); + } + DBG_FILE_ERROR(sysfs_path, str); bool reg_file; @@ -611,7 +629,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { fs->open(sysfs_path); - if (!fs->is_open()) { + if (!fs->is_open() || fs->fail()) { ss << "Could not open - SYSFS file (" << sysfs_path << ") for " << "DevInfoInfoType (" << 
RocmSMI::devInfoTypesStrings.at(type) << "), " << ", returning " << std::to_string(errno) << " (" @@ -901,6 +919,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: + case kDevPCieVendorID: case kDevErrCntFeatures: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); @@ -1038,6 +1057,7 @@ int Device::readDevInfo(DevInfoTypes type, std::string *val) { case kDevSubSysDevID: case kDevSubSysVendorID: case kDevVendorID: + case kDevPCieVendorID: case kDevVramVendor: case kDevVBiosVer: case kDevPCIEThruPut: diff --git a/rocm_smi/src/rocm_smi_main.cc b/rocm_smi/src/rocm_smi_main.cc index 8cb95fe7f2..49dec9332d 100755 --- a/rocm_smi/src/rocm_smi_main.cc +++ b/rocm_smi/src/rocm_smi_main.cc @@ -150,7 +150,8 @@ amd::smi::RocmSMI::devInfoTypesStrings = { {amd::smi::kDevAvailableComputePartition, amdSMI + "kDevAvailableComputePartition"}, {amd::smi::kDevComputePartition, amdSMI + "kDevComputePartition"}, - {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"} + {amd::smi::kDevMemoryPartition, amdSMI + "kDevMemoryPartition"}, + {amd::smi::kDevPCieVendorID, amdSMI + "kDevPCieVendorID"}, }; namespace amd { diff --git a/rocm_smi/src/rocm_smi_utils.cc b/rocm_smi/src/rocm_smi_utils.cc index 7e90d29209..582ed39703 100755 --- a/rocm_smi/src/rocm_smi_utils.cc +++ b/rocm_smi/src/rocm_smi_utils.cc @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -792,6 +793,29 @@ bool isSystemBigEndian() { return isBigEndian; } +rsmi_status_t getBDFWithDomain(uint64_t bdf_id, std::string& bfd_str) +{ + auto result = rsmi_status_t::RSMI_STATUS_SUCCESS; + auto domain_id = static_cast(bdf_id >> 32); + auto bus_id = static_cast((bdf_id & 0x0000FF00) >> 8); + auto dev_id = static_cast((bdf_id & 0x000000F8) >> 3); + auto func_id = static_cast(bdf_id & 0x00000007); + + bfd_str = std::string(); + if (!(bus_id > 0)) { + result = rsmi_status_t::RSMI_STATUS_NO_DATA; + return result; + } + 
+ std::stringstream bdf_sstream; + bdf_sstream << std::hex << std::setfill('0') << std::setw(4) << +domain_id << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << +bus_id << ":"; + bdf_sstream << std::hex << std::setfill('0') << std::setw(2) << +dev_id << "."; + bdf_sstream << std::hex << std::setfill('0') << +func_id; + bfd_str = bdf_sstream.str(); + return result; +} + rsmi_status_t getBDFString(uint64_t bdf_id, std::string& bfd_str) { auto result = rsmi_status_t::RSMI_STATUS_SUCCESS; diff --git a/src/amd_smi/amd_smi.cc b/src/amd_smi/amd_smi.cc index fa2e3edb29..608877e693 100644 --- a/src/amd_smi/amd_smi.cc +++ b/src/amd_smi/amd_smi.cc @@ -671,7 +671,6 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i info->rev_id = dev_info.pci_rev; info->vendor_id = gpu_device->get_vendor_id(); } - // For other sysfs related information, get from rocm-smi else { uint64_t dv_uid = 0; status = rsmi_wrapper(rsmi_dev_unique_id_get, processor_handle, &dv_uid); @@ -688,6 +687,9 @@ amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_i &subvendor_id); if (status == AMDSMI_STATUS_SUCCESS) info->subvendor_id = subvendor_id; } + // For other sysfs related information, get from rocm-smi + status = rsmi_wrapper(rsmi_dev_pcie_vendor_name_get, processor_handle, + info->vendor_name, AMDSMI_MAX_STRING_LENGTH); return AMDSMI_STATUS_SUCCESS; } @@ -1277,7 +1279,7 @@ amdsmi_status_t amdsmi_get_gpu_bdf_id( } amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity( - amdsmi_processor_handle processor_handle, uint32_t *numa_node) { + amdsmi_processor_handle processor_handle, int32_t *numa_node) { return rsmi_wrapper(rsmi_topo_numa_affinity_get, processor_handle, numa_node); } diff --git a/tests/amd_smi_test/functional/sys_info_read.cc b/tests/amd_smi_test/functional/sys_info_read.cc index c6e2b066fc..05c327a3b0 100755 --- a/tests/amd_smi_test/functional/sys_info_read.cc +++ b/tests/amd_smi_test/functional/sys_info_read.cc @@ 
-139,7 +139,7 @@ void TestSysInfoRead::Run(void) { err = amdsmi_get_gpu_bdf_id(processor_handles_[i], nullptr); ASSERT_EQ(err, AMDSMI_STATUS_INVAL); - err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], &val_ui32); + err = amdsmi_get_gpu_topo_numa_affinity(processor_handles_[i], &val_i32); CHK_ERR_ASRT(err) IF_VERB(STANDARD) { std::cout << "\t**NUMA NODE: 0x" << std::hex << val_i32; @@ -163,11 +163,8 @@ void TestSysInfoRead::Run(void) { } else { if (err == AMDSMI_STATUS_SUCCESS) { IF_VERB(STANDARD) { - // TODO(bliu): read unique_id - /* - std::cout << "\t**GPU Unique ID : " << std::hex << asci_info.unique_id << - std::endl; - */ + std:: cout << "\t**GPU PCIe Vendor : " + << asci_info.vendor_name << std::endl; } // Verify api support checking functionality is working err = amdsmi_get_gpu_asic_info(processor_handles_[i], nullptr);