From 3ccce7ddbbad763c9e2f6613a7001621ccd698eb Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Mon, 5 Nov 2018 11:22:12 -0600 Subject: [PATCH] Add rsmi_dev_pci_id_get() to return BDFID for given device Also: * add some exception handling; * chop newline character off of device name returned from rsmi_dev_id_get() [ROCm/rocm_smi_lib commit: 59a952666f8afc61add11b65549d3402cf706667] --- projects/rocm-smi-lib/CMakeLists.txt | 1 + .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 27 +++- .../include/rocm_smi/rocm_smi_device.h | 4 + .../include/rocm_smi/rocm_smi_exception.h | 72 ++++++++++ .../include/rocm_smi/rocm_smi_main.h | 6 +- projects/rocm-smi-lib/src/rocm_smi.cc | 19 +++ projects/rocm-smi-lib/src/rocm_smi_main.cc | 125 ++++++++++++++++-- projects/rocm-smi-lib/src/rocm_smi_utils.cc | 4 + .../tests/rocm_smi_test/CMakeLists.txt | 4 +- .../rocm_smi_test/functional/rsmi_sanity.cc | 7 + .../tests/rocm_smi_test/run_build.sh | 2 +- 11 files changed, 255 insertions(+), 16 deletions(-) create mode 100755 projects/rocm-smi-lib/include/rocm_smi/rocm_smi_exception.h diff --git a/projects/rocm-smi-lib/CMakeLists.txt b/projects/rocm-smi-lib/CMakeLists.txt index 4568c1e052..a5b2916fbc 100755 --- a/projects/rocm-smi-lib/CMakeLists.txt +++ b/projects/rocm-smi-lib/CMakeLists.txt @@ -110,6 +110,7 @@ set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_monitor.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_utils.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h") +set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h") # rocm_smi_device.h diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 1c19e4a3ee..94f8921da3 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -84,6 +84,9 @@ typedef enum { RSMI_STATUS_INTERNAL_EXCEPTION, //!< An 
internal exception was caught RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of //!< allowable or safe range + RSMI_INITIALIZATION_ERROR, //!< An error occurred while + //!< initializing internal data + //!< structures RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; @@ -271,13 +274,35 @@ rsmi_status_t rsmi_shut_down(void); */ rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices); +/** + * @brief Get the unique PCI device identifier associated with a device + * + * @details Given a device index @p dev_ind and a pointer to a uint64_t @p + * bdfid, this function will write the Bus/Device/Function PCI identifier + * (BDFID) associated with device @p dev_ind to the value pointed to by + * @p bdfid. + * + * @param[in] dev_ind a device index + * + * @param[inout] bdfid a pointer to uint64_t to which the device bdfid value + * will be written + * + * @retval RSMI_STATUS_SUCCESS is returned upon successful call. + + */ +rsmi_status_t rsmi_dev_pci_id_get(uint32_t dev_ind, uint64_t *bdfid); + /** * @brief Get the device id associated with the device with provided device * index. * * @details Given a device index @p dv_ind and a pointer to a uint32_t @p id, * this function will write the device id value to the uint64_t pointed to by - * @p id + * @p id. This ID is an identification of the type of device, so calling this + * function for different devices will give the same value if they are the + * same kind of device. Consequently, this function should not be used to + * distinguish one device from another. rsmi_dev_pci_id_get() should be used + * to get a unique identifier.
* * @param[in] dv_ind a device index * diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h index 807d5f46fd..8a3c6448d4 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_device.h @@ -86,6 +86,9 @@ class Device { uint32_t index(void) const {return index_;} void set_index(uint32_t index) {index_ = index;} static rsmi_dev_perf_level perfLvlStrToEnum(std::string s); + uint64_t bdfid(void) const {return bdfid_;} + void set_bdfid(uint64_t val) {bdfid_ = val;} + uint64_t get_bdfid(void) const {return bdfid_;} private: std::shared_ptr monitor_; @@ -97,6 +100,7 @@ class Device { int readDevInfoMultiLineStr(DevInfoTypes type, std::vector *retVec); int writeDevInfoStr(DevInfoTypes type, std::string valStr); + uint64_t bdfid_; }; } // namespace smi diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_exception.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_exception.h new file mode 100755 index 0000000000..26851dd62e --- /dev/null +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_exception.h @@ -0,0 +1,72 @@ +/* + * ============================================================================= + * ROC Runtime Conformance Release License + * ============================================================================= + * The University of Illinois/NCSA + * Open Source License (NCSA) + * + * Copyright (c) 2018, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Developed by: + * + * AMD Research and AMD ROC Software Development + * + * Advanced Micro Devices, Inc. 
+ * + * www.amd.com + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal with the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * - Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimers. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimers in + * the documentation and/or other materials provided with the distribution. + * - Neither the names of , + * nor the names of its contributors may be used to endorse or promote + * products derived from this Software without specific prior written + * permission. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS WITH THE SOFTWARE. + * + */ + +#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_ +#define INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_ + +#include +#include + +namespace amd { +namespace smi { + +/// @brief Exception type which carries an error code to return to the user. 
+class rsmi_exception : public std::exception { + public: + rsmi_exception(rsmi_status_t error, const char* description) : + err_(error), desc_(description) {} + rsmi_status_t error_code() const noexcept { return err_; } + const char* what() const noexcept override { return desc_.c_str(); } + + private: + rsmi_status_t err_; + std::string desc_; +}; + +} // namespace smi +} // namespace amd + +#endif // INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_ + diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index 9aed22df11..7ddcf3dc22 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -74,9 +74,9 @@ class RocmSMI { uint32_t DiscoverAMDPowerMonitors(bool force_update = false); // Will execute "func" for every Device object known about, or until func - // returns true; - void IterateSMIDevices( - std::function&, void *)> func, void *); + // returns non-zero; + uint32_t IterateSMIDevices( + std::function&, void *)> func, void *); private: std::vector> devices_; diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index f772c13753..249a932ca8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -58,6 +58,7 @@ #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_utils.h" +#include "rocm_smi/rocm_smi_exception.h" static const uint32_t kMaxOverdriveLevel = 20; @@ -67,6 +68,10 @@ static rsmi_status_t handleException() { } catch (const std::bad_alloc& e) { debug_print("RSMI exception: BadAlloc\n"); return RSMI_STATUS_OUT_OF_RESOURCES; + } catch (const amd::smi::rsmi_exception& e) { + debug_print("Exception caught: %s.\n", e.what()); + return e.error_code(); + return RSMI_STATUS_INTERNAL_EXCEPTION; } catch (const std::exception& e) { debug_print("Unhandled exception: %s\n", e.what()); assert(false && 
"Unhandled exception."); @@ -336,6 +341,20 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { CATCH } +rsmi_status_t +rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { + TRY + + if (bdfid == nullptr) { + return RSMI_STATUS_INVALID_ARGS; + } + GET_DEV_FROM_INDX + + *bdfid = dev->get_bdfid(); + return RSMI_STATUS_SUCCESS; + CATCH +} + rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint64_t *id) { TRY diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 409e4687aa..96a71bda09 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -62,6 +62,7 @@ #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_main.h" +#include "rocm_smi/rocm_smi_exception.h" static const char *kPathDRMRoot = "/sys/class/drm"; static const char *kPathHWMonRoot = "/sys/class/hwmon"; @@ -118,10 +119,104 @@ static int SameDevice(const std::string fileA, const std::string fileB) { return SameFile(fileA + "/device", fileB + "/device"); } +// Determine if provided string is a bdfid pci path directory of the form +// XXXX:XX:XX.X, +// domain:bus:device.function +// +// where X is a hex integer (lower case is expected) +static bool is_bdfid_path_str(const std::string in_name, uint64_t *bdfid) { + char *p = nullptr; + char *name_start; + char name[13] = {'\0'}; + uint32_t tmp; + + assert(bdfid != nullptr); + + if (in_name.size() != 12) { + return false; + } + + tmp = in_name.copy(name, 12); + assert(tmp == 12); + + // BDFID = (( & 0x1f) << 8) | ((device& 0x1f) <<3 ) | (function & 0x7). 
+ *bdfid = 0; + name_start = name; + p = name_start; + + // Match this: XXXX:xx:xx.x + tmp = std::strtoul(p, &p, 16); + if (*p != ':' || p - name_start != 4) { + return false; + } + // We are ignoring the domain part for now as KFD is not encoding it yet + + // Match this: xxxx:XX:xx.x + p++; + tmp = std::strtoul(p, &p, 16); + if (*p != ':' || p - name_start != 7) { + return false; + } + *bdfid |= tmp << 8; + + // Match this: xxxx:xx:XX.x + p++; + tmp = std::strtoul(p, &p, 16); + if (*p != '.' || p - name_start != 10) { + return false; + } + *bdfid |= tmp << 3; + + // Match this: xxxx:xx:xx.X + p++; + tmp = std::strtoul(p, &p, 16); + if (*p != '\0' || p - name_start != 12) { + return false; + } + *bdfid |= tmp; + + return true; +} + +static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) { + assert(bdfid != nullptr); + char tpath[256]; + ssize_t ret; + + ret = readlink(path.c_str(), tpath, 256); + + assert(ret > 0); + assert(ret < 256); + + if (ret <= 0 || ret >= 256) { + return -1; + } + + // We are looking for the last element in the path that has the form + // XXXX:XX:XX.X, where X is a hex integer (lower case is expected) + std::size_t slash_i, end_i; + std::string tmp; + + std::string tpath_str(tpath); + + end_i = tpath_str.size() - 1; + while (end_i > 0) { + slash_i = tpath_str.find_last_of('/', end_i); + tmp = tpath_str.substr(slash_i + 1, end_i - slash_i); + + if (is_bdfid_path_str(tmp, bdfid)) { + return 0; + } + end_i = slash_i - 1; + } + + return 1; +} // Call-back function to append to a vector of Devices -static bool GetMonitorDevices(const std::shared_ptr &d, +static uint32_t GetMonitorDevices(const std::shared_ptr &d, void *p) { std::string val_str; + uint64_t bdfid; assert(p != nullptr); @@ -129,15 +224,21 @@ static bool GetMonitorDevices(const std::shared_ptr &d, reinterpret_cast> *>(p); if (d->monitor() != nullptr) { + // Calculate BDFID and set for this device + if (ConstructBDFID(d->path(), &bdfid) != 0) { + return -1; + } + 
d->set_bdfid(bdfid); device_list->push_back(d); } - return false; + return 0; } std::vector> RocmSMI::s_monitor_devices; RocmSMI::RocmSMI(void) { auto i = 0; + uint32_t ret; GetEnvVariables(); @@ -152,8 +253,13 @@ RocmSMI::RocmSMI(void) { // IterateSMIDevices will iterate through all the known devices and apply // the provided call-back to each device found. - IterateSMIDevices(GetMonitorDevices, + ret = IterateSMIDevices(GetMonitorDevices, reinterpret_cast(&s_monitor_devices)); + + if (ret != 0) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to initialize rocm_smi library."); + } } RocmSMI::~RocmSMI() { @@ -356,20 +462,23 @@ uint32_t RocmSMI::DiscoverAMDPowerMonitors(bool force_update) { return 0; } -void RocmSMI::IterateSMIDevices( - std::function&, void *)> func, void *p) { +uint32_t RocmSMI::IterateSMIDevices( + std::function&, void *)> func, void *p) { if (func == nullptr) { - return; + return -1; } auto d = devices_.begin(); + uint32_t ret; while (d != devices_.end()) { - if (func(*d, p)) { - return; + ret = func(*d, p); + if (ret != 0) { + return ret; } ++d; } + return 0; } } // namespace smi diff --git a/projects/rocm-smi-lib/src/rocm_smi_utils.cc b/projects/rocm-smi-lib/src/rocm_smi_utils.cc index 02d0e58c58..d44524325d 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_utils.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_utils.cc @@ -50,6 +50,7 @@ #include #include #include +#include namespace amd { namespace smi { @@ -88,6 +89,9 @@ int ReadSysfsStr(std::string path, std::string *retStr) { fs.close(); *retStr = ss.str(); + + retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'), + retStr->end()); return ret; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt index 112872e47d..30e88372d5 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/CMakeLists.txt @@ -71,7 +71,7 @@ if 
(${IS64BIT} EQUAL 0) else() if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi64.so) message("ERROR: Define RSMI_LIB_DIR pointing to RSMI library is not set") - message(" missing: ${RSMI_LIB_DIR}/librocm_smi.so") + message(" missing: ${RSMI_LIB_DIR}/librocm_smi64.so") return() endif() endif() @@ -204,8 +204,6 @@ include_directories(${RSMITST_ROOT}/gtest/include) # Build rules add_executable(${RSMITST} ${rsmitstSources} ${functionalSources}) -#target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt numa hwloc) - target_link_libraries(${RSMITST} ${RSMITST_LIBS} c stdc++ pthread) install(TARGETS ${RSMITST} diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/rsmi_sanity.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/rsmi_sanity.cc index 0ca4aadda2..3c167dd5e0 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/rsmi_sanity.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/rsmi_sanity.cc @@ -556,6 +556,13 @@ void TestSanity::Run(void) { std::cout << "\t**Monitor name: " << name << std::endl; } + err = rsmi_dev_pci_id_get(i, &val_ui64); + CHK_ERR_ASRT(err) + IF_VERB(STANDARD) { + std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64; + std::cout << " (" << std::dec << val_ui64 << ")" << std::endl; + } + auto print_temp_metric = [&](rsmi_temperature_metric met, std::string label) { err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64); diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/run_build.sh b/projects/rocm-smi-lib/tests/rocm_smi_test/run_build.sh index abce075479..c3e6b587eb 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/run_build.sh +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/run_build.sh @@ -3,7 +3,7 @@ # these are required: -ROCM_DIR=/home/cfreehil/git/compute/out/ubuntu-16.04/16.04 +ROCM_DIR=/home/cfreehil/github/rocm_smi_lib/build #ROCM_DIR=/opt/rocm mkdir -p build