Add rsmi_dev_pci_id_get() to return BDFID for given device

Also:
* add some exception handling;
* chop newline character off of device name returned from
rsmi_dev_id_get()


[ROCm/rocm_smi_lib commit: 59a952666f]
This commit is contained in:
Chris Freehill
2018-11-05 11:22:12 -06:00
parent 8a6704659d
commit 3ccce7ddbb
11 ha cambiato i file con 255 aggiunte e 16 eliminazioni
+1
Vedi File
@@ -110,6 +110,7 @@ set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_monitor.h")
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h")
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_utils.h")
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h")
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h")
# rocm_smi_device.h
@@ -84,6 +84,9 @@ typedef enum {
RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught
RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of
//!< allowable or safe range
RSMI_INITIALIZATION_ERROR, //!< An error occurred while rsmi was
//!< initializing internal data
//!< structures
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rsmi_status_t;
@@ -271,13 +274,35 @@ rsmi_status_t rsmi_shut_down(void);
*/
rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
/**
* @brief Get the unique PCI device identifier associated with a device
*
* @details Given a device index @p dev_ind and a pointer to a uint64_t @p
* bdfid, this function will write the Bus/Device/Function PCI identifier
* (BDFID) associated with device @p dev_ind to the value pointed to by
* @p bdfid.
*
* @param[in] dev_ind a device index
*
* @param[inout] bdfid a pointer to uint64_t to which the device bdfid value
* will be written
*
* @retval RSMI_STATUS_SUCCESS is returned upon successful call.
*
* @retval RSMI_STATUS_INVALID_ARGS is returned if @p bdfid is a null pointer.
*/
rsmi_status_t rsmi_dev_pci_id_get(uint32_t dev_ind, uint64_t *bdfid);
/**
* @brief Get the device id associated with the device with provided device
* index.
*
* @details Given a device index @p dv_ind and a pointer to a uint64_t @p id,
* this function will write the device id value to the uint64_t pointed to by
* @p id
* @p id. This ID is an identification of the type of device, so calling this
* function for different devices will give the same value if they are the
* same kind of device. Consequently, this function should not be used to
* distinguish one device from another. rsmi_dev_pci_id_get() should be used
* to get a unique identifier.
*
* @param[in] dv_ind a device index
*
@@ -86,6 +86,9 @@ class Device {
uint32_t index(void) const {return index_;}
void set_index(uint32_t index) {index_ = index;}
static rsmi_dev_perf_level perfLvlStrToEnum(std::string s);
// Bus/Device/Function PCI identifier currently stored for this device.
uint64_t bdfid() const { return bdfid_; }
// Store the Bus/Device/Function PCI identifier for this device.
void set_bdfid(uint64_t val) { bdfid_ = val; }
// Alternate spelling of bdfid(); yields the same stored value.
uint64_t get_bdfid() const { return bdfid(); }
private:
std::shared_ptr<Monitor> monitor_;
@@ -97,6 +100,7 @@ class Device {
int readDevInfoMultiLineStr(DevInfoTypes type,
std::vector<std::string> *retVec);
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
uint64_t bdfid_;
};
} // namespace smi
@@ -0,0 +1,72 @@
/*
* =============================================================================
* ROC Runtime Conformance Release License
* =============================================================================
* The University of Illinois/NCSA
* Open Source License (NCSA)
*
* Copyright (c) 2018, Advanced Micro Devices, Inc.
* All rights reserved.
*
* Developed by:
*
* AMD Research and AMD ROC Software Development
*
* Advanced Micro Devices, Inc.
*
* www.amd.com
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal with the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimers.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimers in
* the documentation and/or other materials provided with the distribution.
* - Neither the names of <Name of Development Group, Name of Institution>,
* nor the names of its contributors may be used to endorse or promote
* products derived from this Software without specific prior written
* permission.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS WITH THE SOFTWARE.
*
*/
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
#define INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
#include <exception>
#include <string>
namespace amd {
namespace smi {
/// @brief Exception type which carries an error code to return to the user.
///
/// Pairs the rsmi_status_t that should be reported to the caller with a
/// human-readable description of what went wrong.
class rsmi_exception : public std::exception {
 public:
  /// @param error        status code later reported by error_code()
  /// @param description  C string later reported by what(). A null pointer
  ///                     is stored as an empty description, because
  ///                     constructing a std::string from a null pointer is
  ///                     undefined behaviour.
  rsmi_exception(rsmi_status_t error, const char* description) :
      err_(error), desc_(description != nullptr ? description : "") {}

  /// @return the status code supplied at construction.
  rsmi_status_t error_code() const noexcept { return err_; }

  /// @return the description supplied at construction.
  const char* what() const noexcept override { return desc_.c_str(); }

 private:
  rsmi_status_t err_;
  std::string desc_;
};
} // namespace smi
} // namespace amd
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
@@ -74,9 +74,9 @@ class RocmSMI {
uint32_t DiscoverAMDPowerMonitors(bool force_update = false);
// Will execute "func" for every Device object known about, or until func
// returns true;
void IterateSMIDevices(
std::function<bool(std::shared_ptr<Device>&, void *)> func, void *);
// returns non-zero;
uint32_t IterateSMIDevices(
std::function<uint32_t(std::shared_ptr<Device>&, void *)> func, void *);
private:
std::vector<std::shared_ptr<Device>> devices_;
+19
Vedi File
@@ -58,6 +58,7 @@
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_device.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_exception.h"
static const uint32_t kMaxOverdriveLevel = 20;
@@ -67,6 +68,10 @@ static rsmi_status_t handleException() {
} catch (const std::bad_alloc& e) {
debug_print("RSMI exception: BadAlloc\n");
return RSMI_STATUS_OUT_OF_RESOURCES;
} catch (const amd::smi::rsmi_exception& e) {
debug_print("Exception caught: %s.\n", e.what());
return e.error_code();
return RSMI_STATUS_INTERNAL_EXCEPTION;
} catch (const std::exception& e) {
debug_print("Unhandled exception: %s\n", e.what());
assert(false && "Unhandled exception.");
@@ -336,6 +341,20 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
CATCH
}
// Write the BDFID stored for the device at index dv_ind to *bdfid.
//
// Returns RSMI_STATUS_INVALID_ARGS when bdfid is a null pointer and
// RSMI_STATUS_SUCCESS once the value has been written.
//
// NOTE(review): TRY, CATCH and GET_DEV_FROM_INDX are macros defined outside
// this view. Presumably TRY/CATCH wrap the body in the library's exception
// handling and GET_DEV_FROM_INDX declares `dev` for dv_ind (and may return
// early for a bad index) -- confirm against their definitions.
rsmi_status_t
rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
TRY
// Reject a null output pointer before anything is dereferenced.
if (bdfid == nullptr) {
return RSMI_STATUS_INVALID_ARGS;
}
GET_DEV_FROM_INDX
*bdfid = dev->get_bdfid();
return RSMI_STATUS_SUCCESS;
CATCH
}
rsmi_status_t
rsmi_dev_id_get(uint32_t dv_ind, uint64_t *id) {
TRY
+117 -8
Vedi File
@@ -62,6 +62,7 @@
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_main.h"
#include "rocm_smi/rocm_smi_exception.h"
static const char *kPathDRMRoot = "/sys/class/drm";
static const char *kPathHWMonRoot = "/sys/class/hwmon";
@@ -118,10 +119,104 @@ static int SameDevice(const std::string fileA, const std::string fileB) {
return SameFile(fileA + "/device", fileB + "/device");
}
// Return the value (0-15) of hex digit c, or -1 if c is not a hex digit.
static int bdfid_hex_digit_value(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
  return -1;
}

// Determine if the provided string is a bdfid pci path directory name of the
// form
//   XXXX:XX:XX.X,
//   domain:bus:device.function
//
// where every X is a hex digit (lower case is expected; upper case is also
// accepted).
//
// On a match, (bus << 8) | (device << 3) | function is written to *bdfid and
// true is returned. No masking is applied to the individual fields. The
// domain part is validated but ignored for now, as KFD is not encoding it
// yet. On a mismatch, false is returned and *bdfid is left untouched.
//
// Every character position is checked individually instead of relying on
// strtoul(): strtoul() also accepts leading white space, a sign and a "0x"
// prefix, so names such as "0x00:03:00.0" or "0000:+3:00.0" would otherwise
// be mistaken for a BDF.
static bool is_bdfid_path_str(const std::string &in_name, uint64_t *bdfid) {
  assert(bdfid != nullptr);

  // Layout of a well-formed name; 'X' marks a hex digit position, any other
  // character must appear literally.
  static const char kPattern[] = "XXXX:XX:XX.X";
  static const std::size_t kPatternLen = sizeof(kPattern) - 1;

  if (in_name.size() != kPatternLen) {
    return false;
  }

  // Parsed fields, in order: domain, bus, device, function.
  uint64_t field[4] = {0, 0, 0, 0};
  std::size_t cur_field = 0;

  for (std::size_t i = 0; i < kPatternLen; ++i) {
    const char c = in_name[i];

    if (kPattern[i] != 'X') {
      // Separator position: must match exactly, then move to the next field.
      if (c != kPattern[i]) {
        return false;
      }
      ++cur_field;
      continue;
    }

    const int digit = bdfid_hex_digit_value(c);
    if (digit < 0) {
      return false;
    }
    field[cur_field] = (field[cur_field] << 4) | static_cast<uint64_t>(digit);
  }

  // Only publish the result once the whole name has been validated.
  *bdfid = (field[1] << 8) | (field[2] << 3) | field[3];
  return true;
}
// Read the symbolic link at `path` and derive a BDFID from the link target.
//
// The target is scanned component by component, starting from the last one,
// and the first component accepted by is_bdfid_path_str() (i.e. the last
// element of the form XXXX:XX:XX.X) supplies the value written to *bdfid.
//
// Returns 0 on success, 1 if no component of the target has the BDF form,
// and static_cast<uint32_t>(-1) if the link could not be read or its target
// does not fit in the local buffer.
static uint32_t ConstructBDFID(const std::string &path, uint64_t *bdfid) {
  assert(bdfid != nullptr);

  char tpath[256];
  const ssize_t ret = readlink(path.c_str(), tpath, sizeof(tpath));

  // A failed or over-long readlink() is an environment problem rather than a
  // programming error, so it is reported through the return value (not an
  // assert) and the caller decides how to fail. A result that fills the
  // whole buffer may have been truncated, so it is rejected as well.
  if (ret <= 0 || ret >= static_cast<ssize_t>(sizeof(tpath))) {
    return static_cast<uint32_t>(-1);
  }

  // readlink() does not append a terminating null byte, so the string must
  // be built from the returned length rather than treated as a C string.
  const std::string tpath_str(tpath, static_cast<std::size_t>(ret));

  // `end` is one past the last character of the component being examined.
  // Walking with a half-open range avoids the unsigned wrap-around that
  // would otherwise restart the scan from the end of the string when the
  // first component is reached without a match.
  std::size_t end = tpath_str.size();
  while (end > 0) {
    const std::size_t slash_i = tpath_str.rfind('/', end - 1);
    const std::size_t begin =
                         (slash_i == std::string::npos) ? 0 : slash_i + 1;

    if (is_bdfid_path_str(tpath_str.substr(begin, end - begin), bdfid)) {
      return 0;
    }

    if (slash_i == std::string::npos) {
      break;  // that was the first component; nothing left to examine
    }
    end = slash_i;
  }

  return 1;
}
// Call-back function to append to a vector of Devices
static bool GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
static uint32_t GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
void *p) {
std::string val_str;
uint64_t bdfid;
assert(p != nullptr);
@@ -129,15 +224,21 @@ static bool GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
reinterpret_cast<std::vector<std::shared_ptr<amd::smi::Device>> *>(p);
if (d->monitor() != nullptr) {
// Calculate BDFID and set for this device
if (ConstructBDFID(d->path(), &bdfid) != 0) {
return -1;
}
d->set_bdfid(bdfid);
device_list->push_back(d);
}
return false;
return 0;
}
std::vector<std::shared_ptr<amd::smi::Device>> RocmSMI::s_monitor_devices;
RocmSMI::RocmSMI(void) {
auto i = 0;
uint32_t ret;
GetEnvVariables();
@@ -152,8 +253,13 @@ RocmSMI::RocmSMI(void) {
// IterateSMIDevices will iterate through all the known devices and apply
// the provided call-back to each device found.
IterateSMIDevices(GetMonitorDevices,
ret = IterateSMIDevices(GetMonitorDevices,
reinterpret_cast<void *>(&s_monitor_devices));
if (ret != 0) {
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
"Failed to initialize rocm_smi library.");
}
}
RocmSMI::~RocmSMI() {
@@ -356,20 +462,23 @@ uint32_t RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
return 0;
}
void RocmSMI::IterateSMIDevices(
std::function<bool(std::shared_ptr<Device>&, void *)> func, void *p) {
uint32_t RocmSMI::IterateSMIDevices(
std::function<uint32_t(std::shared_ptr<Device>&, void *)> func, void *p) {
if (func == nullptr) {
return;
return -1;
}
auto d = devices_.begin();
uint32_t ret;
while (d != devices_.end()) {
if (func(*d, p)) {
return;
ret = func(*d, p);
if (ret != 0) {
return ret;
}
++d;
}
return 0;
}
} // namespace smi
@@ -50,6 +50,7 @@
#include <cstdint>
#include <iostream>
#include <sstream>
#include <algorithm>
namespace amd {
namespace smi {
@@ -88,6 +89,9 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
fs.close();
*retStr = ss.str();
retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'),
retStr->end());
return ret;
}
@@ -71,7 +71,7 @@ if (${IS64BIT} EQUAL 0)
else()
if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi64.so)
message("ERROR: Define RSMI_LIB_DIR pointing to RSMI library is not set")
message(" missing: ${RSMI_LIB_DIR}/librocm_smi.so")
message(" missing: ${RSMI_LIB_DIR}/librocm_smi64.so")
return()
endif()
endif()
@@ -204,8 +204,6 @@ include_directories(${RSMITST_ROOT}/gtest/include)
# Build rules
add_executable(${RSMITST} ${rsmitstSources} ${functionalSources})
#target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt numa hwloc)
target_link_libraries(${RSMITST} ${RSMITST_LIBS} c stdc++ pthread)
install(TARGETS ${RSMITST}
@@ -556,6 +556,13 @@ void TestSanity::Run(void) {
std::cout << "\t**Monitor name: " << name << std::endl;
}
err = rsmi_dev_pci_id_get(i, &val_ui64);
CHK_ERR_ASRT(err)
IF_VERB(STANDARD) {
std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64;
std::cout << " (" << std::dec << val_ui64 << ")" << std::endl;
}
auto print_temp_metric = [&](rsmi_temperature_metric met,
std::string label) {
err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64);
@@ -3,7 +3,7 @@
# these are required:
ROCM_DIR=/home/cfreehil/git/compute/out/ubuntu-16.04/16.04
ROCM_DIR=/home/cfreehil/github/rocm_smi_lib/build
#ROCM_DIR=/opt/rocm
mkdir -p build