Add rsmi_dev_pci_id_get() to return BDFID for given device
Also:
* add some exception handling;
* chop newline character off of device name returned from
rsmi_dev_id_get()
[ROCm/rocm_smi_lib commit: 59a952666f]
This commit is contained in:
@@ -110,6 +110,7 @@ set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_monitor.h")
|
||||
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h")
|
||||
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_utils.h")
|
||||
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h")
|
||||
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h")
|
||||
|
||||
|
||||
# rocm_smi_device.h
|
||||
|
||||
@@ -84,6 +84,9 @@ typedef enum {
|
||||
RSMI_STATUS_INTERNAL_EXCEPTION, //!< An internal exception was caught
|
||||
RSMI_STATUS_INPUT_OUT_OF_BOUNDS, //!< The provided input is out of
|
||||
//!< allowable or safe range
|
||||
RSMI_INITIALIZATION_ERROR, //!< An error occurred when rsmi
|
||||
//!< initializing internal data
|
||||
//!< structures
|
||||
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
|
||||
} rsmi_status_t;
|
||||
|
||||
@@ -271,13 +274,35 @@ rsmi_status_t rsmi_shut_down(void);
|
||||
*/
|
||||
rsmi_status_t rsmi_num_monitor_devices(uint32_t *num_devices);
|
||||
|
||||
/**
|
||||
* @brief Get the unique PCI device identifier associated with a device
|
||||
*
|
||||
* @details Given a device index @p dev_ind and a pointer to a uint64_t @p
|
||||
* bdfid, this function will write the Bus/Device/Function PCI identifier
|
||||
* (BDFID) associated with device @p dev_ind to the value pointed to by
|
||||
* @p bdfid.
|
||||
*
|
||||
* @param[in] dev_ind a device index
|
||||
*
|
||||
* @param[inout] bdfid a pointer to uint64_t to which the device bdfid value
|
||||
* will be written
|
||||
*
|
||||
* @retval RSMI_STATUS_SUCCESS is returned upon successful call.
|
||||
|
||||
*/
|
||||
rsmi_status_t rsmi_dev_pci_id_get(uint32_t dev_ind, uint64_t *bdfid);
|
||||
|
||||
/**
|
||||
* @brief Get the device id associated with the device with provided device
|
||||
* index.
|
||||
*
|
||||
* @details Given a device index @p dv_ind and a pointer to a uint64_t @p id,
|
||||
* this function will write the device id value to the uint64_t pointed to by
|
||||
* @p id
|
||||
* @p id. This ID is an identification of the type of device, so calling this
|
||||
* function for different devices will give the same value if they are the same kind
|
||||
* of device. Consequently, this function should not be used to distinguish
|
||||
* one device from another. rsmi_dev_pci_id_get() should be used to get a
|
||||
* unique identifier.
|
||||
*
|
||||
* @param[in] dv_ind a device index
|
||||
*
|
||||
|
||||
@@ -86,6 +86,9 @@ class Device {
|
||||
uint32_t index(void) const {return index_;}
|
||||
void set_index(uint32_t index) {index_ = index;}
|
||||
static rsmi_dev_perf_level perfLvlStrToEnum(std::string s);
|
||||
uint64_t bdfid(void) const {return bdfid_;}
|
||||
void set_bdfid(uint64_t val) {bdfid_ = val;}
|
||||
uint64_t get_bdfid(void) const {return bdfid_;}
|
||||
|
||||
private:
|
||||
std::shared_ptr<Monitor> monitor_;
|
||||
@@ -97,6 +100,7 @@ class Device {
|
||||
int readDevInfoMultiLineStr(DevInfoTypes type,
|
||||
std::vector<std::string> *retVec);
|
||||
int writeDevInfoStr(DevInfoTypes type, std::string valStr);
|
||||
uint64_t bdfid_;
|
||||
};
|
||||
|
||||
} // namespace smi
|
||||
|
||||
@@ -0,0 +1,72 @@
|
||||
/*
|
||||
* =============================================================================
|
||||
* ROC Runtime Conformance Release License
|
||||
* =============================================================================
|
||||
* The University of Illinois/NCSA
|
||||
* Open Source License (NCSA)
|
||||
*
|
||||
* Copyright (c) 2018, Advanced Micro Devices, Inc.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Developed by:
|
||||
*
|
||||
* AMD Research and AMD ROC Software Development
|
||||
*
|
||||
* Advanced Micro Devices, Inc.
|
||||
*
|
||||
* www.amd.com
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to
|
||||
* deal with the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* - Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimers.
|
||||
* - Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimers in
|
||||
* the documentation and/or other materials provided with the distribution.
|
||||
* - Neither the names of <Name of Development Group, Name of Institution>,
|
||||
* nor the names of its contributors may be used to endorse or promote
|
||||
* products derived from this Software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
|
||||
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
||||
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS WITH THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
|
||||
#define INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
|
||||
|
||||
#include <exception>
|
||||
#include <string>
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
/// @brief Exception type which carries an error code to return to the user.
|
||||
class rsmi_exception : public std::exception {
|
||||
public:
|
||||
rsmi_exception(rsmi_status_t error, const char* description) :
|
||||
err_(error), desc_(description) {}
|
||||
rsmi_status_t error_code() const noexcept { return err_; }
|
||||
const char* what() const noexcept override { return desc_.c_str(); }
|
||||
|
||||
private:
|
||||
rsmi_status_t err_;
|
||||
std::string desc_;
|
||||
};
|
||||
|
||||
} // namespace smi
|
||||
} // namespace amd
|
||||
|
||||
#endif // INCLUDE_ROCM_SMI_ROCM_SMI_EXCEPTION_H_
|
||||
|
||||
@@ -74,9 +74,9 @@ class RocmSMI {
|
||||
uint32_t DiscoverAMDPowerMonitors(bool force_update = false);
|
||||
|
||||
// Will execute "func" for every Device object known about, or until func
|
||||
// returns true;
|
||||
void IterateSMIDevices(
|
||||
std::function<bool(std::shared_ptr<Device>&, void *)> func, void *);
|
||||
// returns non-zero;
|
||||
uint32_t IterateSMIDevices(
|
||||
std::function<uint32_t(std::shared_ptr<Device>&, void *)> func, void *);
|
||||
|
||||
private:
|
||||
std::vector<std::shared_ptr<Device>> devices_;
|
||||
|
||||
@@ -58,6 +58,7 @@
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_device.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
|
||||
static const uint32_t kMaxOverdriveLevel = 20;
|
||||
|
||||
@@ -67,6 +68,10 @@ static rsmi_status_t handleException() {
|
||||
} catch (const std::bad_alloc& e) {
|
||||
debug_print("RSMI exception: BadAlloc\n");
|
||||
return RSMI_STATUS_OUT_OF_RESOURCES;
|
||||
} catch (const amd::smi::rsmi_exception& e) {
|
||||
debug_print("Exception caught: %s.\n", e.what());
|
||||
return e.error_code();
|
||||
return RSMI_STATUS_INTERNAL_EXCEPTION;
|
||||
} catch (const std::exception& e) {
|
||||
debug_print("Unhandled exception: %s\n", e.what());
|
||||
assert(false && "Unhandled exception.");
|
||||
@@ -336,6 +341,20 @@ rsmi_num_monitor_devices(uint32_t *num_devices) {
|
||||
CATCH
|
||||
}
|
||||
|
||||
// Write the BDFID stored on the device selected by dv_ind to *bdfid.
// Returns RSMI_STATUS_INVALID_ARGS when bdfid is null and
// RSMI_STATUS_SUCCESS once the value has been written.
// NOTE(review): TRY/CATCH presumably wrap the body in a try/catch that is
// routed through handleException() -- confirm against the macro definitions.
rsmi_status_t
|
||||
rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) {
|
||||
TRY
|
||||
|
||||
// Reject a null output pointer before doing any device lookup.
if (bdfid == nullptr) {
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
// NOTE(review): presumably defines `dev` from dv_ind (and may return early
// on a bad index) -- verify against the GET_DEV_FROM_INDX definition.
GET_DEV_FROM_INDX
|
||||
|
||||
*bdfid = dev->get_bdfid();
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
CATCH
|
||||
}
|
||||
|
||||
rsmi_status_t
|
||||
rsmi_dev_id_get(uint32_t dv_ind, uint64_t *id) {
|
||||
TRY
|
||||
|
||||
@@ -62,6 +62,7 @@
|
||||
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
|
||||
static const char *kPathDRMRoot = "/sys/class/drm";
|
||||
static const char *kPathHWMonRoot = "/sys/class/hwmon";
|
||||
@@ -118,10 +119,104 @@ static int SameDevice(const std::string fileA, const std::string fileB) {
|
||||
return SameFile(fileA + "/device", fileB + "/device");
|
||||
}
|
||||
|
||||
// Determine if provided string is a bdfid pci path directory of the form
//         XXXX:XX:XX.X,
//         domain:bus:device.function
//
// where X is a hex digit (lower case is expected, upper case is accepted).
//
// Every character is matched strictly: exactly 12 characters, separators in
// the fixed positions shown above, and nothing but hex digits elsewhere.
// (strtoul-style leniency such as leading whitespace, a sign, or a "0x"
// prefix is deliberately rejected.)
//
// On success returns true and writes
//     BDFID = (bus << 8) | (device << 3) | function
// to *bdfid. The domain is validated but not encoded, as KFD is not encoding
// it yet. The fields are OR-ed together unmasked, so a device above 0x1f or a
// function above 0x7 (not valid PCI addresses) would overlap the neighbouring
// field.
//
// On failure returns false and leaves *bdfid untouched.
static bool is_bdfid_path_str(const std::string &in_name, uint64_t *bdfid) {
  assert(bdfid != nullptr);

  // Template for the expected name: 'x' marks a hex-digit position, any other
  // character must match literally and ends the current field.
  static const char kPattern[] = "xxxx:xx:xx.x";
  const std::size_t kPatternLen = sizeof(kPattern) - 1;

  if (in_name.size() != kPatternLen) {
    return false;
  }

  // field[0] = domain, field[1] = bus, field[2] = device, field[3] = function
  uint32_t field[4] = {0, 0, 0, 0};
  std::size_t field_i = 0;

  for (std::size_t i = 0; i < kPatternLen; ++i) {
    const char c = in_name[i];

    if (kPattern[i] != 'x') {
      if (c != kPattern[i]) {
        return false;
      }
      ++field_i;
      continue;
    }

    uint32_t digit;
    if (c >= '0' && c <= '9') {
      digit = static_cast<uint32_t>(c - '0');
    } else if (c >= 'a' && c <= 'f') {
      digit = static_cast<uint32_t>(c - 'a') + 10;
    } else if (c >= 'A' && c <= 'F') {
      digit = static_cast<uint32_t>(c - 'A') + 10;
    } else {
      return false;
    }
    field[field_i] = (field[field_i] << 4) | digit;
  }

  // Only publish the result once the whole name has been validated.
  *bdfid = (static_cast<uint64_t>(field[1]) << 8) |
           (static_cast<uint64_t>(field[2]) << 3) |
            static_cast<uint64_t>(field[3]);
  return true;
}
|
||||
|
||||
static uint32_t ConstructBDFID(std::string path, uint64_t *bdfid) {
|
||||
assert(bdfid != nullptr);
|
||||
char tpath[256];
|
||||
ssize_t ret;
|
||||
|
||||
ret = readlink(path.c_str(), tpath, 256);
|
||||
|
||||
assert(ret > 0);
|
||||
assert(ret < 256);
|
||||
|
||||
if (ret <= 0 || ret >= 256) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// We are looking for the last element in the path that has the form
|
||||
// XXXX:XX:XX.X, where X is a hex integer (lower case is expected)
|
||||
std::size_t slash_i, end_i;
|
||||
std::string tmp;
|
||||
|
||||
std::string tpath_str(tpath);
|
||||
|
||||
end_i = tpath_str.size() - 1;
|
||||
while (end_i > 0) {
|
||||
slash_i = tpath_str.find_last_of('/', end_i);
|
||||
tmp = tpath_str.substr(slash_i + 1, end_i - slash_i);
|
||||
|
||||
if (is_bdfid_path_str(tmp, bdfid)) {
|
||||
return 0;
|
||||
}
|
||||
end_i = slash_i - 1;
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
// Call-back function to append to a vector of Devices
|
||||
static bool GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
|
||||
static uint32_t GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
|
||||
void *p) {
|
||||
std::string val_str;
|
||||
uint64_t bdfid;
|
||||
|
||||
assert(p != nullptr);
|
||||
|
||||
@@ -129,15 +224,21 @@ static bool GetMonitorDevices(const std::shared_ptr<amd::smi::Device> &d,
|
||||
reinterpret_cast<std::vector<std::shared_ptr<amd::smi::Device>> *>(p);
|
||||
|
||||
if (d->monitor() != nullptr) {
|
||||
// Calculate BDFID and set for this device
|
||||
if (ConstructBDFID(d->path(), &bdfid) != 0) {
|
||||
return -1;
|
||||
}
|
||||
d->set_bdfid(bdfid);
|
||||
device_list->push_back(d);
|
||||
}
|
||||
return false;
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<std::shared_ptr<amd::smi::Device>> RocmSMI::s_monitor_devices;
|
||||
|
||||
RocmSMI::RocmSMI(void) {
|
||||
auto i = 0;
|
||||
uint32_t ret;
|
||||
|
||||
GetEnvVariables();
|
||||
|
||||
@@ -152,8 +253,13 @@ RocmSMI::RocmSMI(void) {
|
||||
|
||||
// IterateSMIDevices will iterate through all the known devices and apply
|
||||
// the provided call-back to each device found.
|
||||
IterateSMIDevices(GetMonitorDevices,
|
||||
ret = IterateSMIDevices(GetMonitorDevices,
|
||||
reinterpret_cast<void *>(&s_monitor_devices));
|
||||
|
||||
if (ret != 0) {
|
||||
throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR,
|
||||
"Failed to initialize rocm_smi library.");
|
||||
}
|
||||
}
|
||||
|
||||
RocmSMI::~RocmSMI() {
|
||||
@@ -356,20 +462,23 @@ uint32_t RocmSMI::DiscoverAMDPowerMonitors(bool force_update) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
void RocmSMI::IterateSMIDevices(
|
||||
std::function<bool(std::shared_ptr<Device>&, void *)> func, void *p) {
|
||||
uint32_t RocmSMI::IterateSMIDevices(
|
||||
std::function<uint32_t(std::shared_ptr<Device>&, void *)> func, void *p) {
|
||||
if (func == nullptr) {
|
||||
return;
|
||||
return -1;
|
||||
}
|
||||
|
||||
auto d = devices_.begin();
|
||||
uint32_t ret;
|
||||
|
||||
while (d != devices_.end()) {
|
||||
if (func(*d, p)) {
|
||||
return;
|
||||
ret = func(*d, p);
|
||||
if (ret != 0) {
|
||||
return ret;
|
||||
}
|
||||
++d;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace smi
|
||||
|
||||
@@ -50,6 +50,7 @@
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <algorithm>
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
@@ -88,6 +89,9 @@ int ReadSysfsStr(std::string path, std::string *retStr) {
|
||||
fs.close();
|
||||
|
||||
*retStr = ss.str();
|
||||
|
||||
retStr->erase(std::remove(retStr->begin(), retStr->end(), '\n'),
|
||||
retStr->end());
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ if (${IS64BIT} EQUAL 0)
|
||||
else()
|
||||
if(NOT EXISTS ${RSMI_LIB_DIR}/librocm_smi64.so)
|
||||
message("ERROR: Define RSMI_LIB_DIR pointing to RSMI library is not set")
|
||||
message(" missing: ${RSMI_LIB_DIR}/librocm_smi.so")
|
||||
message(" missing: ${RSMI_LIB_DIR}/librocm_smi64.so")
|
||||
return()
|
||||
endif()
|
||||
endif()
|
||||
@@ -204,8 +204,6 @@ include_directories(${RSMITST_ROOT}/gtest/include)
|
||||
# Build rules
|
||||
add_executable(${RSMITST} ${rsmitstSources} ${functionalSources})
|
||||
|
||||
#target_link_libraries(${ROCRTST} ${ROCRTST_LIBS} c stdc++ dl pthread rt numa hwloc)
|
||||
|
||||
target_link_libraries(${RSMITST} ${RSMITST_LIBS} c stdc++ pthread)
|
||||
|
||||
install(TARGETS ${RSMITST}
|
||||
|
||||
@@ -556,6 +556,13 @@ void TestSanity::Run(void) {
|
||||
std::cout << "\t**Monitor name: " << name << std::endl;
|
||||
}
|
||||
|
||||
err = rsmi_dev_pci_id_get(i, &val_ui64);
|
||||
CHK_ERR_ASRT(err)
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "\t**PCI ID (BDFID): 0x" << std::hex << val_ui64;
|
||||
std::cout << " (" << std::dec << val_ui64 << ")" << std::endl;
|
||||
}
|
||||
|
||||
auto print_temp_metric = [&](rsmi_temperature_metric met,
|
||||
std::string label) {
|
||||
err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64);
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
|
||||
# these are required:
|
||||
ROCM_DIR=/home/cfreehil/git/compute/out/ubuntu-16.04/16.04
|
||||
ROCM_DIR=/home/cfreehil/github/rocm_smi_lib/build
|
||||
#ROCM_DIR=/opt/rocm
|
||||
|
||||
mkdir -p build
|
||||
|
||||
Fai riferimento in un nuovo problema
Block a user