diff --git a/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf b/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf index eddd3b9eba..4fd77c2150 100644 Binary files a/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf and b/projects/rocm-smi-lib/docs/ROCm_SMI_Manual.pdf differ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 7319d13df7..efc4967742 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -47,7 +47,7 @@ extern "C" { #include #else -#include +#include #endif // __cplusplus #include @@ -113,6 +113,20 @@ typedef enum { RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; +/** + * @brief Initialization flags + * + * Initialization flags may be OR'd together and passed to ::rsmi_init(). + */ +typedef enum { + RSMI_INIT_FLAG_ALL_GPUS = 0x1, //!< Attempt to add all GPUs found + //!< (including non-AMD) to the list + //!< of devices from which SMI + //!< information can be retrieved. By + //!< default, only AMD devices are + //!< enumerated by RSMI. +} rsmi_init_flags_t; + /** * @brief PowerPlay performance levels */ @@ -156,6 +170,9 @@ typedef enum { RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER } rsmi_sw_component_t; +/** + * Clock types + */ typedef enum { RSMI_CLK_TYPE_SYS = 0x0, //!< System clock RSMI_CLK_TYPE_FIRST = RSMI_CLK_TYPE_SYS, @@ -493,8 +510,9 @@ typedef struct { * @details When called, this initializes internal data structures, * including those corresponding to sources of information that SMI provides. * - * @param[in] init_flags Bit flags that tell SMI how to initialze. Not - * currently used.
+ * @param[in] init_flags Bit flags that tell SMI how to initialize. Values of + * ::rsmi_init_flags_t may be OR'd together and passed through @p init_flags + * to modify how RSMI initializes. * * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. */ diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index 7ddcf3dc22..46a626cc3f 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -62,12 +62,13 @@ namespace smi { class RocmSMI { public: - RocmSMI(void); // direct use of this constructor is deprecated; use - // getInstance() - + RocmSMI(uint64_t flags); ~RocmSMI(void); - static RocmSMI& getInstance(void); + static RocmSMI& getInstance(uint64_t flags = 0); + void Initialize(uint64_t flags); + void Cleanup(void); + static std::vector>& monitor_devices() {return s_monitor_devices;} uint32_t DiscoverDevices(void); @@ -78,6 +79,9 @@ class RocmSMI { uint32_t IterateSMIDevices( std::function&, void *)> func, void *); + void set_init_options(uint64_t options) {init_options_ = options;} + uint64_t init_options() const {return init_options_;} + private: std::vector> devices_; std::vector> monitors_; @@ -90,6 +94,7 @@ class RocmSMI { static std::vector> s_monitor_devices; RocmSMI_env_vars env_vars_; + uint64_t init_options_; }; } // namespace smi diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 3200197cfa..52de37df0b 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -94,7 +94,7 @@ static rsmi_status_t handleException() { #define TRY try { #define CATCH } catch (...)
{return handleException();} #define GET_DEV_FROM_INDX \ - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); \ + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ if (dv_ind >= smi.monitor_devices().size()) { \ return RSMI_STATUS_INVALID_ARGS; \ } \ @@ -106,7 +106,8 @@ static rsmi_status_t handleException() { amd::smi::ScopedPthread _lock(_pw); static pthread_mutex_t *get_mutex(uint32_t dv_ind) { - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + if (dv_ind >= smi.monitor_devices().size()) { return nullptr; } @@ -382,7 +383,7 @@ static rsmi_status_t set_dev_mon_value(amd::smi::MonitorTypes type, static rsmi_status_t get_power_mon_value(amd::smi::PowerMonTypes type, uint32_t dv_ind, uint64_t *val) { - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); if (dv_ind >= smi.monitor_devices().size() || val == nullptr) { return RSMI_STATUS_INVALID_ARGS; @@ -416,11 +417,12 @@ static bool is_power_of_2(uint64_t n) { } rsmi_status_t -rsmi_init(uint64_t init_flags) { +rsmi_init(uint64_t flags) { TRY - (void)init_flags; // unused for now; for future use - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + smi.Initialize(flags); + return RSMI_STATUS_SUCCESS; CATCH } @@ -430,6 +432,11 @@ rsmi_init(uint64_t init_flags) { rsmi_status_t rsmi_shut_down(void) { TRY + + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + + smi.Cleanup(); + return RSMI_STATUS_SUCCESS; CATCH } @@ -441,7 +448,7 @@ rsmi_num_monitor_devices(uint32_t *num_devices) { return RSMI_STATUS_INVALID_ARGS; } - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); *num_devices = smi.monitor_devices().size(); return RSMI_STATUS_SUCCESS; @@ -1086,7 +1093,7 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, 
assert(freqs.num_supported <= RSMI_MAX_NUM_FREQUENCIES); - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); // Above call to rsmi_dev_get_gpu_clk_freq should have emitted an error if // assert below is not true @@ -1366,7 +1373,7 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { assert(bws.transfer_rate.num_supported <= RSMI_MAX_NUM_FREQUENCIES); - amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); // Above call to rsmi_dev_pci_bandwidth_get() should have emitted an error // if assert below is not true diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index b90c698133..2a2518a0a4 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -234,10 +234,13 @@ static uint32_t GetMonitorDevices(const std::shared_ptr &d, std::vector> RocmSMI::s_monitor_devices; -RocmSMI::RocmSMI(void) { +void +RocmSMI::Initialize(uint64_t flags) { auto i = 0; uint32_t ret; + init_options_ = flags; + GetEnvVariables(); while (std::string(kAMDMonitorTypes[i]) != "") { @@ -260,15 +263,23 @@ RocmSMI::RocmSMI(void) { } } -RocmSMI::~RocmSMI() { +void +RocmSMI::Cleanup() { + s_monitor_devices.clear(); devices_.clear(); monitors_.clear(); } -RocmSMI& RocmSMI::getInstance(void) { +RocmSMI::RocmSMI(uint64_t flags) : init_options_(flags) { +} + +RocmSMI::~RocmSMI() { +} + +RocmSMI& RocmSMI::getInstance(uint64_t flags) { // Assume c++11 or greater. static objects will be created by only 1 thread // and creation will be thread-safe. 
- static RocmSMI singleton; + static RocmSMI singleton(flags); return singleton; } @@ -324,6 +335,35 @@ RocmSMI::AddToDeviceList(std::string dev_name) { return; } +static const uint32_t kAmdGpuId=0x1002; + +// Returns true only if the vendor file under dev_path exists, can be opened +// and holds the vendor id kAmdGpuId; every failure path returns false. +static bool isAMDGPU(std::string dev_path) { + + std::string vend_path = dev_path + "/device/vendor"; + if (!FileExists(vend_path.c_str())) { + return false; + } + + std::ifstream fs; + fs.open(vend_path); + + if (!fs.is_open()) { + return false; + } + + uint32_t vendor_id; + + fs >> std::hex >> vendor_id; + + fs.close(); + + if (vendor_id == kAmdGpuId) { + return true; + } + return false; +} uint32_t RocmSMI::DiscoverDevices(void) { auto ret = 0; @@ -346,7 +386,14 @@ uint32_t RocmSMI::DiscoverDevices(void) { while (dentry != nullptr) { if (memcmp(dentry->d_name, kDeviceNamePrefix, strlen(kDeviceNamePrefix)) == 0) { - AddToDeviceList(dentry->d_name); + std::string vend_str_path = kPathDRMRoot; + vend_str_path += "/"; + vend_str_path += dentry->d_name; + + if (isAMDGPU(vend_str_path) || + (init_options_ & RSMI_INIT_FLAG_ALL_GPUS)) { + AddToDeviceList(dentry->d_name); + } } dentry = readdir(drm_dir); } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/pci_read_write.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/pci_read_write.cc index 07fb96890a..988b5e32d5 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/pci_read_write.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/pci_read_write.cc @@ -138,7 +138,7 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "\tInitial PCIe is " << bw.transfer_rate.current << + std::cout << "\tInitial PCIe BW index is " << bw.transfer_rate.current << std::endl; } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc index febfd67f2c..532a8291ec 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/main.cc @@ -46,6 +46,7 @@
#include #include #include +#include #include "rocm_smi/rocm_smi.h" #include "gtest/gtest.h" @@ -79,6 +80,7 @@ static void SetFlags(TestBase *test) { test->set_verbosity(sRSMIGlvalues->verbosity); test->set_dont_fail(sRSMIGlvalues->dont_fail); + test->set_init_options(sRSMIGlvalues->init_options); } @@ -207,10 +209,30 @@ int main(int argc, char** argv) { settings.monitor_verbosity = 1; settings.num_iterations = 1; settings.dont_fail = false; + settings.init_options = 0; if (ProcessCmdline(&settings, argc, argv)) { return 1; } + + int ret = 0; sRSMIGlvalues = &settings; + ret = RUN_ALL_TESTS(); + + if (ret) { + return ret; + } + + settings.init_options = RSMI_INIT_FLAG_ALL_GPUS; + + std::cout << "****************************************" << std::endl; + std::cout << "****************************************" << std::endl; + std::cout << "****************************************" << std::endl; + std::cout << "Re-running tests with init options: " << std::hex << + settings.init_options << std::endl; + std::cout << "****************************************" << std::endl; + std::cout << "****************************************" << std::endl; + std::cout << "****************************************" << std::endl; + settings.init_options = RSMI_INIT_FLAG_ALL_GPUS; return RUN_ALL_TESTS(); } diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc index f5a3ae3b25..a9f428996f 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.cc @@ -84,7 +84,7 @@ void TestBase::SetUp(void) { MakeHeaderStr(kSetupLabel, &label); printf("\n\t%s\n", label.c_str()); - err = rsmi_init(0); + err = rsmi_init(init_options()); ASSERT_EQ(err, RSMI_STATUS_SUCCESS); err = rsmi_num_monitor_devices(&num_monitor_devs_); diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.h b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.h index 
22c948c7be..973262f409 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.h +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_base.h @@ -99,6 +99,12 @@ class TestBase { uint32_t num_monitor_devs(void) const { return num_monitor_devs_; } + void set_init_options(uint64_t x) { + init_options_ = x; + } + uint64_t init_options(void) const { + return init_options_; + } protected: void PrintDeviceHeader(uint32_t dv_ind); @@ -109,6 +115,7 @@ class TestBase { std::string title_; ///< Displayed title of test uint32_t verbosity_; ///< How much additional output to produce bool dont_fail_; ///< Don't quit test on individual failure if true + uint64_t init_options_; ///< rsmi initialization options }; #define IF_VERB(VB) if (verbosity() && verbosity() >= (TestBase::VERBOSE_##VB)) diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.h b/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.h index 8033a4fb23..545334f8fd 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.h +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/test_common.h @@ -55,6 +55,7 @@ struct RSMITstGlobals { uint32_t verbosity; uint32_t monitor_verbosity; uint32_t num_iterations; + uint64_t init_options; bool dont_fail; };