diff --git a/CMakeLists.txt b/CMakeLists.txt index a6265607d1..7410cd48fc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,7 +89,6 @@ set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_monitor.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_counters.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.c") set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h") @@ -99,7 +98,6 @@ set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_utils.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h") -set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_counters.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.h") set(SMI_EXAMPLE_EXE "rocm_smi_ex") diff --git a/docs/ROCm_SMI_Manual.pdf b/docs/ROCm_SMI_Manual.pdf index 5793b22be6..b6da2a742c 100644 Binary files a/docs/ROCm_SMI_Manual.pdf and b/docs/ROCm_SMI_Manual.pdf differ diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index b838f0c1a7..4311042bfd 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -109,11 +109,7 @@ typedef enum { RSMI_STATUS_NOT_FOUND, //!< An item was searched for but not //!< found RSMI_STATUS_INSUFFICIENT_SIZE, //!< Not enough resources were - //!< available for the operation - RSMI_STATUS_INTERRUPT, //!< An interrupt occurred during - //!< execution of function - RSMI_STATUS_UNEXPECTED_SIZE, //!< An unexpected amount of data - //!< was read + //!< available for the operation RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; @@ -175,74 +171,6 @@ typedef enum { RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER } rsmi_sw_component_t; -/**
- * Event counter types - */ - -/** - * @brief Handle to performance event counter - */ -typedef uintptr_t rsmi_event_handle_t; - -/** - * Event Groups - * - * @brief Enum denoting an event group. The value of the enum is the - * base value for all the event enums in the group. - */ -typedef enum { - RSMI_EVNT_GRP_XGMI = 0, //!< Data Fabric (XGMI) related events - - RSMI_EVNT_GRP_INVALID = 0xFFFFFFFF -} rsmi_event_group_t; - -/** - * Event types - * @brief Event type enum. Events belonging to a particular event group - * ::rsmi_event_group_t should begin ennumerating at the ::rsmi_event_group_t - * value for that group. - */ -typedef enum { - RSMI_EVNT_FIRST = RSMI_EVNT_GRP_XGMI, - - RSMI_EVNT_XGMI_FIRST = RSMI_EVNT_GRP_XGMI, - RSMI_EVNT_XGMI_0_NOP_TX = RSMI_EVNT_XGMI_FIRST, //!< NOPs sent to neighbor 0 - RSMI_EVNT_XGMI_0_REQUEST_TX, //!< Outgoing requests to - //!< neighbor 0 - RSMI_EVNT_XGMI_0_RESPONSE_TX, //!< Outgoing responses to - //!< neighbor 0 - RSMI_EVNT_XGMI_0_BEATS_TX, //!< Data beats sent to - //!< neighbor 0 - RSMI_EVNT_XGMI_1_NOP_TX, //!< NOPs sent to neighbor 1 - RSMI_EVNT_XGMI_1_REQUEST_TX, //!< Outgoing requests to - //!< neighbor 1 - RSMI_EVNT_XGMI_1_RESPONSE_TX, //!< Outgoing responses to - //!< neighbor 1 - RSMI_EVNT_XGMI_1_BEATS_TX, //!< Data beats sent to - //!< neighbor 1 - - RSMI_EVNT_XGMI_LAST = RSMI_EVNT_XGMI_1_BEATS_TX, - - RSMI_EVNT_LAST = RSMI_EVNT_XGMI_LAST -} rsmi_event_type_t; - -/** - * Event counter commands - */ -typedef enum { - RSMI_CNTR_CMD_START = 0, //!< Start the counter - RSMI_CNTR_CMD_STOP, //!< Stop the counter -} rsmi_counter_command_t; - -/** - * Counter value - */ -typedef struct { - uint64_t value; //!< Counter value - uint64_t time_enabled; //!< Time that the counter was enabled - uint64_t time_running; //!< Time that che counter was running -} rsmi_counter_value_t; - /** * Clock types */ @@ -932,8 +860,9 @@ rsmi_status_t rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask); * device index. 
* * @details Given a device index @p dv_ind and a pointer to a uint64_t - * @p power, this function will write the current average power consumption - * (in microwatts) to the uint64_t pointed to by @p power. + * @p power, this function will write the current average power consumption + * (in microwatts) to the uint64_t pointed to by @p power. This function + * requires root privilege. * * @param[in] dv_ind a device index * @@ -1132,13 +1061,16 @@ rsmi_status_t rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed); /** - * @brief Get the fan speed for the specified device as a value relative to - * ::RSMI_MAX_FAN_SPEED + * @brief Get the fan speed for the specified device as a value from 0 to 255. + * + * @details Given a device index @p dv_ind + * this function will get the fan speed. + * + * @param[in] dv_ind a device index * * @details Given a device index @p dv_ind and a pointer to a uint32_t * @p speed, this function will write the current fan speed (a value - * between 0 and the maximum fan speed, ::RSMI_MAX_FAN_SPEED) to the uint32_t - * pointed to by @p speed + * between 0 and 255) to the uint32_t pointed to by @p speed * * @param[in] dv_ind a device index * @@ -1680,134 +1612,6 @@ rsmi_status_string(rsmi_status_t status, const char **status_string); /** @} */ // end of ErrQuer -/*****************************************************************************/ -/** @defgroup PerfCntr Performance Counter Functions - * These functions are used to configure, query and control performance - * counting.
- * @{ - */ - -/** - * @brief Tell if an event group is supported by a given device - * - * @details Given a device index @p dv_ind and an event group specifier @p - * group, tell if @p group type events are supported by the device associated - * with @p dv_ind - * - * @param[in] dv_ind device index of device being queried - * - * @param[in] group ::rsmi_event_group_t identifier of group for which support - * is being queried - * - * @retval - * ::RSMI_STATUS_SUCCESS if the device associatee with @p dv_ind - * support counting events of the type indicated by @p group. - * - * ::RSMI_STATUS_NOT_SUPPORTED If the device does not support event group @p - * group - * - */ -rsmi_status_t -rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group); - -/** - * @brief Create a performance counter object - * - * @details Create a performance counter object of type @p type for the device - * with a device index of @p dv_ind, and write a handle to the object to the - * memory location pointed to by @p evnt_handle. @p evnt_handle can be used - * with other performance event operations. The handle should be deallocated - * with ::rsmi_dev_counter_destroy() when no longer needed. 
- * - * @param[in] dv_ind a device index - * - * @param[in] type the type of performance event to create - * - * @param[inout] evnt_handle A pointer to a ::rsmi_event_handle_t which will be - * associated with a newly allocated counter - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rsmi_status_t -rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, - rsmi_event_handle_t *evnt_handle); - -/** - * @brief Deallocate a performance counter object - * - * @details Deallocate the performance counter object with the provided - * ::rsmi_event_handle_t @p evnt_handle - * - * @param[in] evnt_handle handle to event object to be deallocated - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rsmi_status_t -rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle); - -/** - * @brief Issue performance counter control commands - * - * @details Issue a command @p cmd on the event counter associated with the - * provided handle @p evt_handle. - * - * @param[in] evt_handle an event handle - * - * @param[in] cmd The event counter command to be issued - * - * @param[inout] cmd_args Currently not used. Should be set to NULL. - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rsmi_status_t -rsmi_counter_control(rsmi_event_handle_t evt_handle, - rsmi_counter_command_t cmd, void *cmd_args); - -/** - * @brief Read the current value of a performance counter - * - * @details Read the current counter value of the counter associated with the - * provided handle @p evt_handle and write the value to the location pointed - * to by @p value. 
- * - * @param[in] evt_handle an event handle - * - * @param[inout] value pointer to memory of size of ::rsmi_counter_value_t to - * which the counter value will be written - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rsmi_status_t -rsmi_counter_read(rsmi_event_handle_t evt_handle, - rsmi_counter_value_t *value); - -/** - * @brief Get the number of currently available counters - * - * @details Given a device index @p dv_ind, a performance event group @p grp, - * and a pointer to a uint32_t @p available, this function will write the - * number of @p grp type counters that are available on the device with index - * @p dv_ind to the memory that @p available points to. - * - * @param[in] dv_ind a device index - * - * @param[in] grp an event device group - * - * @param[inout] available A pointer to a uint32_t to which the number of - * available counters will be written - * - * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call - * - */ -rsmi_status_t -rsmi_counter_available_counters_get(uint32_t dv_ind, - rsmi_event_group_t grp, uint32_t *available); -/** @} */ // end of PerfCntr - #ifdef __cplusplus } #endif // __cplusplus diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 6cbfc9cbf2..c7fcad7dc1 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -50,13 +50,11 @@ #include #include #include -#include #include "rocm_smi/rocm_smi_monitor.h" #include "rocm_smi/rocm_smi_power_mon.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" -#include "rocm_smi/rocm_smi_counters.h" extern "C" { #include "shared_mutex.h" // NOLINT }; @@ -93,7 +91,6 @@ enum DevInfoTypes { kDevMemUsedVisVRAM, kDevMemUsedVRAM, kDevPCIEReplayCount, - kDevDFCountersAvailable, }; class Device { @@ -120,8 +117,6 @@ class Device { void set_bdfid(uint64_t val) {bdfid_ = val;} uint64_t get_bdfid(void) const {return bdfid_;} pthread_mutex_t *mutex(void) {return 
mutex_.ptr;} - evt::dev_evt_grp_set_t* supported_event_groups(void) { - return &supported_event_groups_;} private: std::shared_ptr monitor_; @@ -138,8 +133,6 @@ class Device { std::vector *retVec); int writeDevInfoStr(DevInfoTypes type, std::string valStr); uint64_t bdfid_; - std::unordered_set supported_event_groups_; }; } // namespace smi diff --git a/include/rocm_smi/rocm_smi_exception.h b/include/rocm_smi/rocm_smi_exception.h index 470a37845d..26851dd62e 100755 --- a/include/rocm_smi/rocm_smi_exception.h +++ b/include/rocm_smi/rocm_smi_exception.h @@ -55,7 +55,7 @@ namespace smi { /// @brief Exception type which carries an error code to return to the user. class rsmi_exception : public std::exception { public: - rsmi_exception(rsmi_status_t error, const std::string description) : + rsmi_exception(rsmi_status_t error, const char* description) : err_(error), desc_(description) {} rsmi_status_t error_code() const noexcept { return err_; } const char* what() const noexcept override { return desc_.c_str(); } diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index da6c683547..7445194d61 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -61,7 +61,6 @@ #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_exception.h" -#include "rocm_smi/rocm_smi_counters.h" #include "rocm_smi/rocm_smi64Config.h" @@ -94,15 +93,11 @@ static rsmi_status_t handleException() { #define TRY try { #define CATCH } catch (...) 
{return handleException();} - -#define CHECK_DV_IND_RANGE \ +#define GET_DEV_FROM_INDX \ amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); \ if (dv_ind >= smi.monitor_devices().size()) { \ return RSMI_STATUS_INVALID_ARGS; \ } \ - -#define GET_DEV_FROM_INDX \ - CHECK_DV_IND_RANGE \ std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ assert(dev != nullptr); @@ -133,10 +128,7 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) { case EACCES: return RSMI_STATUS_PERMISSION; case EPERM: case ENOENT: return RSMI_STATUS_NOT_SUPPORTED; - case EBADF: case EISDIR: return RSMI_STATUS_FILE_ERROR; - case EINTR: return RSMI_STATUS_INTERRUPT; - case EIO: return RSMI_STATUS_UNEXPECTED_SIZE; default: return RSMI_STATUS_UNKNOWN_ERROR; } } @@ -1978,10 +1970,6 @@ rsmi_status_string(rsmi_status_t status, const char **status_string) { " successfully"; break; - case RSMI_STATUS_INTERRUPT: - *status_string = "An interrupt occurred while executing the function"; - break; - default: *status_string = "An unknown error occurred"; return RSMI_STATUS_UNKNOWN_ERROR; @@ -2119,152 +2107,3 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { CATCH } -rsmi_status_t -rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, - rsmi_event_handle_t *evnt_handle) { - TRY - DEVICE_MUTEX - REQUIRE_ROOT_ACCESS - CHECK_DV_IND_RANGE - - if (evnt_handle == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } - if (type < RSMI_EVNT_FIRST || type > RSMI_EVNT_LAST) { - return RSMI_STATUS_INVALID_ARGS; - } - - *evnt_handle = reinterpret_cast( - new amd::smi::evt::Event(type, dv_ind)); - - if (evnt_handle == nullptr) { - return RSMI_STATUS_OUT_OF_RESOURCES; - } - - return RSMI_STATUS_SUCCESS; - CATCH -} - -rsmi_status_t -rsmi_dev_counter_destroy(rsmi_event_handle_t evnt_handle) { - TRY - - if (evnt_handle == 0) { - return RSMI_STATUS_INVALID_ARGS; - } - - amd::smi::evt::Event *evt = - reinterpret_cast(evnt_handle); - uint32_t dv_ind = evt->dev_ind(); - DEVICE_MUTEX - 
REQUIRE_ROOT_ACCESS - - delete evt; - return RSMI_STATUS_SUCCESS; - CATCH -} - -rsmi_status_t -rsmi_counter_control(rsmi_event_handle_t evt_handle, - rsmi_counter_command_t cmd, void *cmd_args) { - TRY - - amd::smi::evt::Event *evt = - reinterpret_cast(evt_handle); - amd::smi::pthread_wrap _pw(*get_mutex(evt->dev_ind())); - amd::smi::ScopedPthread _lock(_pw); - - REQUIRE_ROOT_ACCESS - - uint32_t ret; - - // This is for future command args. This would work in conjunction with a - // new function to set perf attributes. - (void) cmd_args; - - if (evt_handle == 0) { - return RSMI_STATUS_INVALID_ARGS; - } - - switch (cmd) { - case RSMI_CNTR_CMD_START: - ret = evt->startCounter(); - break; - - case RSMI_CNTR_CMD_STOP: - ret = evt->stopCounter(); - break; - - default: - assert(!"Unexpected perf counter command"); - } - return errno_to_rsmi_status(ret); - - CATCH -} - -rsmi_status_t -rsmi_counter_read(rsmi_event_handle_t evt_handle, - rsmi_counter_value_t *value) { - TRY - - if (value == nullptr || evt_handle == 0) { - return RSMI_STATUS_INVALID_ARGS; - } - - amd::smi::evt::Event *evt = - reinterpret_cast(evt_handle); - - uint32_t dv_ind = evt->dev_ind(); - DEVICE_MUTEX - REQUIRE_ROOT_ACCESS - - uint32_t ret; - - ret = evt->getValue(value); - - return errno_to_rsmi_status(ret); - CATCH -} - -rsmi_status_t -rsmi_counter_available_counters_get(uint32_t dv_ind, - rsmi_event_group_t grp, uint32_t *available) { - rsmi_status_t ret; - - TRY - if (available == nullptr) { - return RSMI_STATUS_INVALID_ARGS; - } - DEVICE_MUTEX - uint64_t val; - - switch (grp) { - case RSMI_EVNT_GRP_XGMI: - ret = get_dev_value_int(amd::smi::kDevDFCountersAvailable, dv_ind, &val); - assert(val < UINT32_MAX); - *available = static_cast(val); - break; - - default: - return RSMI_STATUS_INVALID_ARGS; - } - return ret; - CATCH -} - -rsmi_status_t -rsmi_dev_counter_group_supported(uint32_t dv_ind, rsmi_event_group_t group) { - TRY - DEVICE_MUTEX - GET_DEV_FROM_INDX - - amd::smi::evt::dev_evt_grp_set_t *grp 
= dev->supported_event_groups(); - - if (grp->find(group) == grp->end()) { - return RSMI_STATUS_NOT_SUPPORTED; - } else { - return RSMI_STATUS_SUCCESS; - } - CATCH -} diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index db19021e16..fbbe6fdc97 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -97,7 +97,6 @@ static const char *kDevMemUsedGTTFName = "mem_info_gtt_used"; static const char *kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used"; static const char *kDevMemUsedVRAMFName = "mem_info_vram_used"; static const char *kDevPCIEReplayCountFName = "pcie_replay_count"; -static const char *kDevDFCountersAvailableFName = "df_cntr_avail"; // Strings that are found within sysfs files static const char *kDevPerfLevelAutoStr = "auto"; @@ -139,7 +138,6 @@ static const std::map kDevAttribNameMap = { {kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName}, {kDevMemUsedVRAM, kDevMemUsedVRAMFName}, {kDevPCIEReplayCount, kDevPCIEReplayCountFName}, - {kDevDFCountersAvailable, kDevDFCountersAvailableFName}, }; static const std::map kDevPerfLvlMap = { @@ -155,18 +153,10 @@ static const std::map kDevPerfLvlMap = { {RSMI_DEV_PERF_LEVEL_UNKNOWN, kDevPerfLevelUnknownStr}, }; -static int isRegularFile(std::string fname, bool *is_reg) { +static bool isRegularFile(std::string fname) { struct stat file_stat; - int ret; - - assert(is_reg != nullptr); - - ret = stat(fname.c_str(), &file_stat); - if (ret) { - return errno; - } - *is_reg = S_ISREG(file_stat.st_mode); - return 0; + stat(fname.c_str(), &file_stat); + return S_ISREG(file_stat.st_mode); } #define RET_IF_NONZERO(X) { \ @@ -213,14 +203,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { sysfs_path += kDevAttribNameMap.at(type); DBG_FILE_ERROR(sysfs_path, str); - bool reg_file; - - int ret = isRegularFile(sysfs_path, &reg_file); - - if (ret != 0) { - return ret; - } - if (!reg_file) { + if (!isRegularFile(sysfs_path)) { return ENOENT; } @@ -387,7 +370,6 @@ int
Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevMemUsedVisVRAM: case kDevMemUsedVRAM: case kDevPCIEReplayCount: - case kDevDFCountersAvailable: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); *val = std::stoul(tempStr, 0); diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index f07871c1ae..7185cfd2d8 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -58,7 +58,6 @@ #include #include "rocm_smi/rocm_smi.h" -#include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_exception.h" @@ -333,7 +332,6 @@ RocmSMI::AddToDeviceList(std::string dev_name) { uint32_t d_index = GetDeviceIndex(d_name); dev->set_index(d_index); - GetSupportedEventGroups(d_index, dev->supported_event_groups()); devices_.push_back(dev); return; diff --git a/tests/rocm_smi_test/main.cc b/tests/rocm_smi_test/main.cc index d98542059a..532a8291ec 100755 --- a/tests/rocm_smi_test/main.cc +++ b/tests/rocm_smi_test/main.cc @@ -72,7 +72,6 @@ #include "functional/err_cnt_read.h" #include "functional/mem_util_read.h" #include "functional/id_info_read.h" -#include "rocm_smi_test/functional/perf_cntr_read_write.h" static RSMITstGlobals *sRSMIGlvalues = nullptr; @@ -199,10 +198,7 @@ TEST(rsmitstReadOnly, TestIdInfoRead) { TestIdInfoRead tst; RunGenericTest(&tst); } -TEST(rsmitstreadWrite,TestPerfCntrReadWrite) { - TestPerfCntrReadWrite tst; - RunGenericTest(&tst); -} + int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv);