From dc75c65729efa55f3105a98201fcba504dfcf602 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Sat, 4 May 2019 15:10:58 -0500 Subject: [PATCH 1/2] Add mutex for multi-process access to device sysfs files This commit uses a pthread mutex in shared memory to prevent almost all cases of multiple processes simultaneously reading/writing to device sysfs files. The main existing race condition is when 2 processes are starting at the same time, setting up their shared memory and mutexes. Since this is meant to prevent collisions among thread and processes, the small shared memory segments (big enough for a pthread_mutex) will persist until reboot. [ROCm/amdsmi commit: 5e24a77193cb3858492f5fc8ae615821e01b016b] --- projects/amdsmi/CMakeLists.txt | 6 +- .../amdsmi/include/rocm_smi/rocm_smi_device.h | 9 +- .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 24 ++++ projects/amdsmi/src/rocm_smi.cc | 91 +++++++++++- projects/amdsmi/src/rocm_smi_device.cc | 26 ++++ projects/amdsmi/src/shared_mutex/LICENSE | 21 +++ .../amdsmi/src/shared_mutex/shared_mutex.c | 131 ++++++++++++++++++ .../amdsmi/src/shared_mutex/shared_mutex.h | 67 +++++++++ 8 files changed, 372 insertions(+), 3 deletions(-) create mode 100644 projects/amdsmi/src/shared_mutex/LICENSE create mode 100755 projects/amdsmi/src/shared_mutex/shared_mutex.c create mode 100755 projects/amdsmi/src/shared_mutex/shared_mutex.h diff --git a/projects/amdsmi/CMakeLists.txt b/projects/amdsmi/CMakeLists.txt index 01b00d62bc..53c8d5977d 100755 --- a/projects/amdsmi/CMakeLists.txt +++ b/projects/amdsmi/CMakeLists.txt @@ -106,13 +106,15 @@ endif () set(SRC_DIR "src") set(INC_DIR "include/rocm_smi") -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include + ${CMAKE_CURRENT_SOURCE_DIR}/src/shared_mutex) set(SMI_SRC_LIST "${SRC_DIR}/rocm_smi_device.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_main.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_monitor.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc") +set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.c") set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_main.h") @@ -121,12 +123,14 @@ set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_power_mon.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_utils.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_common.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_exception.h") +set(SMI_INC_LIST ${SMI_INC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.h") set(SMI_EXAMPLE_EXE "rocm_smi_ex") add_executable(${SMI_EXAMPLE_EXE} "example/rocm_smi_example.cc") target_link_libraries(${SMI_EXAMPLE_EXE} ${ROCM_SMI_TARGET}) add_library(${ROCM_SMI_TARGET} SHARED ${SMI_SRC_LIST} ${SMI_INC_LIST}) +target_link_libraries(${ROCM_SMI_TARGET} pthread rt) ## Set the VERSION and SOVERSION values set_property(TARGET ${ROCM_SMI_TARGET} diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 94557204c0..7197814209 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -42,6 +42,9 @@ */ #ifndef INCLUDE_ROCM_SMI_ROCM_SMI_DEVICE_H_ #define INCLUDE_ROCM_SMI_ROCM_SMI_DEVICE_H_ + +#include + #include #include #include @@ -52,6 +55,9 @@ #include "rocm_smi/rocm_smi_power_mon.h" #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" +extern "C" { +#include "shared_mutex.h" +}; namespace amd { namespace smi { @@ -109,11 +115,12 @@ class Device { uint64_t bdfid(void) const {return bdfid_;} void set_bdfid(uint64_t val) {bdfid_ = val;} uint64_t get_bdfid(void) const {return bdfid_;} - + pthread_mutex_t *mutex(void) {return mutex_.ptr;} private: std::shared_ptr monitor_; std::shared_ptr power_monitor_; std::string path_; + shared_mutex_t mutex_; uint32_t index_; const RocmSMI_env_vars *env_; template int openSysfsFileStream(DevInfoTypes type, T *fs, diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index 15bae95e14..8949eaac7a 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -43,6 +43,8 @@ #ifndef INCLUDE_ROCM_SMI_ROCM_SMI_UTILS_H_ #define INCLUDE_ROCM_SMI_ROCM_SMI_UTILS_H_ +#include + #include #include @@ -63,6 +65,28 @@ namespace smi { int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); +struct pthread_wrap { + public: + pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} + + void Acquire() { pthread_mutex_lock(&mutex_); } + void Release() { pthread_mutex_unlock(&mutex_); } + private: + pthread_mutex_t& mutex_; +}; +struct ScopedPthread { + ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) { + pthrd_ref_.Acquire(); + }; + + ~ScopedPthread() { + pthrd_ref_.Release(); + } + private: + ScopedPthread(const ScopedPthread&); + + pthread_wrap& pthrd_ref_; +}; } // namespace smi } // namespace amd diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 2da8a54ba4..3a8cec8e31 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -100,6 +101,21 @@ static rsmi_status_t handleException() { std::shared_ptr dev = smi.monitor_devices()[dv_ind]; \ assert(dev != nullptr); +#define DEVICE_MUTEX \ + amd::smi::pthread_wrap _pw(*get_mutex(dv_ind)); \ + amd::smi::ScopedPthread _lock(_pw); + +static pthread_mutex_t *get_mutex(uint32_t dv_ind) { + amd::smi::RocmSMI smi = amd::smi::RocmSMI::getInstance(); + if (dv_ind >= smi.monitor_devices().size()) { + return nullptr; + } + std::shared_ptr dev = smi.monitor_devices()[dv_ind]; + assert(dev != nullptr); + + return dev->mutex(); +} + static rsmi_status_t errno_to_rsmi_status(uint32_t err) { switch (err) { case 0: return RSMI_STATUS_SUCCESS; @@ -440,6 +456,9 @@ rsmi_status_t rsmi_dev_ecc_enabled_get(uint32_t dv_ind, if (enabled_mask == nullptr) { return RSMI_STATUS_INVALID_ARGS; } + + DEVICE_MUTEX + std::vector val_vec; ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); @@ -508,6 +527,8 @@ rsmi_status_t rsmi_dev_ecc_status_get(uint32_t dv_ind, rsmi_gpu_block_t block, rsmi_status_t ret; std::vector val_vec; + DEVICE_MUTEX + ret = get_dev_value_vec(amd::smi::kDevErrCntFeatures, dv_ind, &val_vec); if (ret == RSMI_STATUS_FILE_ERROR) { @@ -569,6 +590,9 @@ rsmi_dev_ecc_count_get(uint32_t dv_ind, rsmi_gpu_block_t block, default: return RSMI_STATUS_NOT_SUPPORTED; } + + DEVICE_MUTEX + ret = get_dev_value_vec(type, dv_ind, &val_vec); if (ret == RSMI_STATUS_FILE_ERROR) { @@ -605,6 +629,8 @@ rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid) { } GET_DEV_FROM_INDX + DEVICE_MUTEX + *bdfid = dev->get_bdfid(); return RSMI_STATUS_SUCCESS; CATCH @@ -614,6 +640,9 @@ static rsmi_status_t get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) { TRY std::string val_str; + + DEVICE_MUTEX + rsmi_status_t ret = get_dev_value_str(typ, dv_ind, &val_str); if (ret != RSMI_STATUS_SUCCESS) { @@ -630,21 +659,25 @@ get_id(uint32_t dv_ind, amd::smi::DevInfoTypes typ, uint16_t *id) { rsmi_status_t rsmi_dev_id_get(uint32_t dv_ind, uint16_t *id) { + DEVICE_MUTEX return get_id(dv_ind, amd::smi::kDevDevID, id); } rsmi_status_t rsmi_dev_subsystem_id_get(uint32_t dv_ind, uint16_t *id) { + DEVICE_MUTEX return get_id(dv_ind, amd::smi::kDevSubSysDevID, id); } rsmi_status_t rsmi_dev_vendor_id_get(uint32_t dv_ind, uint16_t *id) { + DEVICE_MUTEX return get_id(dv_ind, amd::smi::kDevVendorID, id); } rsmi_status_t rsmi_dev_subsystem_vendor_id_get(uint32_t dv_ind, uint16_t *id) { + DEVICE_MUTEX return get_id(dv_ind, amd::smi::kDevSubSysVendorID, id); } @@ -652,6 +685,8 @@ rsmi_status_t rsmi_dev_perf_level_get(uint32_t dv_ind, rsmi_dev_perf_level_t *perf) { TRY std::string val_str; + DEVICE_MUTEX + rsmi_status_t ret = get_dev_value_str(amd::smi::kDevPerfLevel, dv_ind, &val_str); if (ret != RSMI_STATUS_SUCCESS) { @@ -668,6 +703,8 @@ rsmi_status_t rsmi_dev_overdrive_level_get(uint32_t dv_ind, uint32_t *od) { TRY std::string val_str; + DEVICE_MUTEX + rsmi_status_t ret = get_dev_value_str(amd::smi::kDevOverDriveLevel, dv_ind, &val_str); if (ret != RSMI_STATUS_SUCCESS) { @@ -688,7 +725,7 @@ rsmi_dev_overdrive_level_set(int32_t dv_ind, uint32_t od) { if (od > kMaxOverdriveLevel) { return RSMI_STATUS_INVALID_ARGS; } - + DEVICE_MUTEX return set_dev_value(amd::smi::kDevOverDriveLevel, dv_ind, od); CATCH } @@ -700,6 +737,7 @@ rsmi_dev_perf_level_set(int32_t dv_ind, rsmi_dev_perf_level_t perf_level) { return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX return set_dev_value(amd::smi::kDevPerfLevel, dv_ind, perf_level); CATCH } @@ -1009,6 +1047,8 @@ rsmi_dev_gpu_clk_freq_get(uint32_t dv_ind, rsmi_clk_type_t clk_type, return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX + return get_frequencies(dev_type, dv_ind, f); CATCH @@ -1035,6 +1075,9 @@ rsmi_dev_gpu_clk_freq_set(uint32_t dv_ind, rsmi_frequencies_t freqs; TRY + + DEVICE_MUTEX + ret = rsmi_dev_gpu_clk_freq_get(dv_ind, clk_type, &freqs); if (ret != RSMI_STATUS_SUCCESS) { @@ -1247,6 +1290,8 @@ rsmi_dev_name_get(uint32_t dv_ind, char *name, size_t len) { return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX + ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_DEVICE); if (ret != RSMI_STATUS_SUCCESS) { return ret; @@ -1265,6 +1310,8 @@ rsmi_dev_subsystem_name_get(uint32_t dv_ind, char *name, size_t len) { return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX + ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_SUBSYS); return ret; CATCH @@ -1279,6 +1326,7 @@ rsmi_dev_vendor_name_get(uint32_t dv_ind, char *name, size_t len) { return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX ret = get_dev_name_from_id(dv_ind, name, len, NAME_STR_VENDOR); return ret; CATCH @@ -1294,6 +1342,8 @@ rsmi_dev_pci_bandwidth_get(uint32_t dv_ind, rsmi_pcie_bandwidth_t *b) { return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX + return get_frequencies(amd::smi::kDevPCIEClk, dv_ind, &b->transfer_rate, b->lanes); @@ -1306,6 +1356,8 @@ rsmi_dev_pci_bandwidth_set(uint32_t dv_ind, uint64_t bw_bitmask) { rsmi_pcie_bandwidth_t bws; TRY + + DEVICE_MUTEX ret = rsmi_dev_pci_bandwidth_get(dv_ind, &bws); if (ret != RSMI_STATUS_SUCCESS) { @@ -1346,6 +1398,9 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, rsmi_status_t ret; std::string val_str; + + DEVICE_MUTEX + ret = get_dev_value_line(amd::smi::kDevPCIEThruPut, dv_ind, &val_str); if (ret != RSMI_STATUS_SUCCESS) { @@ -1435,6 +1490,8 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, mon_type = amd::smi::kMonInvalid; } + DEVICE_MUTEX + ret = get_dev_mon_value(mon_type, dv_ind, sensor_ind, temperature); return ret; @@ -1453,6 +1510,8 @@ rsmi_dev_fan_speed_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed) { ++sensor_ind; // fan sysfs files have 1-based indices + DEVICE_MUTEX + ret = get_dev_mon_value(amd::smi::kMonFanSpeed, dv_ind, sensor_ind, speed); return ret; @@ -1470,6 +1529,8 @@ rsmi_dev_fan_rpms_get(uint32_t dv_ind, uint32_t sensor_ind, int64_t *speed) { rsmi_status_t ret; + DEVICE_MUTEX + ret = get_dev_mon_value(amd::smi::kMonFanRPMs, dv_ind, sensor_ind, speed); return ret; @@ -1484,6 +1545,8 @@ rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind) { ++sensor_ind; // fan sysfs files have 1-based indices + DEVICE_MUTEX + ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 2); @@ -1499,6 +1562,7 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { rsmi_status_t ret; uint64_t max_speed; + DEVICE_MUTEX ret = rsmi_dev_fan_speed_max_get(dv_ind, sensor_ind, &max_speed); @@ -1541,6 +1605,8 @@ rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_status_t ret; + DEVICE_MUTEX + ret = get_dev_mon_value(amd::smi::kMonMaxFanSpeed, dv_ind, sensor_ind, reinterpret_cast(max_speed)); @@ -1551,6 +1617,8 @@ rsmi_dev_fan_speed_max_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_status_t rsmi_dev_od_volt_info_get(uint32_t dv_ind, rsmi_od_volt_freq_data_t *odv) { TRY + DEVICE_MUTEX + rsmi_status_t ret = get_od_clk_volt_info(dv_ind, odv); return ret; @@ -1564,6 +1632,8 @@ rsmi_status_t rsmi_dev_od_volt_curve_regions_get(uint32_t dv_ind, if (buffer == nullptr || num_regions == nullptr || *num_regions == 0) { return RSMI_STATUS_INVALID_ARGS; } + + DEVICE_MUTEX rsmi_status_t ret = get_od_clk_volt_curve_regions(dv_ind, num_regions, buffer); return ret; @@ -1582,6 +1652,8 @@ rsmi_dev_power_max_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) { // ++sensor_ind; // power sysfs files have 1-based indices rsmi_status_t ret; + + DEVICE_MUTEX ret = get_power_mon_value(amd::smi::kPowerMaxGPUPower, dv_ind, power); return ret; @@ -1598,6 +1670,8 @@ rsmi_dev_power_ave_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *power) { ++sensor_ind; // power sysfs files have 1-based indices rsmi_status_t ret; + + DEVICE_MUTEX ret = get_dev_mon_value(amd::smi::kMonPowerAve, dv_ind, sensor_ind, power); return ret; @@ -1615,6 +1689,8 @@ rsmi_dev_power_cap_get(uint32_t dv_ind, uint32_t sensor_ind, uint64_t *cap) { ++sensor_ind; // power sysfs files have 1-based indices rsmi_status_t ret; + + DEVICE_MUTEX ret = get_dev_mon_value(amd::smi::kMonPowerCap, dv_ind, sensor_ind, cap); return ret; @@ -1633,6 +1709,8 @@ rsmi_dev_power_cap_range_get(uint32_t dv_ind, uint32_t sensor_ind, ++sensor_ind; // power sysfs files have 1-based indices rsmi_status_t ret; + + DEVICE_MUTEX ret = get_dev_mon_value(amd::smi::kMonPowerCapMax, dv_ind, sensor_ind, max); if (ret == RSMI_STATUS_SUCCESS) { @@ -1650,6 +1728,7 @@ rsmi_dev_power_cap_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t cap) { rsmi_status_t ret; uint64_t min, max; + DEVICE_MUTEX ret = rsmi_dev_power_cap_range_get(dv_ind, sensor_ind, &max, &min); if (ret != RSMI_STATUS_SUCCESS) { @@ -1678,6 +1757,7 @@ rsmi_dev_power_profile_presets_get(uint32_t dv_ind, uint32_t sensor_ind, ++sensor_ind; // power sysfs files have 1-based indices + DEVICE_MUTEX rsmi_status_t ret = get_power_profiles(dv_ind, status, nullptr); return ret; CATCH @@ -1689,6 +1769,7 @@ rsmi_dev_power_profile_set(uint32_t dv_ind, uint32_t sensor_ind, TRY ++sensor_ind; // power sysfs files have 1-based indices + DEVICE_MUTEX rsmi_status_t ret = set_power_profile(dv_ind, profile); return ret; CATCH @@ -1722,6 +1803,8 @@ rsmi_dev_memory_total_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, assert(!"Unexpected memory type"); return RSMI_STATUS_INVALID_ARGS; } + + DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, total); return ret; @@ -1755,6 +1838,8 @@ rsmi_dev_memory_usage_get(uint32_t dv_ind, rsmi_memory_type_t mem_type, assert(!"Unexpected memory type"); return RSMI_STATUS_INVALID_ARGS; } + + DEVICE_MUTEX ret = get_dev_value_int(mem_type_file, dv_ind, used); return ret; @@ -1847,6 +1932,8 @@ rsmi_status_t rsmi_dev_busy_percent_get(uint32_t dv_ind, uint32_t *busy_percent) { TRY std::string val_str; + + DEVICE_MUTEX rsmi_status_t ret = get_dev_value_str(amd::smi::kDevUsage, dv_ind, &val_str); if (ret != RSMI_STATUS_SUCCESS) { @@ -1871,6 +1958,8 @@ rsmi_dev_vbios_version_get(uint32_t dv_ind, char *vbios, uint32_t len) { TRY GET_DEV_FROM_INDX std::string val_str; + + DEVICE_MUTEX int ret = dev->readDevInfo(amd::smi::kDevVBiosVer, &val_str); if (ret != 0) { diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index 2a8cbc3f23..f50fc319ee 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -41,6 +41,10 @@ * */ +#include +#include +#include + #include #include #include @@ -55,6 +59,11 @@ #include "rocm_smi/rocm_smi_main.h" #include "rocm_smi/rocm_smi_device.h" #include "rocm_smi/rocm_smi.h" +#include "rocm_smi/rocm_smi_exception.h" + +extern "C" { +#include "shared_mutex.h" // NOLINT +}; namespace amd { namespace smi { @@ -154,9 +163,26 @@ static bool isRegularFile(std::string fname) { Device::Device(std::string p, RocmSMI_env_vars const *e) : path_(p), env_(e) { monitor_ = nullptr; + + // Get the device name + size_t i = path_.rfind('/', path_.length()); + std::string dev = path_.substr(i + 1, path_.length() - i); + + std::string m_name("/rocm_smi_"); + m_name += dev; + m_name += '_'; + m_name += std::to_string(geteuid()); + + mutex_ = shared_mutex_init(m_name.c_str(), 0777); + + if (mutex_.ptr == nullptr) { + throw amd::smi::rsmi_exception(RSMI_INITIALIZATION_ERROR, + "Failed to create shared mem. mutex."); + } } Device:: ~Device() { + shared_mutex_close(mutex_); } template diff --git a/projects/amdsmi/src/shared_mutex/LICENSE b/projects/amdsmi/src/shared_mutex/LICENSE new file mode 100644 index 0000000000..d85e0d6d9f --- /dev/null +++ b/projects/amdsmi/src/shared_mutex/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Oleg Yamnikov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.c b/projects/amdsmi/src/shared_mutex/shared_mutex.c new file mode 100755 index 0000000000..33c4d38729 --- /dev/null +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.c @@ -0,0 +1,131 @@ +#include "shared_mutex.h" +#include // errno, ENOENT +#include // O_RDWR, O_CREATE +#include // NAME_MAX +#include // shm_open, shm_unlink, mmap, munmap, + // PROT_READ, PROT_WRITE, MAP_SHARED, MAP_FAILED +#include // ftruncate, close +#include // perror +#include // malloc, free +#include // strcpy + +shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { + shared_mutex_t mutex = {NULL, 0, NULL, 0}; + errno = 0; + + // Open existing shared memory object, or create one. + // Two separate calls are needed here, to mark fact of creation + // for later initialization of pthread mutex. + mutex.shm_fd = shm_open(name, O_RDWR, mode); + if (errno == ENOENT) { + mutex.shm_fd = shm_open(name, O_RDWR|O_CREAT, mode); + mutex.created = 1; + // Change permissions of shared memory, so every body can access it. Avoiding the umask of shm_open + if (fchmod(mutex.shm_fd, mode) != 0) { + perror("fchmod"); + } + } + if (mutex.shm_fd == -1) { + perror("shm_open"); + return mutex; + } + + // Truncate shared memory segment so it would contain + // pthread_mutex_t AND the ref. count + if (ftruncate(mutex.shm_fd, sizeof(pthread_mutex_t)) != 0) { + perror("ftruncate"); + return mutex; + } + + // Map pthread mutex into the shared memory. + void *addr = mmap( + NULL, + sizeof(pthread_mutex_t), + PROT_READ|PROT_WRITE, + MAP_SHARED, + mutex.shm_fd, + 0 + ); + if (addr == MAP_FAILED) { + perror("mmap"); + return mutex; + } + + if (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL) { + // Something is out of sync. Unlink shm and start over. + if (shm_unlink(name)) { + mutex.shm_fd = 0; + perror("shm_unlink"); + } + free(mutex.name); + + return shared_mutex_init(name, mode); + } + + pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; + + if (mutex.created) { + pthread_mutexattr_t attr; + if (pthread_mutexattr_init(&attr)) { + perror("pthread_mutexattr_init"); + return mutex; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)) { + perror("pthread_mutexattr_setpshared"); + return mutex; + } + + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { + perror("pthread_mutexattr_settype"); + return mutex; + } + if (pthread_mutex_init(mutex_ptr, &attr)) { + perror("pthread_mutex_init"); + return mutex; + } + } + + mutex.ptr = mutex_ptr; + mutex.name = (char *)malloc(NAME_MAX+1); + strcpy(mutex.name, name); + return mutex; +} + +int shared_mutex_close(shared_mutex_t mutex) { + if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) { + perror("munmap"); + return -1; + } + mutex.ptr = NULL; + if (close(mutex.shm_fd)) { + perror("close"); + return -1; + } + mutex.shm_fd = 0; + free(mutex.name); + + return 0; +} + +int shared_mutex_destroy(shared_mutex_t mutex) { + if ((errno = pthread_mutex_destroy(mutex.ptr))) { + perror("pthread_mutex_destroy"); + return -1; + } + if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) { + perror("munmap"); + return -1; + } + mutex.ptr = NULL; + if (close(mutex.shm_fd)) { + perror("close"); + return -1; + } + mutex.shm_fd = 0; + if (shm_unlink(mutex.name)) { + perror("shm_unlink"); + return -1; + } + free(mutex.name); + return 0; +} diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.h b/projects/amdsmi/src/shared_mutex/shared_mutex.h new file mode 100755 index 0000000000..18e70bd6de --- /dev/null +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.h @@ -0,0 +1,67 @@ +#ifndef SRC_SHARED_MUTEX_SHARED_MUTEX_H_ +#define SRC_SHARED_MUTEX_SHARED_MUTEX_H_ + +#include + +#include // pthread_mutex_t, pthread_mutexattr_t, + // pthread_mutexattr_init, pthread_mutexattr_setpshared, + // pthread_mutex_init, pthread_mutex_destroy + +// Structure of a shared mutex. +typedef struct shared_mutex_t { + pthread_mutex_t *ptr; // Pointer to the pthread mutex and + // shared memory segment. + int shm_fd; // Descriptor of shared memory object. + char* name; // Name of the mutex and associated + // shared memory object. + int created; // Equals 1 (true) if initialization + // of this structure caused creation + // of a new shared mutex. + // Equals 0 (false) if this mutex was + // just retrieved from shared memory. +} shared_mutex_t; + +// Initialize a new shared mutex with given `name`. If a mutex +// with such name exists in the system, it will be loaded. +// Otherwise a new mutes will by created. +// +// In case of any error, it will be printed into the standard output +// and the returned structure will have `ptr` equal `NULL`. +// `errno` wil not be reset in such case, so you may used it. +// +// **NOTE:** In case when the mutex appears to be uncreated, +// this function becomes *non-thread-safe*. If multiple threads +// call it at one moment, there occur several race conditions, +// in which one call might recreate another's shared memory +// object or rewrite another's pthread mutex in the shared memory. +// There is no workaround currently, except to run first +// initialization only before multi-threaded or multi-process +// functionality. +shared_mutex_t shared_mutex_init(const char *name, mode_t mode); + +// Close access to the shared mutex and free all the resources, +// used by the structure. +// +// Returns 0 in case of success. If any error occurs, it will be +// printed into the standard output and the function will return -1. +// `errno` wil not be reset in such case, so you may used it. +// +// **NOTE:** It will not destroy the mutex. The mutex would not +// only be available to other processes using it right now, +// but also to any process which might want to use it later on. +// For complete desctruction use `shared_mutex_destroy` instead. +// +// **NOTE:** It will not unlock locked mutex. +int shared_mutex_close(shared_mutex_t mutex); + +// Close and destroy shared mutex. +// Any open pointers to it will be invalidated. +// +// Returns 0 in case of success. If any error occurs, it will be +// printed into the standard output and the function will return -1. +// `errno` wil not be reset in such case, so you may used it. +// +// **NOTE:** It will not unlock locked mutex. +int shared_mutex_destroy(shared_mutex_t mutex); + +#endif // SRC_SHARED_MUTEX_SHARED_MUTEX_H_ From e9c3d5dec663b6580f0b3a79694bb9be99b270e3 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Mon, 6 May 2019 11:26:40 -0500 Subject: [PATCH 2/2] Added rsmi_dev_pci_replay_counter_get() Also, added code to destroy/recreate mutex if we can't get a lock within 3 seconds, when shared memory mutex is initialized. [ROCm/amdsmi commit: 34c977bd06fe11754f7f768a9b03096a0645e22a] --- projects/amdsmi/include/rocm_smi/rocm_smi.h | 18 +++++++++++++ .../amdsmi/include/rocm_smi/rocm_smi_device.h | 4 ++- .../amdsmi/include/rocm_smi/rocm_smi_utils.h | 12 ++++----- projects/amdsmi/src/rocm_smi.cc | 17 ++++++++++-- projects/amdsmi/src/rocm_smi_device.cc | 5 +++- .../amdsmi/src/shared_mutex/shared_mutex.c | 18 +++++++++++-- .../amdsmi/src/shared_mutex/shared_mutex.h | 3 +++ .../rocm_smi_test/functional/err_cnt_read.cc | 2 +- .../functional/pci_read_write.cc | 26 ++++++++++++++----- .../rocm_smi_test/functional/temp_read.cc | 2 +- .../functional/volt_freq_curv_read.cc | 2 +- 11 files changed, 88 insertions(+), 21 deletions(-) diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi.h b/projects/amdsmi/include/rocm_smi/rocm_smi.h index f6b20536ae..7319d13df7 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi.h @@ -744,6 +744,24 @@ rsmi_status_t rsmi_dev_pci_id_get(uint32_t dv_ind, uint64_t *bdfid); rsmi_status_t rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz); +/** + * @brief Get PCIe replay counter + * + * @details Given a device index @p dv_ind and a pointer to a uint64_t @p + * counter, this function will write the sum of the number of NAK's received + * by the GPU and the NAK's generated by the GPU to memory pointed to by @p + * counter. + * + * @param[in] dv_ind a device index + * + * @param[inout] counter a pointer to uint64_t to which the sum of the NAK's + * received and generated by the GPU is written + * + * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. + */ +rsmi_status_t rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, + uint64_t *counter); + /** @} */ // end of PCIeQuer /*****************************************************************************/ /** @defgroup PCIeCont PCIe Control diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h index 7197814209..c7fcad7dc1 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_device.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_device.h @@ -56,7 +56,7 @@ #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" extern "C" { -#include "shared_mutex.h" +#include "shared_mutex.h" // NOLINT }; namespace amd { @@ -90,6 +90,7 @@ enum DevInfoTypes { kDevMemUsedGTT, kDevMemUsedVisVRAM, kDevMemUsedVRAM, + kDevPCIEReplayCount, }; class Device { @@ -116,6 +117,7 @@ class Device { void set_bdfid(uint64_t val) {bdfid_ = val;} uint64_t get_bdfid(void) const {return bdfid_;} pthread_mutex_t *mutex(void) {return mutex_.ptr;} + private: std::shared_ptr monitor_; std::shared_ptr power_monitor_; diff --git a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h index 8949eaac7a..4612fadce4 100755 --- a/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h +++ b/projects/amdsmi/include/rocm_smi/rocm_smi_utils.h @@ -66,23 +66,23 @@ int ReadSysfsStr(std::string path, std::string *retStr); int WriteSysfsStr(std::string path, std::string val); struct pthread_wrap { - public: - pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} + public: + explicit pthread_wrap(pthread_mutex_t &p_mut) : mutex_(p_mut) {} void Acquire() { pthread_mutex_lock(&mutex_); } void Release() { pthread_mutex_unlock(&mutex_); } - private: + private: pthread_mutex_t& mutex_; }; struct ScopedPthread { - ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) { + explicit ScopedPthread(pthread_wrap& mutex) : pthrd_ref_(mutex) { pthrd_ref_.Acquire(); - }; + } ~ScopedPthread() { pthrd_ref_.Release(); } - private: + private: ScopedPthread(const ScopedPthread&); pthread_wrap& pthrd_ref_; diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index 3a8cec8e31..3200197cfa 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -120,8 +120,8 @@ static rsmi_status_t errno_to_rsmi_status(uint32_t err) { switch (err) { case 0: return RSMI_STATUS_SUCCESS; case EACCES: return RSMI_STATUS_PERMISSION; - case EPERM: return RSMI_STATUS_NOT_SUPPORTED; - case ENOENT: + case EPERM: + case ENOENT: return RSMI_STATUS_NOT_SUPPORTED; case EISDIR: return RSMI_STATUS_FILE_ERROR; default: return RSMI_STATUS_UNKNOWN_ERROR; } @@ -2044,3 +2044,16 @@ rsmi_version_str_get(rsmi_sw_component_t component, char *ver_str, CATCH } + +rsmi_status_t +rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { + TRY + + DEVICE_MUTEX + rsmi_status_t ret; + + ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter); + return ret; + + CATCH +} diff --git a/projects/amdsmi/src/rocm_smi_device.cc b/projects/amdsmi/src/rocm_smi_device.cc index f50fc319ee..fbbe6fdc97 100755 --- a/projects/amdsmi/src/rocm_smi_device.cc +++ b/projects/amdsmi/src/rocm_smi_device.cc @@ -96,6 +96,7 @@ static const char *kDevMemTotVRAMFName = "mem_info_vram_total"; static const char *kDevMemUsedGTTFName = "mem_info_gtt_used"; static const char *kDevMemUsedVisVRAMFName = "mem_info_vis_vram_used"; static const char *kDevMemUsedVRAMFName = "mem_info_vram_used"; +static const char *kDevPCIEReplayCountFName = "pcie_replay_count"; // Strings that are found within sysfs files static const char *kDevPerfLevelAutoStr = "auto"; @@ -136,6 +137,7 @@ static const std::map kDevAttribNameMap = { {kDevMemUsedGTT, kDevMemUsedGTTFName}, {kDevMemUsedVisVRAM, kDevMemUsedVisVRAMFName}, {kDevMemUsedVRAM, kDevMemUsedVRAMFName}, + {kDevPCIEReplayCount, kDevPCIEReplayCountFName}, }; static const std::map kDevPerfLvlMap = { @@ -202,7 +204,7 @@ int Device::openSysfsFileStream(DevInfoTypes type, T *fs, const char *str) { DBG_FILE_ERROR(sysfs_path, str); if (!isRegularFile(sysfs_path)) { - return EISDIR; + return ENOENT; } fs->open(sysfs_path); @@ -367,6 +369,7 @@ int Device::readDevInfo(DevInfoTypes type, uint64_t *val) { case kDevMemUsedGTT: case kDevMemUsedVisVRAM: case kDevMemUsedVRAM: + case kDevPCIEReplayCount: ret = readDevInfoStr(type, &tempStr); RET_IF_NONZERO(ret); *val = std::stoul(tempStr, 0); diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.c b/projects/amdsmi/src/shared_mutex/shared_mutex.c index 33c4d38729..af25e1ba90 100755 --- a/projects/amdsmi/src/shared_mutex/shared_mutex.c +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.c @@ -8,6 +8,7 @@ #include // perror #include // malloc, free #include // strcpy +#include // clock_gettime shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { shared_mutex_t mutex = {NULL, 0, NULL, 0}; @@ -51,7 +52,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { return mutex; } - if (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL) { + pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; + + // Make sure the mutex wasn't left in a locked state. If we can't + // acquire it in 3 sec., re-do everything. + struct timespec expireTime; + clock_gettime(CLOCK_REALTIME, &expireTime); + expireTime.tv_sec += 3; + + int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); + + if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) { // Something is out of sync. Unlink shm and start over. if (shm_unlink(name)) { mutex.shm_fd = 0; @@ -60,9 +71,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { free(mutex.name); return shared_mutex_init(name, mode); + } else { + if (pthread_mutex_unlock(mutex_ptr)) { + perror("pthread_mutex_unlock"); + } } - pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; if (mutex.created) { pthread_mutexattr_t attr; diff --git a/projects/amdsmi/src/shared_mutex/shared_mutex.h b/projects/amdsmi/src/shared_mutex/shared_mutex.h index 18e70bd6de..77aff5509d 100755 --- a/projects/amdsmi/src/shared_mutex/shared_mutex.h +++ b/projects/amdsmi/src/shared_mutex/shared_mutex.h @@ -1,3 +1,6 @@ +// NOLINT(legal/copyright) +// See LICENSE file + #ifndef SRC_SHARED_MUTEX_SHARED_MUTEX_H_ #define SRC_SHARED_MUTEX_SHARED_MUTEX_H_ diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc index 8f6a9ac26a..03c5b70df5 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/err_cnt_read.cc @@ -96,7 +96,7 @@ void TestErrCntRead::Run(void) { err = rsmi_dev_ecc_enabled_get(i, &enabled_mask); if (err == RSMI_STATUS_NOT_SUPPORTED) { std::cout << - "\t**Error Count Enabled Mask for is not supported on this machine" + "\t**Error Count Enabled Mask get is not supported on this machine" << std::endl; } else { CHK_ERR_ASRT(err) diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc b/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc index 6e63e495a4..07fb96890a 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/pci_read_write.cc @@ -89,13 +89,26 @@ void TestPciReadWrite::Run(void) { rsmi_status_t ret; rsmi_pcie_bandwidth_t bw; uint32_t freq_bitmask; - uint64_t sent, received, max_pkt_sz; + uint64_t sent, received, max_pkt_sz, u64int; TestBase::Run(); for (uint32_t dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) { PrintDeviceHeader(dv_ind); + ret = rsmi_dev_pci_replay_counter_get(dv_ind, &u64int); + + if (ret == RSMI_STATUS_NOT_SUPPORTED) { + std::cout << + "\t**rsmi_dev_pci_replay_counter_get() is not supported" + " on this machine" << std::endl; + } else { + CHK_ERR_ASRT(ret) + IF_VERB(STANDARD) { + std::cout << "\tPCIe Replay Counter: " << u64int << std::endl; + } + } + ret = rsmi_dev_pci_throughput_get(dv_ind, &sent, &received, &max_pkt_sz); if (ret == RSMI_STATUS_NOT_SUPPORTED) { std::cout << "TEST FAILURE: Current PCIe throughput is not detected. " @@ -106,7 +119,7 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "PCIe Throughput (1 sec.): " << std::endl; + std::cout << "\tPCIe Throughput (1 sec.): " << std::endl; std::cout << "\t\tSent: " << sent << " bytes" << std::endl; std::cout << "\t\tReceived: " << received << " bytes" << std::endl; std::cout << "\t\tMax Packet Size: " << max_pkt_sz << " bytes" << @@ -125,7 +138,8 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "Initial PCIe is " << bw.transfer_rate.current << std::endl; + std::cout << "\tInitial PCIe is " << bw.transfer_rate.current << + std::endl; } // First set the bitmask to all supported bandwidths @@ -141,7 +155,7 @@ void TestPciReadWrite::Run(void) { freq_bm_str.size()-1)); IF_VERB(STANDARD) { - std::cout << "Setting bandwidth mask to " << "0b" << freq_bm_str << + std::cout << "\tSetting bandwidth mask to " << "0b" << freq_bm_str << " ..." << std::endl; } ret = rsmi_dev_pci_bandwidth_set(dv_ind, freq_bitmask); @@ -151,9 +165,9 @@ void TestPciReadWrite::Run(void) { CHK_ERR_ASRT(ret) IF_VERB(STANDARD) { - std::cout << "Bandwidth is now index " << bw.transfer_rate.current << + std::cout << "\tBandwidth is now index " << bw.transfer_rate.current << std::endl; - std::cout << "Resetting mask to all bandwidths." << std::endl; + std::cout << "\tResetting mask to all bandwidths." << std::endl; } ret = rsmi_dev_pci_bandwidth_set(dv_ind, 0xFFFFFFFF); CHK_ERR_ASRT(ret) diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc index 9f04a0deeb..e68185e689 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/temp_read.cc @@ -99,7 +99,7 @@ void TestTempRead::Run(void) { err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64); if (err != RSMI_STATUS_SUCCESS) { - if (err == RSMI_STATUS_FILE_ERROR) { + if (err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**" << label << ": " << "Not supported on this machine" << std::endl; diff --git a/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index 4f99880c2d..60b8080a1a 100755 --- a/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/projects/amdsmi/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -155,7 +155,7 @@ void TestVoltCurvRead::Run(void) { err = rsmi_dev_od_volt_info_get(i, &odv); if (err == RSMI_STATUS_FILE_ERROR || - err == RSMI_STATUS_NOT_YET_IMPLEMENTED) { + err == RSMI_STATUS_NOT_SUPPORTED) { IF_VERB(STANDARD) { std::cout << "\t**rsmi_dev_od_volt_info_get: Not supported on this machine"