From 52196caaee8f06312f4bb96ff31c24b86b7d1705 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Mon, 6 Apr 2020 17:08:33 -0500 Subject: [PATCH] Shared mutex fixes and improvements * Don't make different shared memory mutexes for different users * Don't delete (unlink) the shared mutex file if the mutex initialization fails. This may mess up other processes that are using it. Instead, print a message on how to resolve the situation, and then throw an error. Note, this situation comes up when debug builds (usually) either assert() or otherwise end execution without a proper clean up. * Remove cpplint from shared_mutex code Change-Id: I5f8ca6150cac5c2405fb97007516da345093f966 --- CMakeLists.txt | 2 +- include/rocm_smi/rocm_smi.h | 3 + include/rocm_smi/rocm_smi_device.h | 4 +- src/rocm_smi_device.cc | 5 -- .../{shared_mutex.c => shared_mutex.cc} | 67 ++++++++++--------- 5 files changed, 41 insertions(+), 40 deletions(-) rename src/shared_mutex/{shared_mutex.c => shared_mutex.cc} (63%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2737a2ef6b..6baef76322 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,7 @@ set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_counters.cc") set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_kfd.cc") -set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.c") +set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.cc") set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h") set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_main.h") diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 24813ce03f..6a564ea4c1 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -118,6 +118,9 @@ typedef enum { //!< input RSMI_STATUS_UNEXPECTED_DATA, //!< The data read or provided to //!< function is not what was expected + RSMI_STATUS_RESOURCE_BUSY, //!< A function timed out trying to + //!< a resource. This could be a + //!< mutex time-out. RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred } rsmi_status_t; diff --git a/include/rocm_smi/rocm_smi_device.h b/include/rocm_smi/rocm_smi_device.h index 560f0200a3..83395ea668 100755 --- a/include/rocm_smi/rocm_smi_device.h +++ b/include/rocm_smi/rocm_smi_device.h @@ -58,9 +58,7 @@ #include "rocm_smi/rocm_smi_common.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi/rocm_smi_counters.h" -extern "C" { -#include "shared_mutex.h" // NOLINT -}; +#include "shared_mutex.h" //NOLINT namespace amd { namespace smi { diff --git a/src/rocm_smi_device.cc b/src/rocm_smi_device.cc index 8bbb53010a..131fd273c4 100755 --- a/src/rocm_smi_device.cc +++ b/src/rocm_smi_device.cc @@ -64,10 +64,7 @@ #include "rocm_smi/rocm_smi_exception.h" #include "rocm_smi/rocm_smi_utils.h" #include "rocm_smi/rocm_smi_kfd.h" - -extern "C" { #include "shared_mutex.h" // NOLINT -}; namespace amd { namespace smi { @@ -474,8 +471,6 @@ Device::Device(std::string p, RocmSMI_env_vars const *e) : path_(p), env_(e) { std::string m_name("/rocm_smi_"); m_name += dev; - m_name += '_'; - m_name += std::to_string(geteuid()); mutex_ = shared_mutex_init(m_name.c_str(), 0777); diff --git a/src/shared_mutex/shared_mutex.c b/src/shared_mutex/shared_mutex.cc similarity index 63% rename from src/shared_mutex/shared_mutex.c rename to src/shared_mutex/shared_mutex.cc index af25e1ba90..db17d1769d 100755 --- a/src/shared_mutex/shared_mutex.c +++ b/src/shared_mutex/shared_mutex.cc @@ -1,14 +1,17 @@ -#include "shared_mutex.h" -#include // errno, ENOENT -#include // O_RDWR, O_CREATE -#include // NAME_MAX -#include // shm_open, shm_unlink, mmap, munmap, +// NOLINT(legal/copyright) +#include "shared_mutex.h" // NOLINT(build/include) +#include // errno, ENOENT +#include // O_RDWR, O_CREATE +#include // NAME_MAX +#include // shm_open, shm_unlink, mmap, munmap, // PROT_READ, PROT_WRITE, MAP_SHARED, MAP_FAILED -#include // ftruncate, close -#include // perror -#include // malloc, free -#include // strcpy -#include // clock_gettime +#include // ftruncate, close +#include // perror +#include // malloc, free +#include // strcpy +#include // clock_gettime + +#include "rocm_smi/rocm_smi_exception.h" shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { shared_mutex_t mutex = {NULL, 0, NULL, 0}; @@ -21,7 +24,8 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { if (errno == ENOENT) { mutex.shm_fd = shm_open(name, O_RDWR|O_CREAT, mode); mutex.created = 1; - // Change permissions of shared memory, so every body can access it. Avoiding the umask of shm_open + // Change permissions of shared memory, so everybody can access it. + // Avoiding the umask of shm_open if (fchmod(mutex.shm_fd, mode) != 0) { perror("fchmod"); } @@ -45,39 +49,40 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { PROT_READ|PROT_WRITE, MAP_SHARED, mutex.shm_fd, - 0 - ); + 0); + if (addr == MAP_FAILED) { perror("mmap"); return mutex; } - pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr; + pthread_mutex_t *mutex_ptr = reinterpret_cast(addr); // Make sure the mutex wasn't left in a locked state. If we can't // acquire it in 3 sec., re-do everything. struct timespec expireTime; clock_gettime(CLOCK_REALTIME, &expireTime); - expireTime.tv_sec += 3; + expireTime.tv_sec += 5; int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); - - if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) { - // Something is out of sync. Unlink shm and start over. - if (shm_unlink(name)) { - mutex.shm_fd = 0; - perror("shm_unlink"); - } + + if (ret || (mutex.created == 0 && + reinterpret_cast(addr)->ptr == NULL)) { + // Something is out of sync. + perror("Failed to initialize RSMI device mutex after 5 seconds. Previous " + "execution may not have shutdown cleanly. To fix problem, stop all " + "rocm_smi programs, and then delete the rocm_smi* shared memory files in" + " /dev/shm."); free(mutex.name); - return shared_mutex_init(name, mode); + throw amd::smi::rsmi_exception(RSMI_STATUS_RESOURCE_BUSY, __FUNCTION__); + return mutex; } else { if (pthread_mutex_unlock(mutex_ptr)) { perror("pthread_mutex_unlock"); } } - if (mutex.created) { pthread_mutexattr_t attr; if (pthread_mutexattr_init(&attr)) { @@ -88,7 +93,7 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { perror("pthread_mutexattr_setpshared"); return mutex; } - + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { perror("pthread_mutexattr_settype"); return mutex; @@ -98,15 +103,15 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { return mutex; } } - + mutex.ptr = mutex_ptr; - mutex.name = (char *)malloc(NAME_MAX+1); - strcpy(mutex.name, name); + mutex.name = reinterpret_cast(malloc(NAME_MAX+1)); + (void)snprintf(mutex.name, NAME_MAX + 1, "%s", name); return mutex; } -int shared_mutex_close(shared_mutex_t mutex) { - if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) { +int shared_mutex_close(shared_mutex_t mutex) { + if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { perror("munmap"); return -1; } @@ -126,7 +131,7 @@ int shared_mutex_destroy(shared_mutex_t mutex) { perror("pthread_mutex_destroy"); return -1; } - if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) { + if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { perror("munmap"); return -1; }