Shared mutex fixes and improvements

* Don't make different shared memory mutexes for different users
* Don't delete (unlink) the shared mutex file if the mutex
  initialization fails. This may mess up other processes that
  are using it. Instead, print a message on how to resolve the
  situation, and then throw an error.

  Note, this situation comes up when debug builds (usually)
  either assert() or otherwise end execution without a proper
  clean up.
* Remove cpplint from shared_mutex code

Change-Id: I5f8ca6150cac5c2405fb97007516da345093f966
Tento commit je obsažen v:
Chris Freehill
2020-04-06 17:08:33 -05:00
rodič fd79e5c161
revize 52196caaee
5 změnil soubory, kde provedl 41 přidání a 40 odebrání
+1 -1
Zobrazit soubor
@@ -132,7 +132,7 @@ set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc")
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc")
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_counters.cc")
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_kfd.cc")
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.c")
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.cc")
set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h")
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_main.h")
+3
Zobrazit soubor
@@ -118,6 +118,9 @@ typedef enum {
//!< input
RSMI_STATUS_UNEXPECTED_DATA, //!< The data read or provided to
//!< function is not what was expected
RSMI_STATUS_RESOURCE_BUSY, //!< A function timed out trying to
//!< a resource. This could be a
//!< mutex time-out.
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
} rsmi_status_t;
+1 -3
Zobrazit soubor
@@ -58,9 +58,7 @@
#include "rocm_smi/rocm_smi_common.h"
#include "rocm_smi/rocm_smi.h"
#include "rocm_smi/rocm_smi_counters.h"
extern "C" {
#include "shared_mutex.h" // NOLINT
};
#include "shared_mutex.h" //NOLINT
namespace amd {
namespace smi {
-5
Zobrazit soubor
@@ -64,10 +64,7 @@
#include "rocm_smi/rocm_smi_exception.h"
#include "rocm_smi/rocm_smi_utils.h"
#include "rocm_smi/rocm_smi_kfd.h"
extern "C" {
#include "shared_mutex.h" // NOLINT
};
namespace amd {
namespace smi {
@@ -474,8 +471,6 @@ Device::Device(std::string p, RocmSMI_env_vars const *e) : path_(p), env_(e) {
std::string m_name("/rocm_smi_");
m_name += dev;
m_name += '_';
m_name += std::to_string(geteuid());
mutex_ = shared_mutex_init(m_name.c_str(), 0777);
+36 -31
Zobrazit soubor
@@ -1,14 +1,17 @@
#include "shared_mutex.h"
#include <errno.h> // errno, ENOENT
#include <fcntl.h> // O_RDWR, O_CREATE
#include <linux/limits.h> // NAME_MAX
#include <sys/mman.h> // shm_open, shm_unlink, mmap, munmap,
// NOLINT(legal/copyright)
#include "shared_mutex.h" // NOLINT(build/include)
#include <errno.h> // errno, ENOENT
#include <fcntl.h> // O_RDWR, O_CREATE
#include <linux/limits.h> // NAME_MAX
#include <sys/mman.h> // shm_open, shm_unlink, mmap, munmap,
// PROT_READ, PROT_WRITE, MAP_SHARED, MAP_FAILED
#include <unistd.h> // ftruncate, close
#include <stdio.h> // perror
#include <stdlib.h> // malloc, free
#include <string.h> // strcpy
#include <time.h> // clock_gettime
#include <unistd.h> // ftruncate, close
#include <stdio.h> // perror
#include <stdlib.h> // malloc, free
#include <string.h> // strcpy
#include <time.h> // clock_gettime
#include "rocm_smi/rocm_smi_exception.h"
shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
shared_mutex_t mutex = {NULL, 0, NULL, 0};
@@ -21,7 +24,8 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
if (errno == ENOENT) {
mutex.shm_fd = shm_open(name, O_RDWR|O_CREAT, mode);
mutex.created = 1;
// Change permissions of shared memory, so every body can access it. Avoiding the umask of shm_open
// Change permissions of shared memory, so everybody can access it.
// Avoiding the umask of shm_open
if (fchmod(mutex.shm_fd, mode) != 0) {
perror("fchmod");
}
@@ -45,39 +49,40 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
PROT_READ|PROT_WRITE,
MAP_SHARED,
mutex.shm_fd,
0
);
0);
if (addr == MAP_FAILED) {
perror("mmap");
return mutex;
}
pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;
pthread_mutex_t *mutex_ptr = reinterpret_cast<pthread_mutex_t *>(addr);
// Make sure the mutex wasn't left in a locked state. If we can't
// acquire it in 3 sec., re-do everything.
struct timespec expireTime;
clock_gettime(CLOCK_REALTIME, &expireTime);
expireTime.tv_sec += 3;
expireTime.tv_sec += 5;
int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) {
// Something is out of sync. Unlink shm and start over.
if (shm_unlink(name)) {
mutex.shm_fd = 0;
perror("shm_unlink");
}
if (ret || (mutex.created == 0 &&
reinterpret_cast<shared_mutex_t *>(addr)->ptr == NULL)) {
// Something is out of sync.
perror("Failed to initialize RSMI device mutex after 5 seconds. Previous "
"execution may not have shutdown cleanly. To fix problem, stop all "
"rocm_smi programs, and then delete the rocm_smi* shared memory files in"
" /dev/shm.");
free(mutex.name);
return shared_mutex_init(name, mode);
throw amd::smi::rsmi_exception(RSMI_STATUS_RESOURCE_BUSY, __FUNCTION__);
return mutex;
} else {
if (pthread_mutex_unlock(mutex_ptr)) {
perror("pthread_mutex_unlock");
}
}
if (mutex.created) {
pthread_mutexattr_t attr;
if (pthread_mutexattr_init(&attr)) {
@@ -88,7 +93,7 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
perror("pthread_mutexattr_setpshared");
return mutex;
}
if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) {
perror("pthread_mutexattr_settype");
return mutex;
@@ -98,15 +103,15 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
return mutex;
}
}
mutex.ptr = mutex_ptr;
mutex.name = (char *)malloc(NAME_MAX+1);
strcpy(mutex.name, name);
mutex.name = reinterpret_cast<char *>(malloc(NAME_MAX+1));
(void)snprintf(mutex.name, NAME_MAX + 1, "%s", name);
return mutex;
}
int shared_mutex_close(shared_mutex_t mutex) {
if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) {
int shared_mutex_close(shared_mutex_t mutex) {
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
perror("munmap");
return -1;
}
@@ -126,7 +131,7 @@ int shared_mutex_destroy(shared_mutex_t mutex) {
perror("pthread_mutex_destroy");
return -1;
}
if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) {
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
perror("munmap");
return -1;
}