Shared mutex fixes and improvements
* Don't make different shared memory mutexes for different users * Don't delete (unlink) the shared mutex file if the mutex initialization fails. This may mess up other processes that are using it. Instead, print a message on how to resolve the situation, and then throw an error. Note, this situation comes up when debug builds (usually) either assert() or otherwise end execution without a proper clean up. * Remove cpplint from shared_mutex code Change-Id: I5f8ca6150cac5c2405fb97007516da345093f966
Tento commit je obsažen v:
@@ -132,7 +132,7 @@ set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_power_mon.cc")
|
||||
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_utils.cc")
|
||||
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_counters.cc")
|
||||
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/rocm_smi_kfd.cc")
|
||||
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.c")
|
||||
set(SMI_SRC_LIST ${SMI_SRC_LIST} "${SRC_DIR}/shared_mutex/shared_mutex.cc")
|
||||
|
||||
set(SMI_INC_LIST "${INC_DIR}/rocm_smi_device.h")
|
||||
set(SMI_INC_LIST ${SMI_INC_LIST} "${INC_DIR}/rocm_smi_main.h")
|
||||
|
||||
@@ -118,6 +118,9 @@ typedef enum {
|
||||
//!< input
|
||||
RSMI_STATUS_UNEXPECTED_DATA, //!< The data read or provided to
|
||||
//!< function is not what was expected
|
||||
RSMI_STATUS_RESOURCE_BUSY, //!< A function timed out trying to
|
||||
//!< a resource. This could be a
|
||||
//!< mutex time-out.
|
||||
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF, //!< An unknown error occurred
|
||||
} rsmi_status_t;
|
||||
|
||||
|
||||
@@ -58,9 +58,7 @@
|
||||
#include "rocm_smi/rocm_smi_common.h"
|
||||
#include "rocm_smi/rocm_smi.h"
|
||||
#include "rocm_smi/rocm_smi_counters.h"
|
||||
extern "C" {
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
};
|
||||
#include "shared_mutex.h" //NOLINT
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
|
||||
@@ -64,10 +64,7 @@
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_utils.h"
|
||||
#include "rocm_smi/rocm_smi_kfd.h"
|
||||
|
||||
extern "C" {
|
||||
#include "shared_mutex.h" // NOLINT
|
||||
};
|
||||
|
||||
namespace amd {
|
||||
namespace smi {
|
||||
@@ -474,8 +471,6 @@ Device::Device(std::string p, RocmSMI_env_vars const *e) : path_(p), env_(e) {
|
||||
|
||||
std::string m_name("/rocm_smi_");
|
||||
m_name += dev;
|
||||
m_name += '_';
|
||||
m_name += std::to_string(geteuid());
|
||||
|
||||
mutex_ = shared_mutex_init(m_name.c_str(), 0777);
|
||||
|
||||
|
||||
@@ -1,14 +1,17 @@
|
||||
#include "shared_mutex.h"
|
||||
#include <errno.h> // errno, ENOENT
|
||||
#include <fcntl.h> // O_RDWR, O_CREATE
|
||||
#include <linux/limits.h> // NAME_MAX
|
||||
#include <sys/mman.h> // shm_open, shm_unlink, mmap, munmap,
|
||||
// NOLINT(legal/copyright)
|
||||
#include "shared_mutex.h" // NOLINT(build/include)
|
||||
#include <errno.h> // errno, ENOENT
|
||||
#include <fcntl.h> // O_RDWR, O_CREATE
|
||||
#include <linux/limits.h> // NAME_MAX
|
||||
#include <sys/mman.h> // shm_open, shm_unlink, mmap, munmap,
|
||||
// PROT_READ, PROT_WRITE, MAP_SHARED, MAP_FAILED
|
||||
#include <unistd.h> // ftruncate, close
|
||||
#include <stdio.h> // perror
|
||||
#include <stdlib.h> // malloc, free
|
||||
#include <string.h> // strcpy
|
||||
#include <time.h> // clock_gettime
|
||||
#include <unistd.h> // ftruncate, close
|
||||
#include <stdio.h> // perror
|
||||
#include <stdlib.h> // malloc, free
|
||||
#include <string.h> // strcpy
|
||||
#include <time.h> // clock_gettime
|
||||
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
|
||||
shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
shared_mutex_t mutex = {NULL, 0, NULL, 0};
|
||||
@@ -21,7 +24,8 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
if (errno == ENOENT) {
|
||||
mutex.shm_fd = shm_open(name, O_RDWR|O_CREAT, mode);
|
||||
mutex.created = 1;
|
||||
// Change permissions of shared memory, so every body can access it. Avoiding the umask of shm_open
|
||||
// Change permissions of shared memory, so everybody can access it.
|
||||
// Avoiding the umask of shm_open
|
||||
if (fchmod(mutex.shm_fd, mode) != 0) {
|
||||
perror("fchmod");
|
||||
}
|
||||
@@ -45,39 +49,40 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
PROT_READ|PROT_WRITE,
|
||||
MAP_SHARED,
|
||||
mutex.shm_fd,
|
||||
0
|
||||
);
|
||||
0);
|
||||
|
||||
if (addr == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
return mutex;
|
||||
}
|
||||
|
||||
pthread_mutex_t *mutex_ptr = (pthread_mutex_t *)addr;
|
||||
pthread_mutex_t *mutex_ptr = reinterpret_cast<pthread_mutex_t *>(addr);
|
||||
|
||||
// Make sure the mutex wasn't left in a locked state. If we can't
|
||||
// acquire it in 3 sec., re-do everything.
|
||||
struct timespec expireTime;
|
||||
clock_gettime(CLOCK_REALTIME, &expireTime);
|
||||
expireTime.tv_sec += 3;
|
||||
expireTime.tv_sec += 5;
|
||||
|
||||
int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
|
||||
if (ret || (mutex.created == 0 && ((shared_mutex_t *)addr)->ptr == NULL)) {
|
||||
// Something is out of sync. Unlink shm and start over.
|
||||
if (shm_unlink(name)) {
|
||||
mutex.shm_fd = 0;
|
||||
perror("shm_unlink");
|
||||
}
|
||||
|
||||
if (ret || (mutex.created == 0 &&
|
||||
reinterpret_cast<shared_mutex_t *>(addr)->ptr == NULL)) {
|
||||
// Something is out of sync.
|
||||
perror("Failed to initialize RSMI device mutex after 5 seconds. Previous "
|
||||
"execution may not have shutdown cleanly. To fix problem, stop all "
|
||||
"rocm_smi programs, and then delete the rocm_smi* shared memory files in"
|
||||
" /dev/shm.");
|
||||
free(mutex.name);
|
||||
|
||||
return shared_mutex_init(name, mode);
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_RESOURCE_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
} else {
|
||||
if (pthread_mutex_unlock(mutex_ptr)) {
|
||||
perror("pthread_mutex_unlock");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (mutex.created) {
|
||||
pthread_mutexattr_t attr;
|
||||
if (pthread_mutexattr_init(&attr)) {
|
||||
@@ -88,7 +93,7 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
perror("pthread_mutexattr_setpshared");
|
||||
return mutex;
|
||||
}
|
||||
|
||||
|
||||
if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) {
|
||||
perror("pthread_mutexattr_settype");
|
||||
return mutex;
|
||||
@@ -98,15 +103,15 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
return mutex;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
mutex.ptr = mutex_ptr;
|
||||
mutex.name = (char *)malloc(NAME_MAX+1);
|
||||
strcpy(mutex.name, name);
|
||||
mutex.name = reinterpret_cast<char *>(malloc(NAME_MAX+1));
|
||||
(void)snprintf(mutex.name, NAME_MAX + 1, "%s", name);
|
||||
return mutex;
|
||||
}
|
||||
|
||||
int shared_mutex_close(shared_mutex_t mutex) {
|
||||
if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) {
|
||||
int shared_mutex_close(shared_mutex_t mutex) {
|
||||
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
perror("munmap");
|
||||
return -1;
|
||||
}
|
||||
@@ -126,7 +131,7 @@ int shared_mutex_destroy(shared_mutex_t mutex) {
|
||||
perror("pthread_mutex_destroy");
|
||||
return -1;
|
||||
}
|
||||
if (munmap((void *)mutex.ptr, sizeof(pthread_mutex_t))) {
|
||||
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
perror("munmap");
|
||||
return -1;
|
||||
}
|
||||
Odkázat v novém úkolu
Zablokovat Uživatele