From c10a21670237975f94585d3b19f22b89a18adcf3 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Mon, 22 Apr 2024 23:59:19 -0500 Subject: [PATCH 1/2] fix: [SWDEV-458101] [rocm/rocm_smi_lib] Drops checks that are invalid with the new pp_od_clk_voltage format Code changes related to the following: * get_od_clk_volt_info() * get_od_clk_volt_curve_regions() Change-Id: I5ebe23aa0ed4ea77d5ab5a94ce34ad9b1b51281f Signed-off-by: Oliveira, Daniel [ROCm/rocm_smi_lib commit: e95d80f7ef9c8141cf5ee11e961894198b965a8d] --- projects/rocm-smi-lib/src/rocm_smi.cc | 54 +++------------------------ 1 file changed, 6 insertions(+), 48 deletions(-) diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 486e2c94c8..fb38823d13 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -1265,6 +1265,7 @@ an additional value followed by * at index 1 and max value at index 2. constexpr uint32_t kOD_SCLK_label_array_index = 0; constexpr uint32_t kOD_MCLK_label_array_index = kOD_SCLK_label_array_index + 3; + constexpr uint32_t kOD_VDDC_CURVE_label_array_index = kOD_MCLK_label_array_index + 2; constexpr uint32_t kOD_OD_RANGE_label_array_index = @@ -1273,6 +1274,7 @@ constexpr uint32_t kOD_VDDC_CURVE_start_index = kOD_OD_RANGE_label_array_index + 3; // constexpr uint32_t kOD_VDDC_CURVE_num_lines = // kOD_VDDC_CURVE_start_index + 4; +constexpr uint32_t kMIN_VALID_LINES = 2; static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, rsmi_od_volt_freq_data_t *p) { @@ -1292,7 +1294,7 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. - if (val_vec.size() < 2) { + if (val_vec.size() < kMIN_VALID_LINES) { return RSMI_STATUS_NOT_YET_IMPLEMENTED; } @@ -1303,6 +1305,7 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_UNEXPECTED_DATA; } + // find last_item but skip empty lines int last_item = val_vec.size()-1; while (val_vec[last_item].empty() || val_vec[last_item][0] == 0) @@ -1348,39 +1351,9 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) { return RSMI_STATUS_UNEXPECTED_SIZE; } - assert(val_vec[kOD_VDDC_CURVE_label_array_index] == "OD_VDDC_CURVE:"); - if (val_vec[kOD_VDDC_CURVE_label_array_index] != "OD_VDDC_CURVE:") { - return RSMI_STATUS_UNEXPECTED_DATA; - } - - uint32_t tmp = kOD_VDDC_CURVE_label_array_index + 1; - if (val_vec.size() < (tmp + RSMI_NUM_VOLTAGE_CURVE_POINTS)) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } - for (uint32_t i = 0; i < RSMI_NUM_VOLTAGE_CURVE_POINTS; ++i) { - freq_volt_string_to_point(val_vec[tmp + i], &(p->curve.vc_points[i])); - } - - if (val_vec.size() < (kOD_OD_RANGE_label_array_index + 2)) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } - assert(val_vec[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); - if (val_vec[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") { - return RSMI_STATUS_UNEXPECTED_DATA; - } - - od_value_pair_str_to_range(val_vec[kOD_OD_RANGE_label_array_index + 1], - &(p->sclk_freq_limits)); - od_value_pair_str_to_range(val_vec[kOD_OD_RANGE_label_array_index + 2], - &(p->mclk_freq_limits)); - - assert((val_vec.size() - kOD_VDDC_CURVE_start_index)%2 == 0); - if ((val_vec.size() - kOD_VDDC_CURVE_start_index)%2 != 0) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } p->num_regions = - static_cast((val_vec.size() - kOD_VDDC_CURVE_start_index) / 2); + static_cast((val_vec.size()) / 2); return RSMI_STATUS_SUCCESS; CATCH @@ -1653,28 +1626,13 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, uint32_t val_vec_size = static_cast(val_vec.size()); assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); - assert((val_vec_size - kOD_VDDC_CURVE_start_index)%2 == 0); ss << __PRETTY_FUNCTION__ << " | val_vec_size = " << std::dec << val_vec_size << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; LOG_DEBUG(ss); - if (((val_vec_size - kOD_VDDC_CURVE_start_index) <= 0) || - (((val_vec_size - kOD_VDDC_CURVE_start_index)%2 != 0))) { - ss << __PRETTY_FUNCTION__ << " | Issue: od vdd curve returned unexpected " - << "data" << "; returning " - << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_SIZE); - LOG_ERROR(ss); - throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__); - } - - *num_regions = std::min((val_vec_size - kOD_VDDC_CURVE_start_index) / 2, - *num_regions); - - for (uint32_t i=0; i < *num_regions; ++i) { - get_vc_region(kOD_VDDC_CURVE_start_index + i*2, &val_vec, p + i); - } + *num_regions = std::min((val_vec_size) / 2, *num_regions); return RSMI_STATUS_SUCCESS; CATCH From ada2cf681d92335e038eb37e99312c2fa5d19caf Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Fri, 19 Apr 2024 13:16:29 -0500 Subject: [PATCH 2/2] Support thread only mutex The environment variable RSMI_MUTEX_THREAD_ONLY=1 to enable thread only mutex. The RSMI_INIT_FLAG_THRAD_ONLY_MUTEX can also be pass to rsmi_init() to enable thread only mutex. Change-Id: I2d9844039b774e386f03bb9bb130d8c342504ea6 [ROCm/rocm_smi_lib commit: 6ff95e55da7064ef7d705d26c58111a94172c107] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 1 + .../include/rocm_smi/rocm_smi_main.h | 3 + .../third_party/shared_mutex/shared_mutex.cc | 217 ++++++++++++------ 3 files changed, 152 insertions(+), 69 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index 74c83ca561..963d1e0c28 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -149,6 +149,7 @@ typedef enum { //!< information can be retrieved. By //!< default, only AMD devices are //!< enumerated by RSMI. + RSMI_INIT_FLAG_THRAD_ONLY_MUTEX = 0x400000000000000, //!< The mutex limit to thread RSMI_INIT_FLAG_RESRV_TEST1 = 0x800000000000000, //!< Reserved for test } rsmi_init_flags_t; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h index c957f5123c..0a66ea227c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_main.h @@ -88,6 +88,9 @@ class RocmSMI { void set_init_options(uint64_t options) {init_options_ = options;} uint64_t init_options() const {return init_options_;} + uint64_t is_thread_only_mutex() const { + return init_options_ & RSMI_INIT_FLAG_THRAD_ONLY_MUTEX; + } uint32_t euid() const {return euid_;} diff --git a/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc b/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc index a1f711fd12..16da69d5a2 100755 --- a/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc +++ b/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc @@ -41,6 +41,22 @@ THE SOFTWARE. #include #include "rocm_smi/rocm_smi_exception.h" +#include "rocm_smi/rocm_smi_main.h" + +#define THREAD_ONLY_ENV_VAR "RSMI_MUTEX_THREAD_ONLY" +#define MUTEX_TIME_OUT_ENV_VAR "RSMI_MUTEX_TIMEOUT" +#define DEFAULT_MUTEX_TIMEOUT_SECONDS 5 + +static int GetEnvVarUInteger(const char *ev_str) { + ev_str = getenv(ev_str); + + if (ev_str) { + const int ret = atoi(ev_str); + return ret; + } + + return -1; +} // find which processes are using the file by searching /proc/*/fd static std::vector lsof(const char* filename) { @@ -81,9 +97,53 @@ static std::vector lsof(const char* filename) { return matched_process; } -shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { - shared_mutex_t mutex = {NULL, 0, NULL, 0}; +// RSMI_MUTEX_THREAD_ONLY = 1 to enable thread safe mutex +shared_mutex_t init_thread_safe_only(const char *name) { + shared_mutex_t mutex = {nullptr, 0, nullptr, 0}; errno = 0; + mutex.shm_fd = -1; + mutex.created = 0; + pthread_mutex_t *mutex_ptr = new pthread_mutex_t(); + + pthread_mutexattr_t attr; + if (pthread_mutexattr_init(&attr)) { + perror("pthread_mutexattr_init"); + return mutex; + } + if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)) { + perror("pthread_mutexattr_setpshared"); + return mutex; + } + + if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) { + perror("pthread_mutexattr_settype"); + return mutex; + } + if (pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)) { + perror("pthread_mutexattr_setrobust"); + return mutex; + } + if (pthread_mutex_init(mutex_ptr, &attr)) { + perror("pthread_mutex_init"); + return mutex; + } + + mutex.ptr = mutex_ptr; + mutex.name = reinterpret_cast(malloc(NAME_MAX+1)); + (void)snprintf(mutex.name, NAME_MAX + 1, "%s", name); + return mutex; +} + +shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { + shared_mutex_t mutex = {nullptr, 0, nullptr, 0}; + errno = 0; + + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + + if (GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 || smi.is_thread_only_mutex()) { + fprintf(stderr, "rocm-smi: using thread safe only mutex\n"); + return init_thread_safe_only(name); + } // Open existing shared memory object, or create one. // Two separate calls are needed here, to mark fact of creation @@ -112,7 +172,7 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { // Map pthread mutex into the shared memory. void *addr = mmap( - NULL, + nullptr, sizeof(pthread_mutex_t), PROT_READ|PROT_WRITE, MAP_SHARED, @@ -130,67 +190,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { // acquire it in 5 sec., re-do everything. struct timespec expireTime; clock_gettime(CLOCK_REALTIME, &expireTime); - expireTime.tv_sec += 5; + int time_out = GetEnvVarUInteger(MUTEX_TIME_OUT_ENV_VAR); + time_out = time_out < DEFAULT_MUTEX_TIMEOUT_SECONDS ? DEFAULT_MUTEX_TIMEOUT_SECONDS: time_out; + expireTime.tv_sec += time_out; - int ret; - - ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); pid_t cur_pid = getpid(); - - if (ret == EOWNERDEAD) { - ret = pthread_mutex_consistent(mutex_ptr); - // This function should not fail unless mutex_ptr is not robust - // or mutex_ptr is not in an inconsistent state. Neither scenario - // should ever be true at this point in the code. - assert(!ret); - - // ...but if there are undocumented failure cases for - // pthread_mutex_consistent() handle them for release builds. - if (ret) { - fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret); - free(mutex.name); - - throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__); - return mutex; - } - - fprintf(stderr, "%s: %d detected dead process, and make mutex consistent.\n", name, cur_pid); - // The mutex is locked even if EOWNERDEAD was returned,and need to unlock it. - if (pthread_mutex_unlock(mutex_ptr)) { - perror("pthread_mutex_unlock"); - } - } else if (ret || (mutex.created == 0 && - reinterpret_cast(addr)->ptr == NULL)) { - // Something is out of sync. - - // When process crash before unlock the mutex, the mutex is in bad status. - // reset the mutex if no process is using it, and then retry lock - if (!retried) { - std::vector ids = lsof(name); - if (ids.size() == 0) { // no process is using it - fprintf(stderr, "%s: %d re-init the mutex since no one use it.\n", name, cur_pid); - memset(mutex_ptr, 0, sizeof(pthread_mutex_t)); - // Set mutex.created == 1 so that it can be initialized latter. - mutex.created = 1; - free(mutex.name); - return shared_mutex_init(name, mode, true); - } - } - - fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret); - perror("Failed to initialize RSMI device mutex after 5 seconds. Previous " - "execution may not have shutdown cleanly. To fix problem, stop all " - "rocm_smi programs, and then delete the rocm_smi* shared memory files in" - " /dev/shm."); - free(mutex.name); - - throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__); - return mutex; - } else { - if (pthread_mutex_unlock(mutex_ptr)) { - perror("pthread_mutex_unlock"); - } - } + int ret; // also need to set the attribute when retried as the mutex is re-initialized. if (mutex.created || retried) { @@ -218,6 +223,70 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { } } + ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); + + + if (ret == EOWNERDEAD) { + ret = pthread_mutex_consistent(mutex_ptr); + // This function should not fail unless mutex_ptr is not robust + // or mutex_ptr is not in an inconsistent state. Neither scenario + // should ever be true at this point in the code. + assert(!ret); + + // ...but if there are undocumented failure cases for + // pthread_mutex_consistent() handle them for release builds. + if (ret) { + fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret); + free(mutex.name); + + throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__); + return mutex; + } + + fprintf(stderr, "%d detected dead process, and making mutex %s consistent.\n", + cur_pid, name); + // The mutex is locked even if EOWNERDEAD was returned,and need to unlock it. + if (pthread_mutex_unlock(mutex_ptr)) { + perror("pthread_mutex_unlock"); + } + } else if (ret || (mutex.created == 0 && + reinterpret_cast(addr)->ptr == nullptr)) { + // Something is out of sync. + + // When process crash before unlock the mutex, the mutex is in bad status. + // reset the mutex if no process is using it, and then retry lock + if (!retried) { + std::vector ids = lsof(name); + if (ids.size() == 0) { // no process is using it + fprintf(stderr, "%d re-init the mutex %s since no one use it. ret:%d ptr:%p\n", + cur_pid, name, ret, reinterpret_cast(addr)->ptr); + memset(mutex_ptr, 0, sizeof(pthread_mutex_t)); + // Set mutex.created == 1 so that it can be initialized latter. + mutex.created = 1; + free(mutex.name); + return shared_mutex_init(name, mode, true); + } + } + + fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret); + perror("Failed to initialize RSMI device mutex after 5 seconds. Previous " + "execution may not have shutdown cleanly. To fix problem, stop all " + "rocm_smi programs, and then delete the rocm_smi* shared memory files in" + " /dev/shm."); + free(mutex.name); + + throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__); + return mutex; + } else { + const int ret = pthread_mutex_unlock(mutex_ptr); + if (ret) { + perror("pthread_mutex_unlock"); + fprintf(stderr, "%d init_mutex %s: unlock timed lock, ret: %d\n", cur_pid, name, ret); + } + } + + + mutex.ptr = mutex_ptr; mutex.name = reinterpret_cast(malloc(NAME_MAX+1)); (void)snprintf(mutex.name, NAME_MAX + 1, "%s", name); @@ -225,12 +294,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { } int shared_mutex_close(shared_mutex_t mutex) { - if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + const bool is_thread_only = GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 || + smi.is_thread_only_mutex(); + if (is_thread_only) { + delete mutex.ptr; + } else if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { perror("munmap"); return -1; } - mutex.ptr = NULL; - if (close(mutex.shm_fd)) { + mutex.ptr = nullptr; + if (!is_thread_only && close(mutex.shm_fd)) { perror("close"); return -1; } @@ -241,21 +315,26 @@ int shared_mutex_close(shared_mutex_t mutex) { } int shared_mutex_destroy(shared_mutex_t mutex) { + amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); + const bool is_thread_only = GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 || + smi.is_thread_only_mutex(); if ((errno = pthread_mutex_destroy(mutex.ptr))) { perror("pthread_mutex_destroy"); return -1; } - if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { + if (is_thread_only) { + delete mutex.ptr; + } else if (munmap(reinterpret_cast(mutex.ptr), sizeof(pthread_mutex_t))) { perror("munmap"); return -1; } - mutex.ptr = NULL; - if (close(mutex.shm_fd)) { + mutex.ptr = nullptr; + if (!is_thread_only && close(mutex.shm_fd)) { perror("close"); return -1; } mutex.shm_fd = 0; - if (shm_unlink(mutex.name)) { + if (!is_thread_only && shm_unlink(mutex.name)) { perror("shm_unlink"); return -1; }