Merge amd-staging into amd-master 20240423
Change-Id: I55800aeb88b62f85aae2715a9d314aa4530d8576
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rocm_smi_lib commit: 913d6fd173]
Этот коммит содержится в:
@@ -149,6 +149,7 @@ typedef enum {
|
||||
//!< information can be retrieved. By
|
||||
//!< default, only AMD devices are
|
||||
//!< enumerated by RSMI.
|
||||
RSMI_INIT_FLAG_THRAD_ONLY_MUTEX = 0x400000000000000, //!< The mutex limit to thread
|
||||
RSMI_INIT_FLAG_RESRV_TEST1 = 0x800000000000000, //!< Reserved for test
|
||||
} rsmi_init_flags_t;
|
||||
|
||||
|
||||
@@ -88,6 +88,9 @@ class RocmSMI {
|
||||
|
||||
void set_init_options(uint64_t options) {init_options_ = options;}
|
||||
uint64_t init_options() const {return init_options_;}
|
||||
uint64_t is_thread_only_mutex() const {
|
||||
return init_options_ & RSMI_INIT_FLAG_THRAD_ONLY_MUTEX;
|
||||
}
|
||||
|
||||
uint32_t euid() const {return euid_;}
|
||||
|
||||
|
||||
@@ -1265,6 +1265,7 @@ an additional value followed by * at index 1 and max value at index 2.
|
||||
constexpr uint32_t kOD_SCLK_label_array_index = 0;
|
||||
constexpr uint32_t kOD_MCLK_label_array_index =
|
||||
kOD_SCLK_label_array_index + 3;
|
||||
|
||||
constexpr uint32_t kOD_VDDC_CURVE_label_array_index =
|
||||
kOD_MCLK_label_array_index + 2;
|
||||
constexpr uint32_t kOD_OD_RANGE_label_array_index =
|
||||
@@ -1273,6 +1274,7 @@ constexpr uint32_t kOD_VDDC_CURVE_start_index =
|
||||
kOD_OD_RANGE_label_array_index + 3;
|
||||
// constexpr uint32_t kOD_VDDC_CURVE_num_lines =
|
||||
// kOD_VDDC_CURVE_start_index + 4;
|
||||
constexpr uint32_t kMIN_VALID_LINES = 2;
|
||||
|
||||
static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
|
||||
rsmi_od_volt_freq_data_t *p) {
|
||||
@@ -1292,7 +1294,7 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
|
||||
|
||||
// This is a work-around to handle systems where kDevPowerODVoltage is not
|
||||
// fully supported yet.
|
||||
if (val_vec.size() < 2) {
|
||||
if (val_vec.size() < kMIN_VALID_LINES) {
|
||||
return RSMI_STATUS_NOT_YET_IMPLEMENTED;
|
||||
}
|
||||
|
||||
@@ -1303,6 +1305,7 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
|
||||
// find last_item but skip empty lines
|
||||
int last_item = val_vec.size()-1;
|
||||
while (val_vec[last_item].empty() || val_vec[last_item][0] == 0)
|
||||
@@ -1348,39 +1351,9 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind,
|
||||
if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) {
|
||||
return RSMI_STATUS_UNEXPECTED_SIZE;
|
||||
}
|
||||
assert(val_vec[kOD_VDDC_CURVE_label_array_index] == "OD_VDDC_CURVE:");
|
||||
if (val_vec[kOD_VDDC_CURVE_label_array_index] != "OD_VDDC_CURVE:") {
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
uint32_t tmp = kOD_VDDC_CURVE_label_array_index + 1;
|
||||
if (val_vec.size() < (tmp + RSMI_NUM_VOLTAGE_CURVE_POINTS)) {
|
||||
return RSMI_STATUS_UNEXPECTED_SIZE;
|
||||
}
|
||||
for (uint32_t i = 0; i < RSMI_NUM_VOLTAGE_CURVE_POINTS; ++i) {
|
||||
freq_volt_string_to_point(val_vec[tmp + i], &(p->curve.vc_points[i]));
|
||||
}
|
||||
|
||||
if (val_vec.size() < (kOD_OD_RANGE_label_array_index + 2)) {
|
||||
return RSMI_STATUS_UNEXPECTED_SIZE;
|
||||
}
|
||||
assert(val_vec[kOD_OD_RANGE_label_array_index] == "OD_RANGE:");
|
||||
if (val_vec[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") {
|
||||
return RSMI_STATUS_UNEXPECTED_DATA;
|
||||
}
|
||||
|
||||
od_value_pair_str_to_range(val_vec[kOD_OD_RANGE_label_array_index + 1],
|
||||
&(p->sclk_freq_limits));
|
||||
od_value_pair_str_to_range(val_vec[kOD_OD_RANGE_label_array_index + 2],
|
||||
&(p->mclk_freq_limits));
|
||||
|
||||
assert((val_vec.size() - kOD_VDDC_CURVE_start_index)%2 == 0);
|
||||
if ((val_vec.size() - kOD_VDDC_CURVE_start_index)%2 != 0) {
|
||||
return RSMI_STATUS_UNEXPECTED_SIZE;
|
||||
}
|
||||
|
||||
p->num_regions =
|
||||
static_cast<uint32_t>((val_vec.size() - kOD_VDDC_CURVE_start_index) / 2);
|
||||
static_cast<uint32_t>((val_vec.size()) / 2);
|
||||
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
CATCH
|
||||
@@ -1653,28 +1626,13 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind,
|
||||
|
||||
uint32_t val_vec_size = static_cast<uint32_t>(val_vec.size());
|
||||
assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0);
|
||||
assert((val_vec_size - kOD_VDDC_CURVE_start_index)%2 == 0);
|
||||
|
||||
ss << __PRETTY_FUNCTION__
|
||||
<< " | val_vec_size = " << std::dec
|
||||
<< val_vec_size
|
||||
<< " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index;
|
||||
LOG_DEBUG(ss);
|
||||
if (((val_vec_size - kOD_VDDC_CURVE_start_index) <= 0) ||
|
||||
(((val_vec_size - kOD_VDDC_CURVE_start_index)%2 != 0))) {
|
||||
ss << __PRETTY_FUNCTION__ << " | Issue: od vdd curve returned unexpected "
|
||||
<< "data" << "; returning "
|
||||
<< getRSMIStatusString(RSMI_STATUS_UNEXPECTED_SIZE);
|
||||
LOG_ERROR(ss);
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_SIZE, __FUNCTION__);
|
||||
}
|
||||
|
||||
*num_regions = std::min((val_vec_size - kOD_VDDC_CURVE_start_index) / 2,
|
||||
*num_regions);
|
||||
|
||||
for (uint32_t i=0; i < *num_regions; ++i) {
|
||||
get_vc_region(kOD_VDDC_CURVE_start_index + i*2, &val_vec, p + i);
|
||||
}
|
||||
*num_regions = std::min((val_vec_size) / 2, *num_regions);
|
||||
|
||||
return RSMI_STATUS_SUCCESS;
|
||||
CATCH
|
||||
|
||||
+148
-69
@@ -41,6 +41,22 @@ THE SOFTWARE.
|
||||
#include <vector>
|
||||
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
#include "rocm_smi/rocm_smi_main.h"
|
||||
|
||||
#define THREAD_ONLY_ENV_VAR "RSMI_MUTEX_THREAD_ONLY"
|
||||
#define MUTEX_TIME_OUT_ENV_VAR "RSMI_MUTEX_TIMEOUT"
|
||||
#define DEFAULT_MUTEX_TIMEOUT_SECONDS 5
|
||||
|
||||
static int GetEnvVarUInteger(const char *ev_str) {
|
||||
ev_str = getenv(ev_str);
|
||||
|
||||
if (ev_str) {
|
||||
const int ret = atoi(ev_str);
|
||||
return ret;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
// find which processes are using the file by searching /proc/*/fd
|
||||
static std::vector<std::string> lsof(const char* filename) {
|
||||
@@ -81,9 +97,53 @@ static std::vector<std::string> lsof(const char* filename) {
|
||||
return matched_process;
|
||||
}
|
||||
|
||||
shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
shared_mutex_t mutex = {NULL, 0, NULL, 0};
|
||||
// RSMI_MUTEX_THREAD_ONLY = 1 to enable thread safe mutex
|
||||
shared_mutex_t init_thread_safe_only(const char *name) {
|
||||
shared_mutex_t mutex = {nullptr, 0, nullptr, 0};
|
||||
errno = 0;
|
||||
mutex.shm_fd = -1;
|
||||
mutex.created = 0;
|
||||
pthread_mutex_t *mutex_ptr = new pthread_mutex_t();
|
||||
|
||||
pthread_mutexattr_t attr;
|
||||
if (pthread_mutexattr_init(&attr)) {
|
||||
perror("pthread_mutexattr_init");
|
||||
return mutex;
|
||||
}
|
||||
if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)) {
|
||||
perror("pthread_mutexattr_setpshared");
|
||||
return mutex;
|
||||
}
|
||||
|
||||
if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) {
|
||||
perror("pthread_mutexattr_settype");
|
||||
return mutex;
|
||||
}
|
||||
if (pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)) {
|
||||
perror("pthread_mutexattr_setrobust");
|
||||
return mutex;
|
||||
}
|
||||
if (pthread_mutex_init(mutex_ptr, &attr)) {
|
||||
perror("pthread_mutex_init");
|
||||
return mutex;
|
||||
}
|
||||
|
||||
mutex.ptr = mutex_ptr;
|
||||
mutex.name = reinterpret_cast<char *>(malloc(NAME_MAX+1));
|
||||
(void)snprintf(mutex.name, NAME_MAX + 1, "%s", name);
|
||||
return mutex;
|
||||
}
|
||||
|
||||
shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
shared_mutex_t mutex = {nullptr, 0, nullptr, 0};
|
||||
errno = 0;
|
||||
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
|
||||
if (GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 || smi.is_thread_only_mutex()) {
|
||||
fprintf(stderr, "rocm-smi: using thread safe only mutex\n");
|
||||
return init_thread_safe_only(name);
|
||||
}
|
||||
|
||||
// Open existing shared memory object, or create one.
|
||||
// Two separate calls are needed here, to mark fact of creation
|
||||
@@ -112,7 +172,7 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
|
||||
// Map pthread mutex into the shared memory.
|
||||
void *addr = mmap(
|
||||
NULL,
|
||||
nullptr,
|
||||
sizeof(pthread_mutex_t),
|
||||
PROT_READ|PROT_WRITE,
|
||||
MAP_SHARED,
|
||||
@@ -130,67 +190,12 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
// acquire it in 5 sec., re-do everything.
|
||||
struct timespec expireTime;
|
||||
clock_gettime(CLOCK_REALTIME, &expireTime);
|
||||
expireTime.tv_sec += 5;
|
||||
int time_out = GetEnvVarUInteger(MUTEX_TIME_OUT_ENV_VAR);
|
||||
time_out = time_out < DEFAULT_MUTEX_TIMEOUT_SECONDS ? DEFAULT_MUTEX_TIMEOUT_SECONDS: time_out;
|
||||
expireTime.tv_sec += time_out;
|
||||
|
||||
int ret;
|
||||
|
||||
ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
pid_t cur_pid = getpid();
|
||||
|
||||
if (ret == EOWNERDEAD) {
|
||||
ret = pthread_mutex_consistent(mutex_ptr);
|
||||
// This function should not fail unless mutex_ptr is not robust
|
||||
// or mutex_ptr is not in an inconsistent state. Neither scenario
|
||||
// should ever be true at this point in the code.
|
||||
assert(!ret);
|
||||
|
||||
// ...but if there are undocumented failure cases for
|
||||
// pthread_mutex_consistent() handle them for release builds.
|
||||
if (ret) {
|
||||
fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret);
|
||||
free(mutex.name);
|
||||
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: %d detected dead process, and make mutex consistent.\n", name, cur_pid);
|
||||
// The mutex is locked even if EOWNERDEAD was returned,and need to unlock it.
|
||||
if (pthread_mutex_unlock(mutex_ptr)) {
|
||||
perror("pthread_mutex_unlock");
|
||||
}
|
||||
} else if (ret || (mutex.created == 0 &&
|
||||
reinterpret_cast<shared_mutex_t *>(addr)->ptr == NULL)) {
|
||||
// Something is out of sync.
|
||||
|
||||
// When process crash before unlock the mutex, the mutex is in bad status.
|
||||
// reset the mutex if no process is using it, and then retry lock
|
||||
if (!retried) {
|
||||
std::vector<std::string> ids = lsof(name);
|
||||
if (ids.size() == 0) { // no process is using it
|
||||
fprintf(stderr, "%s: %d re-init the mutex since no one use it.\n", name, cur_pid);
|
||||
memset(mutex_ptr, 0, sizeof(pthread_mutex_t));
|
||||
// Set mutex.created == 1 so that it can be initialized latter.
|
||||
mutex.created = 1;
|
||||
free(mutex.name);
|
||||
return shared_mutex_init(name, mode, true);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret);
|
||||
perror("Failed to initialize RSMI device mutex after 5 seconds. Previous "
|
||||
"execution may not have shutdown cleanly. To fix problem, stop all "
|
||||
"rocm_smi programs, and then delete the rocm_smi* shared memory files in"
|
||||
" /dev/shm.");
|
||||
free(mutex.name);
|
||||
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
} else {
|
||||
if (pthread_mutex_unlock(mutex_ptr)) {
|
||||
perror("pthread_mutex_unlock");
|
||||
}
|
||||
}
|
||||
int ret;
|
||||
|
||||
// also need to set the attribute when retried as the mutex is re-initialized.
|
||||
if (mutex.created || retried) {
|
||||
@@ -218,6 +223,70 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
}
|
||||
}
|
||||
|
||||
ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
|
||||
|
||||
if (ret == EOWNERDEAD) {
|
||||
ret = pthread_mutex_consistent(mutex_ptr);
|
||||
// This function should not fail unless mutex_ptr is not robust
|
||||
// or mutex_ptr is not in an inconsistent state. Neither scenario
|
||||
// should ever be true at this point in the code.
|
||||
assert(!ret);
|
||||
|
||||
// ...but if there are undocumented failure cases for
|
||||
// pthread_mutex_consistent() handle them for release builds.
|
||||
if (ret) {
|
||||
fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret);
|
||||
free(mutex.name);
|
||||
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%d detected dead process, and making mutex %s consistent.\n",
|
||||
cur_pid, name);
|
||||
// The mutex is locked even if EOWNERDEAD was returned,and need to unlock it.
|
||||
if (pthread_mutex_unlock(mutex_ptr)) {
|
||||
perror("pthread_mutex_unlock");
|
||||
}
|
||||
} else if (ret || (mutex.created == 0 &&
|
||||
reinterpret_cast<shared_mutex_t *>(addr)->ptr == nullptr)) {
|
||||
// Something is out of sync.
|
||||
|
||||
// When process crash before unlock the mutex, the mutex is in bad status.
|
||||
// reset the mutex if no process is using it, and then retry lock
|
||||
if (!retried) {
|
||||
std::vector<std::string> ids = lsof(name);
|
||||
if (ids.size() == 0) { // no process is using it
|
||||
fprintf(stderr, "%d re-init the mutex %s since no one use it. ret:%d ptr:%p\n",
|
||||
cur_pid, name, ret, reinterpret_cast<shared_mutex_t *>(addr)->ptr);
|
||||
memset(mutex_ptr, 0, sizeof(pthread_mutex_t));
|
||||
// Set mutex.created == 1 so that it can be initialized latter.
|
||||
mutex.created = 1;
|
||||
free(mutex.name);
|
||||
return shared_mutex_init(name, mode, true);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret);
|
||||
perror("Failed to initialize RSMI device mutex after 5 seconds. Previous "
|
||||
"execution may not have shutdown cleanly. To fix problem, stop all "
|
||||
"rocm_smi programs, and then delete the rocm_smi* shared memory files in"
|
||||
" /dev/shm.");
|
||||
free(mutex.name);
|
||||
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
} else {
|
||||
const int ret = pthread_mutex_unlock(mutex_ptr);
|
||||
if (ret) {
|
||||
perror("pthread_mutex_unlock");
|
||||
fprintf(stderr, "%d init_mutex %s: unlock timed lock, ret: %d\n", cur_pid, name, ret);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
mutex.ptr = mutex_ptr;
|
||||
mutex.name = reinterpret_cast<char *>(malloc(NAME_MAX+1));
|
||||
(void)snprintf(mutex.name, NAME_MAX + 1, "%s", name);
|
||||
@@ -225,12 +294,17 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) {
|
||||
}
|
||||
|
||||
int shared_mutex_close(shared_mutex_t mutex) {
|
||||
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
const bool is_thread_only = GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 ||
|
||||
smi.is_thread_only_mutex();
|
||||
if (is_thread_only) {
|
||||
delete mutex.ptr;
|
||||
} else if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
perror("munmap");
|
||||
return -1;
|
||||
}
|
||||
mutex.ptr = NULL;
|
||||
if (close(mutex.shm_fd)) {
|
||||
mutex.ptr = nullptr;
|
||||
if (!is_thread_only && close(mutex.shm_fd)) {
|
||||
perror("close");
|
||||
return -1;
|
||||
}
|
||||
@@ -241,21 +315,26 @@ int shared_mutex_close(shared_mutex_t mutex) {
|
||||
}
|
||||
|
||||
int shared_mutex_destroy(shared_mutex_t mutex) {
|
||||
amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance();
|
||||
const bool is_thread_only = GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 ||
|
||||
smi.is_thread_only_mutex();
|
||||
if ((errno = pthread_mutex_destroy(mutex.ptr))) {
|
||||
perror("pthread_mutex_destroy");
|
||||
return -1;
|
||||
}
|
||||
if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
if (is_thread_only) {
|
||||
delete mutex.ptr;
|
||||
} else if (munmap(reinterpret_cast<void *>(mutex.ptr), sizeof(pthread_mutex_t))) {
|
||||
perror("munmap");
|
||||
return -1;
|
||||
}
|
||||
mutex.ptr = NULL;
|
||||
if (close(mutex.shm_fd)) {
|
||||
mutex.ptr = nullptr;
|
||||
if (!is_thread_only && close(mutex.shm_fd)) {
|
||||
perror("close");
|
||||
return -1;
|
||||
}
|
||||
mutex.shm_fd = 0;
|
||||
if (shm_unlink(mutex.name)) {
|
||||
if (!is_thread_only && shm_unlink(mutex.name)) {
|
||||
perror("shm_unlink");
|
||||
return -1;
|
||||
}
|
||||
|
||||
Ссылка в новой задаче
Block a user