Fix process killed while holding mutex
Previously, when a process holding a shared mutex was killed,
the next time an RSMI application was started, it would not be
able to obtain the mutex--the application would have to exit.
This fix uses pthread_mutexattr_setrobust() to detect this
situation and act accordingly.
Also, add some missing, needed mutexes and move mutexes
closer to where the protected resource is used.
Change-Id: Icfdc3a246f4cfa3fd008e3f13472199abd76fd35
[ROCm/rocm_smi_lib commit: f4938b0ac9]
This commit is contained in:
@@ -1078,6 +1078,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level,
|
||||
{RSMI_CLK_TYPE_SYS, "s"},
|
||||
{RSMI_CLK_TYPE_MEM, "m"},
|
||||
};
|
||||
DEVICE_MUTEX
|
||||
|
||||
// Set perf. level to manual so that we can then set the power profile
|
||||
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
|
||||
@@ -1127,6 +1128,8 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint,
|
||||
TRY
|
||||
rsmi_status_t ret;
|
||||
|
||||
DEVICE_MUTEX
|
||||
|
||||
// Set perf. level to manual so that we can then set the power profile
|
||||
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
@@ -1402,6 +1405,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block,
|
||||
return RSMI_STATUS_INVALID_ARGS;
|
||||
}
|
||||
|
||||
DEVICE_MUTEX
|
||||
return get_dev_value_int(dev_type, dv_ind, fw_version);
|
||||
CATCH
|
||||
}
|
||||
@@ -2735,9 +2739,9 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
|
||||
TRY
|
||||
CHK_SUPPORT_NAME_ONLY(counter)
|
||||
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret;
|
||||
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter);
|
||||
return ret;
|
||||
|
||||
@@ -2747,11 +2751,11 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
|
||||
rsmi_status_t
|
||||
rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
|
||||
TRY
|
||||
DEVICE_MUTEX
|
||||
rsmi_status_t ret;
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY(unique_id)
|
||||
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id);
|
||||
return ret;
|
||||
|
||||
@@ -2761,13 +2765,12 @@ rsmi_status_t
|
||||
rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type,
|
||||
rsmi_event_handle_t *evnt_handle) {
|
||||
TRY
|
||||
DEVICE_MUTEX
|
||||
REQUIRE_ROOT_ACCESS
|
||||
|
||||
// Note we don't need to pass in the variant to CHK_SUPPORT_VAR because
|
||||
// the success of this call doesn't depend on a sysfs file existing.
|
||||
CHK_SUPPORT_NAME_ONLY(evnt_handle)
|
||||
|
||||
DEVICE_MUTEX
|
||||
*evnt_handle = reinterpret_cast<uintptr_t>(
|
||||
new amd::smi::evt::Event(type, dv_ind));
|
||||
|
||||
@@ -3086,13 +3089,12 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid,
|
||||
rsmi_status_t
|
||||
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) {
|
||||
TRY
|
||||
DEVICE_MUTEX
|
||||
|
||||
CHK_SUPPORT_NAME_ONLY(status)
|
||||
|
||||
rsmi_status_t ret;
|
||||
uint64_t status_code;
|
||||
|
||||
DEVICE_MUTEX
|
||||
ret = get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &status_code);
|
||||
|
||||
if (ret != RSMI_STATUS_SUCCESS) {
|
||||
@@ -3567,6 +3569,7 @@ rsmi_status_t
|
||||
rsmi_event_notification_init(uint32_t dv_ind) {
|
||||
TRY
|
||||
GET_DEV_FROM_INDX
|
||||
DEVICE_MUTEX
|
||||
|
||||
std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex());
|
||||
if (smi.kfd_notif_evt_fh() == -1) {
|
||||
@@ -3615,6 +3618,7 @@ rsmi_status_t
|
||||
rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask) {
|
||||
TRY
|
||||
GET_DEV_FROM_INDX
|
||||
DEVICE_MUTEX
|
||||
|
||||
if (dev->evt_notif_anon_fd() == -1) {
|
||||
return RSMI_INITIALIZATION_ERROR;
|
||||
@@ -3726,6 +3730,8 @@ rsmi_event_notification_get(int timeout_ms,
|
||||
rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) {
|
||||
TRY
|
||||
GET_DEV_FROM_INDX
|
||||
DEVICE_MUTEX
|
||||
|
||||
std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex());
|
||||
|
||||
if (dev->evt_notif_anon_fd() == -1) {
|
||||
|
||||
@@ -250,7 +250,7 @@ void TestMutualExclusion::Run(void) {
|
||||
CHECK_RET(ret, RSMI_STATUS_BUSY);
|
||||
ret = rsmi_dev_od_volt_curve_regions_get(0, &dmy_ui32, &dmy_vlt_reg);
|
||||
CHECK_RET(ret, RSMI_STATUS_BUSY);
|
||||
ret = rsmi_dev_overdrive_level_set(dmy_i32, 0);
|
||||
ret = rsmi_dev_overdrive_level_set_v1(dmy_i32, 0);
|
||||
CHECK_RET(ret, RSMI_STATUS_BUSY);
|
||||
ret = rsmi_dev_gpu_clk_freq_set(0, RSMI_CLK_TYPE_SYS, 0);
|
||||
CHECK_RET(ret, RSMI_STATUS_BUSY);
|
||||
@@ -261,6 +261,60 @@ void TestMutualExclusion::Run(void) {
|
||||
ret = rsmi_dev_ecc_status_get(0, RSMI_GPU_BLOCK_UMC, &dmy_ras_err_st);
|
||||
CHECK_RET(ret, RSMI_STATUS_BUSY);
|
||||
|
||||
/* Other functions holding device mutexes. Listed for reference.
|
||||
rsmi_dev_sku_get
|
||||
rsmi_dev_perf_level_set_v1
|
||||
rsmi_dev_od_clk_info_set
|
||||
rsmi_dev_od_volt_info_set
|
||||
rsmi_dev_firmware_version_get
|
||||
rsmi_dev_firmware_version_get
|
||||
rsmi_dev_name_get
|
||||
rsmi_dev_brand_get
|
||||
rsmi_dev_vram_vendor_get
|
||||
rsmi_dev_subsystem_name_get
|
||||
rsmi_dev_drm_render_minor_get
|
||||
rsmi_dev_vendor_name_get
|
||||
rsmi_dev_pci_bandwidth_get
|
||||
rsmi_dev_pci_bandwidth_set
|
||||
rsmi_dev_pci_throughput_get
|
||||
rsmi_dev_temp_metric_get
|
||||
rsmi_dev_volt_metric_get
|
||||
rsmi_dev_fan_speed_get
|
||||
rsmi_dev_fan_rpms_get
|
||||
rsmi_dev_fan_reset
|
||||
rsmi_dev_fan_speed_set
|
||||
rsmi_dev_fan_speed_max_get
|
||||
rsmi_dev_od_volt_info_get
|
||||
rsmi_dev_gpu_metrics_info_get
|
||||
rsmi_dev_od_volt_curve_regions_get
|
||||
rsmi_dev_power_max_get
|
||||
rsmi_dev_power_ave_get
|
||||
rsmi_dev_power_cap_get
|
||||
rsmi_dev_power_cap_range_get
|
||||
rsmi_dev_power_cap_set
|
||||
rsmi_dev_power_profile_presets_get
|
||||
rsmi_dev_power_profile_set
|
||||
rsmi_dev_memory_total_get
|
||||
rsmi_dev_memory_usage_get
|
||||
rsmi_dev_memory_busy_percent_get
|
||||
rsmi_dev_busy_percent_get
|
||||
rsmi_dev_vbios_version_get
|
||||
rsmi_dev_serial_number_get
|
||||
rsmi_dev_pci_replay_counter_get
|
||||
rsmi_dev_unique_id_get
|
||||
rsmi_dev_counter_create
|
||||
rsmi_counter_available_counters_get
|
||||
rsmi_dev_counter_group_supported
|
||||
rsmi_dev_memory_reserved_pages_get
|
||||
rsmi_dev_xgmi_error_status
|
||||
rsmi_dev_xgmi_error_reset
|
||||
rsmi_dev_xgmi_hive_id_get
|
||||
rsmi_topo_get_link_weight
|
||||
rsmi_event_notification_mask_set
|
||||
rsmi_event_notification_init
|
||||
rsmi_event_notification_stop
|
||||
*/
|
||||
|
||||
IF_VERB(STANDARD) {
|
||||
std::cout << "TESTER process: Finished verifying that all "
|
||||
"rsmi_dev_* functions returned RSMI_STATUS_BUSY" << std::endl;
|
||||
|
||||
@@ -32,6 +32,7 @@ THE SOFTWARE.
|
||||
#include <stdlib.h> // malloc, free
|
||||
#include <string.h> // strcpy
|
||||
#include <time.h> // clock_gettime
|
||||
#include <assert.h>
|
||||
|
||||
#include "rocm_smi/rocm_smi_exception.h"
|
||||
|
||||
@@ -86,9 +87,27 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
clock_gettime(CLOCK_REALTIME, &expireTime);
|
||||
expireTime.tv_sec += 5;
|
||||
|
||||
int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
int ret;
|
||||
|
||||
if (ret || (mutex.created == 0 &&
|
||||
ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
|
||||
|
||||
if (ret == EOWNERDEAD) {
|
||||
ret = pthread_mutex_consistent(mutex_ptr);
|
||||
// This function should not fail unless mutex_ptr is not robust
|
||||
// or mutex_ptr is not in an inconsistent state. Neither scenario
|
||||
// should ever be true at this point in the code.
|
||||
assert(!ret);
|
||||
|
||||
// ...but if there are undocumented failure cases for
|
||||
// pthread_mutex_consistent() handle them for release builds.
|
||||
if (ret) {
|
||||
fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret);
|
||||
free(mutex.name);
|
||||
|
||||
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
|
||||
return mutex;
|
||||
}
|
||||
} else if (ret || (mutex.created == 0 &&
|
||||
reinterpret_cast<shared_mutex_t *>(addr)->ptr == NULL)) {
|
||||
// Something is out of sync.
|
||||
fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret);
|
||||
@@ -121,6 +140,10 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
|
||||
perror("pthread_mutexattr_settype");
|
||||
return mutex;
|
||||
}
|
||||
if (pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)) {
|
||||
perror("pthread_mutexattr_setrobust");
|
||||
return mutex;
|
||||
}
|
||||
if (pthread_mutex_init(mutex_ptr, &attr)) {
|
||||
perror("pthread_mutex_init");
|
||||
return mutex;
|
||||
|
||||
Reference in New Issue
Block a user