Fix process killed while holding mutex

Previously, when a process holding a shared mutex was killed,
the next time an RSMI application was started, it would not be
able to obtain the mutex--the application would have to exit.
This fix uses pthread_mutexattr_setrobust() to detect this
situation and act accordingly.

Also, add some missing, needed mutexes and move mutexes
closer to where the protected resource is used.

Change-Id: Icfdc3a246f4cfa3fd008e3f13472199abd76fd35


[ROCm/rocm_smi_lib commit: f4938b0ac9]
This commit is contained in:
Chris Freehill
2020-12-01 17:12:38 -06:00
parent 129e3e8934
commit aef625bfd3
3 changed files with 92 additions and 9 deletions
+12 -6
View File
@@ -1078,6 +1078,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level,
{RSMI_CLK_TYPE_SYS, "s"},
{RSMI_CLK_TYPE_MEM, "m"},
};
DEVICE_MUTEX
// Set perf. level to manual so that we can then set the power profile
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
@@ -1127,6 +1128,8 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint,
TRY
rsmi_status_t ret;
DEVICE_MUTEX
// Set perf. level to manual so that we can then set the power profile
ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL);
if (ret != RSMI_STATUS_SUCCESS) {
@@ -1402,6 +1405,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block,
return RSMI_STATUS_INVALID_ARGS;
}
DEVICE_MUTEX
return get_dev_value_int(dev_type, dv_ind, fw_version);
CATCH
}
@@ -2735,9 +2739,9 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
TRY
CHK_SUPPORT_NAME_ONLY(counter)
DEVICE_MUTEX
rsmi_status_t ret;
DEVICE_MUTEX
ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter);
return ret;
@@ -2747,11 +2751,11 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) {
rsmi_status_t
rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) {
TRY
DEVICE_MUTEX
rsmi_status_t ret;
CHK_SUPPORT_NAME_ONLY(unique_id)
DEVICE_MUTEX
ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id);
return ret;
@@ -2761,13 +2765,12 @@ rsmi_status_t
rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type,
rsmi_event_handle_t *evnt_handle) {
TRY
DEVICE_MUTEX
REQUIRE_ROOT_ACCESS
// Note we don't need to pass in the variant to CHK_SUPPORT_VAR because
// the success of this call doesn't depend on a sysfs file existing.
CHK_SUPPORT_NAME_ONLY(evnt_handle)
DEVICE_MUTEX
*evnt_handle = reinterpret_cast<uintptr_t>(
new amd::smi::evt::Event(type, dv_ind));
@@ -3086,13 +3089,12 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid,
rsmi_status_t
rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) {
TRY
DEVICE_MUTEX
CHK_SUPPORT_NAME_ONLY(status)
rsmi_status_t ret;
uint64_t status_code;
DEVICE_MUTEX
ret = get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &status_code);
if (ret != RSMI_STATUS_SUCCESS) {
@@ -3567,6 +3569,7 @@ rsmi_status_t
rsmi_event_notification_init(uint32_t dv_ind) {
TRY
GET_DEV_FROM_INDX
DEVICE_MUTEX
std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex());
if (smi.kfd_notif_evt_fh() == -1) {
@@ -3615,6 +3618,7 @@ rsmi_status_t
rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask) {
TRY
GET_DEV_FROM_INDX
DEVICE_MUTEX
if (dev->evt_notif_anon_fd() == -1) {
return RSMI_INITIALIZATION_ERROR;
@@ -3726,6 +3730,8 @@ rsmi_event_notification_get(int timeout_ms,
rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) {
TRY
GET_DEV_FROM_INDX
DEVICE_MUTEX
std::lock_guard<std::mutex> guard(*smi.kfd_notif_evt_fh_mutex());
if (dev->evt_notif_anon_fd() == -1) {
@@ -250,7 +250,7 @@ void TestMutualExclusion::Run(void) {
CHECK_RET(ret, RSMI_STATUS_BUSY);
ret = rsmi_dev_od_volt_curve_regions_get(0, &dmy_ui32, &dmy_vlt_reg);
CHECK_RET(ret, RSMI_STATUS_BUSY);
ret = rsmi_dev_overdrive_level_set(dmy_i32, 0);
ret = rsmi_dev_overdrive_level_set_v1(dmy_i32, 0);
CHECK_RET(ret, RSMI_STATUS_BUSY);
ret = rsmi_dev_gpu_clk_freq_set(0, RSMI_CLK_TYPE_SYS, 0);
CHECK_RET(ret, RSMI_STATUS_BUSY);
@@ -261,6 +261,60 @@ void TestMutualExclusion::Run(void) {
ret = rsmi_dev_ecc_status_get(0, RSMI_GPU_BLOCK_UMC, &dmy_ras_err_st);
CHECK_RET(ret, RSMI_STATUS_BUSY);
/* Other functions holding device mutexes. Listed for reference.
rsmi_dev_sku_get
rsmi_dev_perf_level_set_v1
rsmi_dev_od_clk_info_set
rsmi_dev_od_volt_info_set
rsmi_dev_firmware_version_get
rsmi_dev_firmware_version_get
rsmi_dev_name_get
rsmi_dev_brand_get
rsmi_dev_vram_vendor_get
rsmi_dev_subsystem_name_get
rsmi_dev_drm_render_minor_get
rsmi_dev_vendor_name_get
rsmi_dev_pci_bandwidth_get
rsmi_dev_pci_bandwidth_set
rsmi_dev_pci_throughput_get
rsmi_dev_temp_metric_get
rsmi_dev_volt_metric_get
rsmi_dev_fan_speed_get
rsmi_dev_fan_rpms_get
rsmi_dev_fan_reset
rsmi_dev_fan_speed_set
rsmi_dev_fan_speed_max_get
rsmi_dev_od_volt_info_get
rsmi_dev_gpu_metrics_info_get
rsmi_dev_od_volt_curve_regions_get
rsmi_dev_power_max_get
rsmi_dev_power_ave_get
rsmi_dev_power_cap_get
rsmi_dev_power_cap_range_get
rsmi_dev_power_cap_set
rsmi_dev_power_profile_presets_get
rsmi_dev_power_profile_set
rsmi_dev_memory_total_get
rsmi_dev_memory_usage_get
rsmi_dev_memory_busy_percent_get
rsmi_dev_busy_percent_get
rsmi_dev_vbios_version_get
rsmi_dev_serial_number_get
rsmi_dev_pci_replay_counter_get
rsmi_dev_unique_id_get
rsmi_dev_counter_create
rsmi_counter_available_counters_get
rsmi_dev_counter_group_supported
rsmi_dev_memory_reserved_pages_get
rsmi_dev_xgmi_error_status
rsmi_dev_xgmi_error_reset
rsmi_dev_xgmi_hive_id_get
rsmi_topo_get_link_weight
rsmi_event_notification_mask_set
rsmi_event_notification_init
rsmi_event_notification_stop
*/
IF_VERB(STANDARD) {
std::cout << "TESTER process: Finished verifying that all "
"rsmi_dev_* functions returned RSMI_STATUS_BUSY" << std::endl;
@@ -32,6 +32,7 @@ THE SOFTWARE.
#include <stdlib.h> // malloc, free
#include <string.h> // strcpy
#include <time.h> // clock_gettime
#include <assert.h>
#include "rocm_smi/rocm_smi_exception.h"
@@ -86,9 +87,27 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
clock_gettime(CLOCK_REALTIME, &expireTime);
expireTime.tv_sec += 5;
int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
int ret;
if (ret || (mutex.created == 0 &&
ret = pthread_mutex_timedlock(mutex_ptr, &expireTime);
if (ret == EOWNERDEAD) {
ret = pthread_mutex_consistent(mutex_ptr);
// This function should not fail unless mutex_ptr is not robust
// or mutex_ptr is not in an inconsistent state. Neither scenario
// should ever be true at this point in the code.
assert(!ret);
// ...but if there are undocumented failure cases for
// pthread_mutex_consistent() handle them for release builds.
if (ret) {
fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret);
free(mutex.name);
throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__);
return mutex;
}
} else if (ret || (mutex.created == 0 &&
reinterpret_cast<shared_mutex_t *>(addr)->ptr == NULL)) {
// Something is out of sync.
fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret);
@@ -121,6 +140,10 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) {
perror("pthread_mutexattr_settype");
return mutex;
}
if (pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)) {
perror("pthread_mutexattr_setrobust");
return mutex;
}
if (pthread_mutex_init(mutex_ptr, &attr)) {
perror("pthread_mutex_init");
return mutex;