diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index bab5fc1e9b..31902531f8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -1078,6 +1078,7 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, {RSMI_CLK_TYPE_SYS, "s"}, {RSMI_CLK_TYPE_MEM, "m"}, }; + DEVICE_MUTEX // Set perf. level to manual so that we can then set the power profile ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL); @@ -1127,6 +1128,8 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, TRY rsmi_status_t ret; + DEVICE_MUTEX + // Set perf. level to manual so that we can then set the power profile ret = rsmi_dev_perf_level_set_v1(dv_ind, RSMI_DEV_PERF_LEVEL_MANUAL); if (ret != RSMI_STATUS_SUCCESS) { @@ -1402,6 +1405,7 @@ rsmi_dev_firmware_version_get(uint32_t dv_ind, rsmi_fw_block_t block, return RSMI_STATUS_INVALID_ARGS; } + DEVICE_MUTEX return get_dev_value_int(dev_type, dv_ind, fw_version); CATCH } @@ -2735,9 +2739,9 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { TRY CHK_SUPPORT_NAME_ONLY(counter) - DEVICE_MUTEX rsmi_status_t ret; + DEVICE_MUTEX ret = get_dev_value_int(amd::smi::kDevPCIEReplayCount, dv_ind, counter); return ret; @@ -2747,11 +2751,11 @@ rsmi_dev_pci_replay_counter_get(uint32_t dv_ind, uint64_t *counter) { rsmi_status_t rsmi_dev_unique_id_get(uint32_t dv_ind, uint64_t *unique_id) { TRY - DEVICE_MUTEX rsmi_status_t ret; CHK_SUPPORT_NAME_ONLY(unique_id) + DEVICE_MUTEX ret = get_dev_value_int(amd::smi::kDevUniqueId, dv_ind, unique_id); return ret; @@ -2761,13 +2765,12 @@ rsmi_status_t rsmi_dev_counter_create(uint32_t dv_ind, rsmi_event_type_t type, rsmi_event_handle_t *evnt_handle) { TRY - DEVICE_MUTEX REQUIRE_ROOT_ACCESS // Note we don't need to pass in the variant to CHK_SUPPORT_VAR because // the success of this call doesn't depend on a sysfs file existing. CHK_SUPPORT_NAME_ONLY(evnt_handle) - + DEVICE_MUTEX *evnt_handle = reinterpret_cast( new amd::smi::evt::Event(type, dv_ind)); @@ -3086,13 +3089,12 @@ rsmi_compute_process_info_by_pid_get(uint32_t pid, rsmi_status_t rsmi_dev_xgmi_error_status(uint32_t dv_ind, rsmi_xgmi_status_t *status) { TRY - DEVICE_MUTEX - CHK_SUPPORT_NAME_ONLY(status) rsmi_status_t ret; uint64_t status_code; + DEVICE_MUTEX ret = get_dev_value_int(amd::smi::kDevXGMIError, dv_ind, &status_code); if (ret != RSMI_STATUS_SUCCESS) { @@ -3567,6 +3569,7 @@ rsmi_status_t rsmi_event_notification_init(uint32_t dv_ind) { TRY GET_DEV_FROM_INDX + DEVICE_MUTEX std::lock_guard guard(*smi.kfd_notif_evt_fh_mutex()); if (smi.kfd_notif_evt_fh() == -1) { @@ -3615,6 +3618,7 @@ rsmi_status_t rsmi_event_notification_mask_set(uint32_t dv_ind, uint64_t mask) { TRY GET_DEV_FROM_INDX + DEVICE_MUTEX if (dev->evt_notif_anon_fd() == -1) { return RSMI_INITIALIZATION_ERROR; @@ -3726,6 +3730,8 @@ rsmi_event_notification_get(int timeout_ms, rsmi_status_t rsmi_event_notification_stop(uint32_t dv_ind) { TRY GET_DEV_FROM_INDX + DEVICE_MUTEX + std::lock_guard guard(*smi.kfd_notif_evt_fh_mutex()); if (dev->evt_notif_anon_fd() == -1) { diff --git a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mutual_exclusion.cc b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mutual_exclusion.cc index 8bb3c80dd7..026182eba0 100755 --- a/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mutual_exclusion.cc +++ b/projects/rocm-smi-lib/tests/rocm_smi_test/functional/mutual_exclusion.cc @@ -250,7 +250,7 @@ void TestMutualExclusion::Run(void) { CHECK_RET(ret, RSMI_STATUS_BUSY); ret = rsmi_dev_od_volt_curve_regions_get(0, &dmy_ui32, &dmy_vlt_reg); CHECK_RET(ret, RSMI_STATUS_BUSY); - ret = rsmi_dev_overdrive_level_set(dmy_i32, 0); + ret = rsmi_dev_overdrive_level_set_v1(dmy_i32, 0); CHECK_RET(ret, RSMI_STATUS_BUSY); ret = rsmi_dev_gpu_clk_freq_set(0, RSMI_CLK_TYPE_SYS, 0); CHECK_RET(ret, RSMI_STATUS_BUSY); @@ -261,6 +261,60 @@ void TestMutualExclusion::Run(void) { ret = rsmi_dev_ecc_status_get(0, RSMI_GPU_BLOCK_UMC, &dmy_ras_err_st); CHECK_RET(ret, RSMI_STATUS_BUSY); + /* Other functions holding device mutexes. Listed for reference. + rsmi_dev_sku_get + rsmi_dev_perf_level_set_v1 + rsmi_dev_od_clk_info_set + rsmi_dev_od_volt_info_set + rsmi_dev_firmware_version_get + rsmi_dev_firmware_version_get + rsmi_dev_name_get + rsmi_dev_brand_get + rsmi_dev_vram_vendor_get + rsmi_dev_subsystem_name_get + rsmi_dev_drm_render_minor_get + rsmi_dev_vendor_name_get + rsmi_dev_pci_bandwidth_get + rsmi_dev_pci_bandwidth_set + rsmi_dev_pci_throughput_get + rsmi_dev_temp_metric_get + rsmi_dev_volt_metric_get + rsmi_dev_fan_speed_get + rsmi_dev_fan_rpms_get + rsmi_dev_fan_reset + rsmi_dev_fan_speed_set + rsmi_dev_fan_speed_max_get + rsmi_dev_od_volt_info_get + rsmi_dev_gpu_metrics_info_get + rsmi_dev_od_volt_curve_regions_get + rsmi_dev_power_max_get + rsmi_dev_power_ave_get + rsmi_dev_power_cap_get + rsmi_dev_power_cap_range_get + rsmi_dev_power_cap_set + rsmi_dev_power_profile_presets_get + rsmi_dev_power_profile_set + rsmi_dev_memory_total_get + rsmi_dev_memory_usage_get + rsmi_dev_memory_busy_percent_get + rsmi_dev_busy_percent_get + rsmi_dev_vbios_version_get + rsmi_dev_serial_number_get + rsmi_dev_pci_replay_counter_get + rsmi_dev_unique_id_get + rsmi_dev_counter_create + rsmi_counter_available_counters_get + rsmi_dev_counter_group_supported + rsmi_dev_memory_reserved_pages_get + rsmi_dev_xgmi_error_status + rsmi_dev_xgmi_error_reset + rsmi_dev_xgmi_hive_id_get + rsmi_topo_get_link_weight + rsmi_event_notification_mask_set + rsmi_event_notification_init + rsmi_event_notification_stop + */ + IF_VERB(STANDARD) { std::cout << "TESTER process: Finished verifying that all " "rsmi_dev_* functions returned RSMI_STATUS_BUSY" << std::endl; diff --git a/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc b/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc index ccb55f4a23..55fac1f5b5 100755 --- a/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc +++ b/projects/rocm-smi-lib/third_party/shared_mutex/shared_mutex.cc @@ -32,6 +32,7 @@ THE SOFTWARE. #include // malloc, free #include // strcpy #include // clock_gettime +#include #include "rocm_smi/rocm_smi_exception.h" @@ -86,9 +87,27 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { clock_gettime(CLOCK_REALTIME, &expireTime); expireTime.tv_sec += 5; - int ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); + int ret; - if (ret || (mutex.created == 0 && + ret = pthread_mutex_timedlock(mutex_ptr, &expireTime); + + if (ret == EOWNERDEAD) { + ret = pthread_mutex_consistent(mutex_ptr); + // This function should not fail unless mutex_ptr is not robust + // or mutex_ptr is not in an inconsistent state. Neither scenario + // should ever be true at this point in the code. + assert(!ret); + + // ...but if there are undocumented failure cases for + // pthread_mutex_consistent() handle them for release builds. + if (ret) { + fprintf(stderr, "pthread_mutex_consistent() returned %d\n", ret); + free(mutex.name); + + throw amd::smi::rsmi_exception(RSMI_STATUS_BUSY, __FUNCTION__); + return mutex; + } + } else if (ret || (mutex.created == 0 && reinterpret_cast(addr)->ptr == NULL)) { // Something is out of sync. fprintf(stderr, "pthread_mutex_timedlock() returned %d\n", ret); @@ -121,6 +140,10 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode) { perror("pthread_mutexattr_settype"); return mutex; } + if (pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST)) { + perror("pthread_mutexattr_setrobust"); + return mutex; + } if (pthread_mutex_init(mutex_ptr, &attr)) { perror("pthread_mutex_init"); return mutex;