From c9f9221ebe0e6e4ba0f6cbf06d7428522ebe323f Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 12 Sep 2023 16:34:04 -0500 Subject: [PATCH] rocm_smi_lib: Fix rocm-smi --resetfans results in Permission Denied For operations related to: --resetfans --setfan We report 'Not supported' for these cases instead of 'Permission denied' Code changes related to the following: * rocm_smi_properties * rocm_smi related APIs Change-Id: I144646efc3804fabd45cc5a46351803950b4feb7 Signed-off-by: Oliveira, Daniel [ROCm/amdsmi commit: 12f395e592b781feb0df21ed4130bc6bbb28708a] --- projects/amdsmi/python_smi_tools/rocm_smi.py | 8 +++-- projects/amdsmi/src/rocm_smi.cc | 9 ++--- projects/amdsmi/src/rocm_smi_properties.cc | 35 ++++++++++++++------ projects/amdsmi/src/rocm_smi_utils.cc | 7 ++++ 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/projects/amdsmi/python_smi_tools/rocm_smi.py b/projects/amdsmi/python_smi_tools/rocm_smi.py index 4a943a66a4..2d96cf3c73 100755 --- a/projects/amdsmi/python_smi_tools/rocm_smi.py +++ b/projects/amdsmi/python_smi_tools/rocm_smi.py @@ -813,8 +813,10 @@ def resetFans(deviceList): for device in deviceList: sensor_ind = c_uint32(0) ret = rocmsmi.rsmi_dev_fan_reset(device, sensor_ind) - if rsmi_ret_ok(ret, device, 'reset_fan'): + if rsmi_ret_ok(ret, device, silent=True): printLog(device, 'Successfully reset fan speed to driver control', None) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() @@ -1335,8 +1337,10 @@ def setFanSpeed(deviceList, fan): else: fanLevel = int(str(fan)) ret = rocmsmi.rsmi_dev_fan_speed_set(device, 0, int(fanLevel)) - if rsmi_ret_ok(ret, device, 'set_fan_speed'): + if rsmi_ret_ok(ret, device, silent=True): printLog(device, 'Successfully set fan speed to level %s' % (str(int(fanLevel))), None) + else: + printLog(device, 'Not supported on the given system', None) printLogSpacer() diff --git a/projects/amdsmi/src/rocm_smi.cc b/projects/amdsmi/src/rocm_smi.cc index d8bd892ac8..892f6664c7 100755 --- a/projects/amdsmi/src/rocm_smi.cc +++ b/projects/amdsmi/src/rocm_smi.cc @@ -407,6 +407,10 @@ static rsmi_status_t set_dev_mon_value(amd::smi::MonitorTypes type, } int ret = dev->monitor()->writeMonitor(type, sensor_ind, std::to_string(val)); + /// If the sysfs file doesn't exist, it is not supported. + if (ret == ENOENT) { + return rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED; + } return amd::smi::ErrnoToRsmiStatus(ret); } @@ -2631,9 +2635,8 @@ rsmi_dev_fan_reset(uint32_t dv_ind, uint32_t sensor_ind) { LOG_TRACE(ss); ++sensor_ind; // fan sysfs files have 1-based indices - + REQUIRE_ROOT_ACCESS DEVICE_MUTEX - ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 2); return ret; @@ -2669,14 +2672,12 @@ rsmi_dev_fan_speed_set(uint32_t dv_ind, uint32_t sensor_ind, uint64_t speed) { // First need to set fan mode (pwm1_enable) to 1 (aka, "manual") ret = set_dev_mon_value(amd::smi::kMonFanCntrlEnable, dv_ind, sensor_ind, 1); - if (ret != RSMI_STATUS_SUCCESS) { return ret; } ret = set_dev_mon_value(amd::smi::kMonFanSpeed, dv_ind, sensor_ind, speed); - return ret; CATCH diff --git a/projects/amdsmi/src/rocm_smi_properties.cc b/projects/amdsmi/src/rocm_smi_properties.cc index b076991881..d73f974286 100644 --- a/projects/amdsmi/src/rocm_smi_properties.cc +++ b/projects/amdsmi/src/rocm_smi_properties.cc @@ -166,6 +166,7 @@ const AMDGpuVerbList_t amdgpu_verb_check_list { { AMDGpuVerbTypes_t::kGetGpuOdVoltCurveRegions, "amdsmi_get_gpu_od_volt_curve_regions" } }; +const uint16_t kDevIDAll(0xFFFF); const uint16_t kDevRevIDAll(0xFFFF); const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // @@ -176,6 +177,14 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { // rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_MANUAL = rsmi_dev_clk_range_set; // + // AMD All Families + {kDevIDAll, {kDevRevIDAll, + make_unique_property_id(AMDGpuPropertyTypesOffset_t::kMonitorTypes, + MonitorTypes::kMonFanCntrlEnable), + AMDGpuVerbTypes_t::kResetGpuFan, + AMDGpuPropertyOpModeTypes_t::kBoth, false } + }, + // AMD Instinct MI210 {0x740F, {0x02, make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, @@ -239,12 +248,6 @@ const AMDGpuPropertyList_t amdgpu_property_reinforcement_list { AMDGpuVerbTypes_t::kGetGpuPowerProfilePresets, AMDGpuPropertyOpModeTypes_t::kBoth, false } }, - {0x74A1, {kDevRevIDAll, - make_unique_property_id(AMDGpuPropertyTypesOffset_t::kDevInfoTypes, - DevInfoTypes::kDevGpuReset), - AMDGpuVerbTypes_t::kResetGpu, - AMDGpuPropertyOpModeTypes_t::kSrIov, false } - }, {0x74A1, {kDevRevIDAll, make_unique_property_id(AMDGpuPropertyTypesOffset_t::kPerfTypes, rsmi_dev_perf_level::RSMI_DEV_PERF_LEVEL_DETERMINISM), @@ -350,7 +353,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT // likely the reinforcement table does not contain any entries/rules for the // dev_id in question. // - auto amdgpu_property_query_result_hdlr = [](rsmi_status_t query_result) { + auto amdgpu_property_query_result_hdlr = [&](const rsmi_status_t query_result) { switch (query_result) { case (rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR): case (rsmi_status_t::RSMI_STATUS_NO_DATA): @@ -363,7 +366,7 @@ rsmi_status_t validate_property_reinforcement_query(uint32_t dv_ind, AMDGpuVerbT break; default: - return rsmi_status_t::RSMI_STATUS_NOT_FOUND; + return actual_error_code; break; } }; @@ -415,7 +418,7 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx std::ostringstream osstream; auto rsmi_status(rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR); - AMDGpuPropertyQuery_t amdgpu_property_query = [&]() { + auto amdgpu_property_query = [&]() { AMDGpuPropertyQuery_t amdgpu_property_query_init{}; amdgpu_property_query_init.m_asic_id = 0; amdgpu_property_query_init.m_pci_rev_id = 0; @@ -445,6 +448,18 @@ rsmi_status_t Device::check_amdgpu_property_reinforcement_query(uint32_t dev_idx LOG_TRACE(osstream); bool is_proper_query(false); + + // Generic filter for checking properties for all asics and revisions. + auto amdgpu_property_query_all_asics = amdgpu_property_query; + amdgpu_property_query_all_asics.m_asic_id = kDevIDAll; + amdgpu_property_query_all_asics.m_pci_rev_id = kDevRevIDAll; + auto amdgpu_property_query_result = run_amdgpu_property_reinforcement_query(amdgpu_property_query_all_asics); + // We found a generic entry for all asics and revisions + if (amdgpu_property_query_result != rsmi_status_t::RSMI_STATUS_UNKNOWN_ERROR) { + return amdgpu_property_query_result; + } + + // If no generic entry, then we query for specific asic and revision ids. amdgpu_property_query = build_asic_id_filters(amdgpu_property_query, is_proper_query); if (!is_proper_query) { rsmi_status = rsmi_status_t::RSMI_STATUS_NO_DATA; @@ -487,7 +502,7 @@ rsmi_status_t Device::run_amdgpu_property_reinforcement_query(const AMDGpuProper osstream << __PRETTY_FUNCTION__ << " asic id found: " << itr_begin->first << "\n"; // Pci_rev_id matches the filter or ALL Revisions if ((itr_begin->second.m_pci_rev_id == amdgpu_property_query.m_pci_rev_id) || - (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { + (itr_begin->second.m_pci_rev_id == kDevRevIDAll)) { osstream << __PRETTY_FUNCTION__ << " asic rev.id found: " << itr_begin->second.m_pci_rev_id << "\n"; // Do we have the property we are looking for? if (((amdgpu_property_query.m_property != 0) && diff --git a/projects/amdsmi/src/rocm_smi_utils.cc b/projects/amdsmi/src/rocm_smi_utils.cc index 1e9a444320..4b8f61a842 100755 --- a/projects/amdsmi/src/rocm_smi_utils.cc +++ b/projects/amdsmi/src/rocm_smi_utils.cc @@ -176,6 +176,13 @@ int isRegularFile(std::string fname, bool *is_reg) { } int WriteSysfsStr(std::string path, std::string val) { + // On success, zero is returned. On error, -1 is returned, and + // errno is set to indicate the error. + auto is_regular_file_result = isRegularFile(path, nullptr); + if (is_regular_file_result != 0) { + return ENOENT; + } + std::ofstream fs; int ret = 0; std::ostringstream ss;