From 17871ecb141f7618ec245bc7019516fabf42bb71 Mon Sep 17 00:00:00 2001 From: Chris Freehill Date: Sat, 14 Mar 2020 15:09:11 -0500 Subject: [PATCH] More general solution to api support hwmon mapping This solution takes into account that some hwmons use label files to map sensor types. The previous solution did not take this into account. Change-Id: I1d6204573cefa8197b2cfe0ffb412b545df3d80a [ROCm/rocm_smi_lib commit: 324c0ca0e500b36757842b3e7b0b541a926119bc] --- .../rocm-smi-lib/include/rocm_smi/rocm_smi.h | 3 +- .../include/rocm_smi/rocm_smi_common.h | 4 ++ .../include/rocm_smi/rocm_smi_monitor.h | 12 +++- projects/rocm-smi-lib/src/rocm_smi.cc | 20 ++---- projects/rocm-smi-lib/src/rocm_smi_device.cc | 29 ++++----- projects/rocm-smi-lib/src/rocm_smi_main.cc | 6 +- projects/rocm-smi-lib/src/rocm_smi_monitor.cc | 61 ++++++++++++++++--- 7 files changed, 96 insertions(+), 39 deletions(-) diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index c906cdbd8c..8e045a8182 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -326,7 +326,8 @@ typedef enum { //!< temperature RSMI_TEMP_TYPE_MEMORY, //!< VRAM temperature - RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY + RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY, + RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; /** diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h index 9ed0052a87..ee81b91c9c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h @@ -93,6 +93,10 @@ struct RocmSMI_env_vars { const char *path_power_root_override; }; +// Use this bit offset to store the label-mapped file index +#define MONITOR_TYPE_BIT_POSITION 16 +#define MONITOR_IND_BIT_MASK ((1 << MONITOR_TYPE_BIT_POSITION) - 1) + // Support information data structures typedef std::vector SubVariant; typedef SubVariant::const_iterator SubVariantIt; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h index 8192ba2963..3b72398fbd 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h @@ -93,7 +93,8 @@ class Monitor { int readMonitor(MonitorTypes type, uint32_t sensor_ind, std::string *val); int writeMonitor(MonitorTypes type, uint32_t sensor_ind, std::string val); uint32_t setSensorLabelMap(void); - uint32_t getSensorIndex(rsmi_temperature_type_t type); + uint32_t getTempSensorIndex(rsmi_temperature_type_t type); + rsmi_temperature_type_t getTempSensorEnum(uint64_t ind); void fillSupportedFuncs(SupportedFuncMap *supported_funcs); private: @@ -101,6 +102,15 @@ class Monitor { std::string path_; const RocmSMI_env_vars *env_; std::map temp_type_index_map_; + + // This map uses a 64b index instead of 32b (unlike temp_type_index_map_) + // for flexibility and simplicity. Currently, some parts of the + // implementation store both the RSMI api index and the file index into a + // single value. 32 bits is enough to store both, but we are using 64 + // bits for simpler integration with existing implementation, which uses + // a 64b value. Also, if we need to encode anything else, 64b will give + // us more room to do so, without excessive changes. + std::map index_temp_type_map_; }; } // namespace smi diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 56167f43e4..75afd392d8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -1892,15 +1892,10 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, assert(dev->monitor() != nullptr); std::shared_ptr m = dev->monitor(); - uint32_t err = m->setSensorLabelMap(); - if (err) { - return errno_to_rsmi_status(err); - } - - // getSensorIndex will throw an out of range exception if sensor_type is not - // found + // getTempSensorIndex will throw an out of range exception if sensor_type is + // not found uint32_t sensor_index = - m->getSensorIndex(static_cast(sensor_type)); + m->getTempSensorIndex(static_cast(sensor_type)); CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index) @@ -3072,12 +3067,9 @@ rsmi_func_iter_value_get(rsmi_func_id_iter_handle_t handle, case SUBVARIANT_ITER: sub_var_itr = reinterpret_cast(handle->func_id_iter); - // The sub-variant refers to monitors types. We store those as the exist - // in the file name. For example, for temp2_crit, 2 is stored (crit is the - // variant). These are 1-based values. But the RSMI user expects a 0-based - // value, so we will convert it to RSMI-space by subracting 1: - assert(*(*sub_var_itr) > 0); - value->id = *(*sub_var_itr) - 1; + // The hwmon file index that is appropriate for the rsmi user is stored + // at bit position MONITOR_TYPE_BIT_POSITION. + value->id = *(*sub_var_itr) >> MONITOR_TYPE_BIT_POSITION; break; default: diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index fa57a60fb8..0c8dc8a927 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -880,6 +880,19 @@ void Device::fillSupportedFuncs(void) { // DumpSupportedFunctions(); } +static bool subvariant_match(const std::shared_ptr *sv, + uint64_t sub_v) { + assert(sv != nullptr); + + SubVariantIt it = (*sv)->begin(); + for (; it != (*sv)->end(); it++) { + if ((*it & MONITOR_IND_BIT_MASK) == sub_v) { + return true; + } + } + return false; +} + bool Device::DeviceAPISupported(std::string name, uint64_t variant, uint64_t sub_variant) { SupportedFuncMapIt func_it; @@ -908,13 +921,7 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr assert(var_it->second != nullptr); - sub_var_it = std::find(var_it->second->begin(), - var_it->second->end(), sub_variant); - if (sub_var_it == var_it->second->end()) { - return false; - } else { - return true; - } + return subvariant_match(&(var_it->second), sub_variant); } } else { // variant == RSMI_DEFAULT_VARIANT if (func_it->second != nullptr) { @@ -926,13 +933,7 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, if (func_it->second == nullptr) { return false; } - sub_var_it = std::find(var_it->second->begin(), - var_it->second->end(), sub_variant); - if (sub_var_it == var_it->second->end()) { - return false; - } else { - return true; - } + return subvariant_match(&(var_it->second), sub_variant); } } assert(!"We should not reach here"); diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 45c88e88de..4846d8decb 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -481,8 +481,10 @@ uint32_t RocmSMI::DiscoverAMDMonitors(void) { fs.close(); if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - monitors_.push_back(std::shared_ptr( - new Monitor(mon_name, &env_vars_))); + std::shared_ptr m = + std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m->setSensorLabelMap(); + monitors_.push_back(m); } } dentry = readdir(mon_dir); diff --git a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc index 15f0c39148..1a709d9ace 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc @@ -275,12 +275,19 @@ Monitor::setSensorLabelMap(void) { } auto add_temp_sensor_entry = [&](uint32_t file_index) { ret = readMonitor(kMonTempLabel, file_index, &type_str); - if (ret) { - return ret; - } - rsmi_temperature_type_t t_type = kTempSensorNameMap.at(type_str); - temp_type_index_map_.insert({t_type, file_index}); + + // If readMonitor fails, there is no label file for the file_index. + // In that case, map the type to file index 0, which is not supported + // and will fail appropriately later when we check for support. + if (ret) { + temp_type_index_map_.insert({t_type, 0}); + index_temp_type_map_.insert({file_index, RSMI_TEMP_TYPE_INVALID}); + } else { + temp_type_index_map_.insert({t_type, file_index}); + index_temp_type_map_.insert({file_index, t_type}); + } + index_temp_type_map_.insert({file_index, t_type}); return 0; }; @@ -342,10 +349,15 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex, } uint32_t -Monitor::getSensorIndex(rsmi_temperature_type_t type) { +Monitor::getTempSensorIndex(rsmi_temperature_type_t type) { return temp_type_index_map_.at(type); } +rsmi_temperature_type_t +Monitor::getTempSensorEnum(uint64_t ind) { + return index_temp_type_map_.at(ind); +} + static std::vector get_intersection(std::vector *v1, std::vector *v2) { assert(v1 != nullptr); @@ -360,6 +372,25 @@ static std::vector get_intersection(std::vector *v1, return intersect; } +// Use this enum to encode the monitor type into the monitor ID. +// We can later use this to convert to rsmi-api sensor types; for exampple, +// rsmi_temperature_type_t, which is what the caller will expect. Add +// new types as needed. + +typedef enum { + eDefaultMonitor = 0, + eTempMonitor, +} monitor_types; + +static monitor_types getFuncType(std::string f_name) { + monitor_types ret = eDefaultMonitor; + + if (f_name.compare("rsmi_dev_temp_metric_get") == 0) { + ret = eTempMonitor; + } + return ret; +} + void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::map::const_iterator it = kMonFuncDependsMap.begin(); @@ -369,6 +400,7 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::vector sensors_i; std::vector intersect; int ret; + monitor_types m_type; assert(supported_funcs != nullptr); @@ -377,6 +409,7 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::vector::const_iterator dep = it->second.mandatory_depends.begin(); + m_type = getFuncType(it->first); mand_depends_met = true; // Initialize "intersect". A monitor is considered supported if all of its @@ -451,7 +484,21 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { supported_monitors = intersect; } if (supported_monitors.size() > 0) { - (*supported_variants)[kMonInfoVarTypeToRSMIVariant.at(*var)] = + for (uint32_t i = 0; i < supported_monitors.size(); ++i) { + assert(supported_monitors[i] > 0); + + if (m_type == eDefaultMonitor) { + supported_monitors[i] |= + (supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION; + } else if (m_type == eTempMonitor) { + supported_monitors[i] |= + static_cast(getTempSensorEnum(supported_monitors[i])) + << MONITOR_TYPE_BIT_POSITION; + } else { + assert(!"Unexpected monitor type"); + } + } + (*supported_variants)[kMonInfoVarTypeToRSMIVariant.at(*var)] = std::make_shared(supported_monitors); } }