diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h index c906cdbd8c..8e045a8182 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi.h @@ -326,7 +326,8 @@ typedef enum { //!< temperature RSMI_TEMP_TYPE_MEMORY, //!< VRAM temperature - RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY + RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY, + RSMI_TEMP_TYPE_INVALID = 0xFFFFFFFF //!< Invalid type } rsmi_temperature_type_t; /** diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h index 9ed0052a87..ee81b91c9c 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_common.h @@ -93,6 +93,10 @@ struct RocmSMI_env_vars { const char *path_power_root_override; }; +// Use this bit offset to store the label-mapped file index +#define MONITOR_TYPE_BIT_POSITION 16 +#define MONITOR_IND_BIT_MASK ((1 << MONITOR_TYPE_BIT_POSITION) - 1) + // Support information data structures typedef std::vector SubVariant; typedef SubVariant::const_iterator SubVariantIt; diff --git a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h index 8192ba2963..3b72398fbd 100755 --- a/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h +++ b/projects/rocm-smi-lib/include/rocm_smi/rocm_smi_monitor.h @@ -93,7 +93,8 @@ class Monitor { int readMonitor(MonitorTypes type, uint32_t sensor_ind, std::string *val); int writeMonitor(MonitorTypes type, uint32_t sensor_ind, std::string val); uint32_t setSensorLabelMap(void); - uint32_t getSensorIndex(rsmi_temperature_type_t type); + uint32_t getTempSensorIndex(rsmi_temperature_type_t type); + rsmi_temperature_type_t getTempSensorEnum(uint64_t ind); void fillSupportedFuncs(SupportedFuncMap *supported_funcs); private: @@ -101,6 +102,15 @@ class Monitor { std::string path_; const RocmSMI_env_vars *env_; std::map temp_type_index_map_; + + // This map uses a 64b index instead of 32b (unlike temp_type_index_map_) + // for flexibility and simplicity. Currently, some parts of the + // implementation store both the RSMI api index and the file index into a + // single value. 32 bits is enough to store both, but we are using 64 + // bits for simpler integration with existing implementation, which uses + // a 64b value. Also, if we need to encode anything else, 64b will give + // us more room to do so, without excessive changes. + std::map index_temp_type_map_; }; } // namespace smi diff --git a/projects/rocm-smi-lib/src/rocm_smi.cc b/projects/rocm-smi-lib/src/rocm_smi.cc index 56167f43e4..75afd392d8 100755 --- a/projects/rocm-smi-lib/src/rocm_smi.cc +++ b/projects/rocm-smi-lib/src/rocm_smi.cc @@ -1892,15 +1892,10 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, assert(dev->monitor() != nullptr); std::shared_ptr m = dev->monitor(); - uint32_t err = m->setSensorLabelMap(); - if (err) { - return errno_to_rsmi_status(err); - } - - // getSensorIndex will throw an out of range exception if sensor_type is not - // found + // getTempSensorIndex will throw an out of range exception if sensor_type is + // not found uint32_t sensor_index = - m->getSensorIndex(static_cast(sensor_type)); + m->getTempSensorIndex(static_cast(sensor_type)); CHK_API_SUPPORT_ONLY(temperature, metric, sensor_index) @@ -3072,12 +3067,9 @@ rsmi_func_iter_value_get(rsmi_func_id_iter_handle_t handle, case SUBVARIANT_ITER: sub_var_itr = reinterpret_cast(handle->func_id_iter); - // The sub-variant refers to monitors types. We store those as the exist - // in the file name. For example, for temp2_crit, 2 is stored (crit is the - // variant). These are 1-based values. But the RSMI user expects a 0-based - // value, so we will convert it to RSMI-space by subracting 1: - assert(*(*sub_var_itr) > 0); - value->id = *(*sub_var_itr) - 1; + // The hwmon file index that is appropriate for the rsmi user is stored + // at bit position MONITOR_TYPE_BIT_POSITION. + value->id = *(*sub_var_itr) >> MONITOR_TYPE_BIT_POSITION; break; default: diff --git a/projects/rocm-smi-lib/src/rocm_smi_device.cc b/projects/rocm-smi-lib/src/rocm_smi_device.cc index fa57a60fb8..0c8dc8a927 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_device.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_device.cc @@ -880,6 +880,19 @@ void Device::fillSupportedFuncs(void) { // DumpSupportedFunctions(); } +static bool subvariant_match(const std::shared_ptr *sv, + uint64_t sub_v) { + assert(sv != nullptr); + + SubVariantIt it = (*sv)->begin(); + for (; it != (*sv)->end(); it++) { + if ((*it & MONITOR_IND_BIT_MASK) == sub_v) { + return true; + } + } + return false; +} + bool Device::DeviceAPISupported(std::string name, uint64_t variant, uint64_t sub_variant) { SupportedFuncMapIt func_it; @@ -908,13 +921,7 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, // if variant is != RSMI_DEFAULT_VARIANT, we should not have a nullptr assert(var_it->second != nullptr); - sub_var_it = std::find(var_it->second->begin(), - var_it->second->end(), sub_variant); - if (sub_var_it == var_it->second->end()) { - return false; - } else { - return true; - } + return subvariant_match(&(var_it->second), sub_variant); } } else { // variant == RSMI_DEFAULT_VARIANT if (func_it->second != nullptr) { @@ -926,13 +933,7 @@ bool Device::DeviceAPISupported(std::string name, uint64_t variant, if (func_it->second == nullptr) { return false; } - sub_var_it = std::find(var_it->second->begin(), - var_it->second->end(), sub_variant); - if (sub_var_it == var_it->second->end()) { - return false; - } else { - return true; - } + return subvariant_match(&(var_it->second), sub_variant); } } assert(!"We should not reach here"); diff --git a/projects/rocm-smi-lib/src/rocm_smi_main.cc b/projects/rocm-smi-lib/src/rocm_smi_main.cc index 45c88e88de..4846d8decb 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_main.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_main.cc @@ -481,8 +481,10 @@ uint32_t RocmSMI::DiscoverAMDMonitors(void) { fs.close(); if (amd_monitor_types_.find(mon_type) != amd_monitor_types_.end()) { - monitors_.push_back(std::shared_ptr( - new Monitor(mon_name, &env_vars_))); + std::shared_ptr m = + std::shared_ptr(new Monitor(mon_name, &env_vars_)); + m->setSensorLabelMap(); + monitors_.push_back(m); } } dentry = readdir(mon_dir); diff --git a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc index 15f0c39148..1a709d9ace 100755 --- a/projects/rocm-smi-lib/src/rocm_smi_monitor.cc +++ b/projects/rocm-smi-lib/src/rocm_smi_monitor.cc @@ -275,12 +275,19 @@ Monitor::setSensorLabelMap(void) { } auto add_temp_sensor_entry = [&](uint32_t file_index) { ret = readMonitor(kMonTempLabel, file_index, &type_str); - if (ret) { - return ret; - } - rsmi_temperature_type_t t_type = kTempSensorNameMap.at(type_str); - temp_type_index_map_.insert({t_type, file_index}); + + // If readMonitor fails, there is no label file for the file_index. + // In that case, map the type to file index 0, which is not supported + // and will fail appropriately later when we check for support. + if (ret) { + temp_type_index_map_.insert({t_type, 0}); + index_temp_type_map_.insert({file_index, RSMI_TEMP_TYPE_INVALID}); + } else { + temp_type_index_map_.insert({t_type, file_index}); + index_temp_type_map_.insert({file_index, t_type}); + } + index_temp_type_map_.insert({file_index, t_type}); return 0; }; @@ -342,10 +349,15 @@ static int get_supported_sensors(std::string dir_path, std::string fn_reg_ex, } uint32_t -Monitor::getSensorIndex(rsmi_temperature_type_t type) { +Monitor::getTempSensorIndex(rsmi_temperature_type_t type) { return temp_type_index_map_.at(type); } +rsmi_temperature_type_t +Monitor::getTempSensorEnum(uint64_t ind) { + return index_temp_type_map_.at(ind); +} + static std::vector get_intersection(std::vector *v1, std::vector *v2) { assert(v1 != nullptr); @@ -360,6 +372,25 @@ static std::vector get_intersection(std::vector *v1, return intersect; } +// Use this enum to encode the monitor type into the monitor ID. +// We can later use this to convert to rsmi-api sensor types; for exampple, +// rsmi_temperature_type_t, which is what the caller will expect. Add +// new types as needed. + +typedef enum { + eDefaultMonitor = 0, + eTempMonitor, +} monitor_types; + +static monitor_types getFuncType(std::string f_name) { + monitor_types ret = eDefaultMonitor; + + if (f_name.compare("rsmi_dev_temp_metric_get") == 0) { + ret = eTempMonitor; + } + return ret; +} + void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::map::const_iterator it = kMonFuncDependsMap.begin(); @@ -369,6 +400,7 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::vector sensors_i; std::vector intersect; int ret; + monitor_types m_type; assert(supported_funcs != nullptr); @@ -377,6 +409,7 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { std::vector::const_iterator dep = it->second.mandatory_depends.begin(); + m_type = getFuncType(it->first); mand_depends_met = true; // Initialize "intersect". A monitor is considered supported if all of its @@ -451,7 +484,21 @@ void Monitor::fillSupportedFuncs(SupportedFuncMap *supported_funcs) { supported_monitors = intersect; } if (supported_monitors.size() > 0) { - (*supported_variants)[kMonInfoVarTypeToRSMIVariant.at(*var)] = + for (uint32_t i = 0; i < supported_monitors.size(); ++i) { + assert(supported_monitors[i] > 0); + + if (m_type == eDefaultMonitor) { + supported_monitors[i] |= + (supported_monitors[i] - 1) << MONITOR_TYPE_BIT_POSITION; + } else if (m_type == eTempMonitor) { + supported_monitors[i] |= + static_cast(getTempSensorEnum(supported_monitors[i])) + << MONITOR_TYPE_BIT_POSITION; + } else { + assert(!"Unexpected monitor type"); + } + } + (*supported_variants)[kMonInfoVarTypeToRSMIVariant.at(*var)] = std::make_shared(supported_monitors); } }