diff --git a/docs/ROCm_SMI_Manual.pdf b/docs/ROCm_SMI_Manual.pdf index 4fd77c2150..b6da2a742c 100644 Binary files a/docs/ROCm_SMI_Manual.pdf and b/docs/ROCm_SMI_Manual.pdf differ diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index c0f1ed38c0..4311042bfd 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -238,6 +238,21 @@ typedef enum { typedef rsmi_temperature_metric_t rsmi_temperature_metric; /// \endcond +/** + * @brief This enumeration is used to indicate from which part of the device a + * temperature reading should be obtained. + */ +typedef enum { + RSMI_TEMP_TYPE_FIRST = 0, + + RSMI_TEMP_TYPE_EDGE = RSMI_TEMP_TYPE_FIRST, //!< Edge GPU temperature + RSMI_TEMP_TYPE_JUNCTION, //!< Junction/hotspot + //!< temperature + RSMI_TEMP_TYPE_MEMORY, //!< VRAM temperature + + RSMI_TEMP_TYPE_LAST = RSMI_TEMP_TYPE_MEMORY +} rsmi_temperature_type_t; + /** * @brief Pre-set Profile Selections. These bitmasks can be AND'd with the * ::rsmi_power_profile_status_t.available_profiles returned from @@ -1096,15 +1111,15 @@ rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind, * @brief Get the temperature metric value for the specified metric, from the * specified temperature sensor on the specified device. * - * @details Given a device index @p dv_ind, a 0-based sensor index - * @p sensor_ind, a ::rsmi_temperature_metric_t @p metric and a pointer to an - * int64_t @p temperature, this function will write the value of the metric - * indicated by @p metric to the memory location @p temperature. + * @details Given a device index @p dv_ind, a sensor type @p sensor_type, a + * ::rsmi_temperature_metric_t @p metric and a pointer to an int64_t @p + * temperature, this function will write the value of the metric indicated by + * @p metric and @p sensor_type to the memory location @p temperature. * * @param[in] dv_ind a device index * - * @param[in] sensor_ind a 0-based sensor index. Normally, this will be 0. 
- * If a device has more than one sensor, it could be greater than 0. + * @param[in] sensor_type part of device from which temperature should be + * obtained. This should come from the enum ::rsmi_temperature_type_t * * @param[in] metric enum indicated which temperature value should be * retrieved @@ -1115,7 +1130,7 @@ rsmi_status_t rsmi_dev_fan_speed_max_get(uint32_t dv_ind, * @retval ::RSMI_STATUS_SUCCESS is returned upon successful call. * */ -rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, +rsmi_status_t rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t *temperature); /** @} */ // end of PhysQuer diff --git a/include/rocm_smi/rocm_smi_monitor.h b/include/rocm_smi/rocm_smi_monitor.h index 217c600091..695f94c8fe 100755 --- a/include/rocm_smi/rocm_smi_monitor.h +++ b/include/rocm_smi/rocm_smi_monitor.h @@ -47,8 +47,10 @@ #include #include +#include #include "rocm_smi/rocm_smi_common.h" +#include "rocm_smi/rocm_smi.h" namespace amd { namespace smi { @@ -77,6 +79,7 @@ enum MonitorTypes { kMonTempOffset, kMonTempLowest, kMonTempHighest, + kMonTempLabel, kMonInvalid = 0xFFFFFFFF, }; @@ -89,10 +92,14 @@ class Monitor { const std::string path(void) const {return path_;} int readMonitor(MonitorTypes type, uint32_t sensor_ind, std::string *val); int writeMonitor(MonitorTypes type, uint32_t sensor_ind, std::string val); + uint32_t setSensorLabelMap(void); + uint32_t getSensorIndex(rsmi_temperature_type_t type); + private: std::string MakeMonitorPath(MonitorTypes type, int32_t sensor_id); std::string path_; const RocmSMI_env_vars *env_; + std::map temp_type_index_map_; }; } // namespace smi diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index c9951af2a9..7445194d61 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -1180,8 +1180,7 @@ get_id_name_str_from_line(uint64_t id, std::string ln, return ret_str; } -static rsmi_status_t get_backup_name(uint16_t id, char *name, - size_t len, 
eNameStrType typ) { +static rsmi_status_t get_backup_name(uint16_t id, char *name, size_t len) { std::string name_str; name_str += "0x"; @@ -1291,7 +1290,7 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, val_str.clear(); return get_backup_name(typ == NAME_STR_DEVICE ? - device_id : subsys_id, name, len, typ); + device_id : subsys_id, name, len); } val_str = get_id_name_str_from_line(vendor_id, ln, &ln_str); @@ -1315,7 +1314,7 @@ static rsmi_status_t get_dev_name_from_id(uint32_t dv_ind, char *name, // We should have already returned if we were looking for // device or subdivce assert(typ == NAME_STR_VENDOR); - return get_backup_name(vendor_id, name, len, typ); + return get_backup_name(vendor_id, name, len); } size_t ct = val_str.copy(name, len); @@ -1467,7 +1466,7 @@ rsmi_dev_pci_throughput_get(uint32_t dv_ind, uint64_t *sent, } rsmi_status_t -rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, +rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_type, rsmi_temperature_metric_t metric, int64_t *temperature) { TRY @@ -1478,14 +1477,6 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, rsmi_status_t ret; amd::smi::MonitorTypes mon_type; - - // Make any adjustments to sensor_ind here, if index is not a 0 based. For - // rocm_smi we are using a 0-based index. However, most of the Linux sysfs - // monitor files are 1-based, so we will increment by 1 and make adjustments - // for exceptions later. 
- // See https://www.kernel.org/doc/Documentation/hwmon/sysfs-interface - ++sensor_ind; - switch (metric) { case RSMI_TEMP_CURRENT: mon_type = amd::smi::kMonTemp; @@ -1535,7 +1526,19 @@ rsmi_dev_temp_metric_get(uint32_t dv_ind, uint32_t sensor_ind, DEVICE_MUTEX - ret = get_dev_mon_value(mon_type, dv_ind, sensor_ind, temperature); + GET_DEV_FROM_INDX + + assert(dev->monitor() != nullptr); + std::shared_ptr m = dev->monitor(); + + uint32_t err = m->setSensorLabelMap(); + if (err) { + return errno_to_rsmi_status(err); + } + + uint32_t sensor_index = + m->getSensorIndex(static_cast(sensor_type)); + ret = get_dev_mon_value(mon_type, dv_ind, sensor_index, temperature); return ret; CATCH diff --git a/src/rocm_smi_monitor.cc b/src/rocm_smi_monitor.cc index 26c137cd74..4ab035fb86 100755 --- a/src/rocm_smi_monitor.cc +++ b/src/rocm_smi_monitor.cc @@ -86,6 +86,18 @@ static const char *kMonTempCritMinHystName = "temp#_lcrit_hyst"; static const char *kMonTempOffsetName = "temp#_offset"; static const char *kMonTempLowestName = "temp#_lowest"; static const char *kMonTempHighestName = "temp#_highest"; +static const char *kMonTempLabelName = "temp#_label"; + +static const char *kTempSensorTypeMemoryName = "mem"; +static const char *kTempSensorTypeJunctionName = "junction"; +static const char *kTempSensorTypeEdgeName = "edge"; + +static const std::map + kTempSensorNameMap = { + {kTempSensorTypeMemoryName, RSMI_TEMP_TYPE_MEMORY}, + {kTempSensorTypeJunctionName, RSMI_TEMP_TYPE_JUNCTION}, + {kTempSensorTypeEdgeName, RSMI_TEMP_TYPE_EDGE}, +}; static const std::map kMonitorNameMap = { {kMonName, kMonNameFName}, @@ -111,6 +123,7 @@ static const std::map kMonitorNameMap = { {kMonTempOffset, kMonTempOffsetName}, {kMonTempLowest, kMonTempLowestName}, {kMonTempHighest, kMonTempHighestName}, + {kMonTempLabel, kMonTempLabelName}, }; Monitor::Monitor(std::string path, RocmSMI_env_vars const *e) : @@ -152,6 +165,39 @@ int Monitor::readMonitor(MonitorTypes type, uint32_t sensor_id, return 
ReadSysfsStr(sysfs_path, val); } +uint32_t +Monitor::setSensorLabelMap(void) { + std::string type_str; + int ret; + + if (temp_type_index_map_.size() > 0) { + return 0; // We've already filled in the map + } + auto add_temp_sensor_entry = [&](uint32_t file_index) { + ret = readMonitor(kMonTempLabel, file_index, &type_str); + if (ret) { + return ret; + } + + rsmi_temperature_type_t t_type = kTempSensorNameMap.at(type_str); + temp_type_index_map_.insert({t_type, file_index}); + return 0; + }; + + for (uint32_t i = 1; i <= 3; ++i) { + ret = add_temp_sensor_entry(i); + if (ret) { + return ret; + } + } + return 0; +} + +uint32_t +Monitor::getSensorIndex(rsmi_temperature_type_t type) { + return temp_type_index_map_.at(type); +} + } // namespace smi } // namespace amd diff --git a/tests/rocm_smi_test/functional/temp_read.cc b/tests/rocm_smi_test/functional/temp_read.cc index e68185e689..b2e63ac5f1 100755 --- a/tests/rocm_smi_test/functional/temp_read.cc +++ b/tests/rocm_smi_test/functional/temp_read.cc @@ -48,12 +48,19 @@ #include #include +#include #include "gtest/gtest.h" #include "rocm_smi/rocm_smi.h" #include "rocm_smi_test/functional/temp_read.h" #include "rocm_smi_test/test_common.h" + +static const std::map kTempSensorNameMap = { + {RSMI_TEMP_TYPE_MEMORY, "Memory"}, + {RSMI_TEMP_TYPE_JUNCTION, "Junction"}, + {RSMI_TEMP_TYPE_EDGE, "Edge"}, +}; TestTempRead::TestTempRead() : TestBase() { set_title("RSMI Temp Read Test"); set_description("The Temperature Read tests verifies that the temperature " @@ -91,12 +98,14 @@ void TestTempRead::Run(void) { TestBase::Run(); + uint32_t type; + for (uint32_t i = 0; i < num_monitor_devs(); ++i) { PrintDeviceHeader(i); auto print_temp_metric = [&](rsmi_temperature_metric_t met, std::string label) { - err = rsmi_dev_temp_metric_get(i, 0, met, &val_i64); + err = rsmi_dev_temp_metric_get(i, type, met, &val_i64); if (err != RSMI_STATUS_SUCCESS) { if (err == RSMI_STATUS_NOT_SUPPORTED) { @@ -115,25 +124,31 @@ void TestTempRead::Run(void) 
{ "C" << std::endl; } }; - print_temp_metric(RSMI_TEMP_CURRENT, "Current Temp."); - print_temp_metric(RSMI_TEMP_MAX, "Temperature max value"); - print_temp_metric(RSMI_TEMP_MIN, "Temperature min value"); - print_temp_metric(RSMI_TEMP_MAX_HYST, - "Temperature hysteresis value for max limit"); - print_temp_metric(RSMI_TEMP_MIN_HYST, - "Temperature hysteresis value for min limit"); - print_temp_metric(RSMI_TEMP_CRITICAL, "Temperature critical max value"); - print_temp_metric(RSMI_TEMP_CRITICAL_HYST, - "Temperature hysteresis value for critical limit"); - print_temp_metric(RSMI_TEMP_EMERGENCY, - "Temperature emergency max value"); - print_temp_metric(RSMI_TEMP_EMERGENCY_HYST, - "Temperature hysteresis value for emergency limit"); - print_temp_metric(RSMI_TEMP_CRIT_MIN, "Temperature critical min value"); - print_temp_metric(RSMI_TEMP_CRIT_MIN_HYST, - "Temperature hysteresis value for critical min value"); - print_temp_metric(RSMI_TEMP_OFFSET, "Temperature offset"); - print_temp_metric(RSMI_TEMP_LOWEST, "Historical minimum temperature"); - print_temp_metric(RSMI_TEMP_HIGHEST, "Historical maximum temperature"); + for (type = RSMI_TEMP_TYPE_FIRST; type <= RSMI_TEMP_TYPE_LAST; ++type) { + IF_VERB(STANDARD) { + std::cout << "\t** **********" << kTempSensorNameMap.at(type) << + " Temperatures **********" << std::endl; + } + print_temp_metric(RSMI_TEMP_CURRENT, "Current Temp."); + print_temp_metric(RSMI_TEMP_MAX, "Temperature max value"); + print_temp_metric(RSMI_TEMP_MIN, "Temperature min value"); + print_temp_metric(RSMI_TEMP_MAX_HYST, + "Temperature hysteresis value for max limit"); + print_temp_metric(RSMI_TEMP_MIN_HYST, + "Temperature hysteresis value for min limit"); + print_temp_metric(RSMI_TEMP_CRITICAL, "Temperature critical max value"); + print_temp_metric(RSMI_TEMP_CRITICAL_HYST, + "Temperature hysteresis value for critical limit"); + print_temp_metric(RSMI_TEMP_EMERGENCY, + "Temperature emergency max value"); + print_temp_metric(RSMI_TEMP_EMERGENCY_HYST, + 
"Temperature hysteresis value for emergency limit"); + print_temp_metric(RSMI_TEMP_CRIT_MIN, "Temperature critical min value"); + print_temp_metric(RSMI_TEMP_CRIT_MIN_HYST, + "Temperature hysteresis value for critical min value"); + print_temp_metric(RSMI_TEMP_OFFSET, "Temperature offset"); + print_temp_metric(RSMI_TEMP_LOWEST, "Historical minimum temperature"); + print_temp_metric(RSMI_TEMP_HIGHEST, "Historical maximum temperature"); + } } }