Fall back to junction temperature and socket power

If the card does not report an edge temperature, fall back to the
junction temperature. If the card only has socket power, then use
socket power instead.

Change-Id: I053a67a89cf3b29a34e82123f522c08d7dd68916


[ROCm/rdc commit: 5cfe2b4169]
This commit is contained in:
Bill(Shuzhou) Liu
2024-02-05 10:06:31 -06:00
vanhempi 80d3711aca
commit d1efa59fe8
3 muutettua tiedostoa jossa 15 lisäystä ja 3 poistoa
@@ -65,7 +65,7 @@ def config_func(config):
if key == 'field_ids':
field_ids = []
for f in node.values:
field_id = rdc.get_field_id_from_name(f)
field_id = rdc.get_field_id_from_name(str.encode(f))
if field_id.value == rdc_field_t.RDC_FI_INVALID:
print("Invalid field '%s' will be ignored." % (f))
else:
@@ -85,7 +85,7 @@ def get_field_ids(args):
if len(field_id_str)> 0 :
for f in field_id_str:
field_id = rdc.get_field_id_from_name(f)
field_id = rdc.get_field_id_from_name(str.encode(f))
if field_id.value == rdc_field_t.RDC_FI_INVALID:
print("Invalid field '%s' will be ignored." % (f))
else:
@@ -364,12 +364,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
break;
case RDC_FI_POWER_USAGE:
value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64);
{
RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER;
// below call should handle both socket power and regular power
value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type);
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = static_cast<int64_t>(i64);
}
break;
}
case RDC_FI_GPU_CLOCK:
case RDC_FI_MEM_CLOCK:
rsmi_frequencies_t f;
@@ -404,6 +408,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
}
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64);
// Fall back to the junction (hotspot) temperature, as some cards may not have an edge temperature.
if (sensor_type == RSMI_TEMP_TYPE_EDGE
&& value->status == RSMI_STATUS_NOT_SUPPORTED) {
sensor_type = RSMI_TEMP_TYPE_JUNCTION;
value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type,
RSMI_TEMP_CURRENT, &val_i64);
}
value->type = INTEGER;
if (value->status == RSMI_STATUS_SUCCESS) {
value->value.l_int = val_i64;