From d1efa59fe85f02026d5feb6d1d51ebd81eb8f9cf Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 5 Feb 2024 10:06:31 -0600 Subject: [PATCH] Fallback to junction temperature and socket power If the card does not have edge temperature, fall back to junction temperature. If the card only has socket power, then use socket power instead. Change-Id: I053a67a89cf3b29a34e82123f522c08d7dd68916 [ROCm/rdc commit: 5cfe2b41690441f2eb12a8406ca169619250baf9] --- projects/rdc/python_binding/rdc_collectd.py | 2 +- projects/rdc/python_binding/rdc_prometheus.py | 2 +- .../rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc | 14 +++++++++++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/projects/rdc/python_binding/rdc_collectd.py b/projects/rdc/python_binding/rdc_collectd.py index 346dce8e5e..9ce808993c 100644 --- a/projects/rdc/python_binding/rdc_collectd.py +++ b/projects/rdc/python_binding/rdc_collectd.py @@ -65,7 +65,7 @@ def config_func(config): if key == 'field_ids': field_ids = [] for f in node.values: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." % (f)) else: diff --git a/projects/rdc/python_binding/rdc_prometheus.py b/projects/rdc/python_binding/rdc_prometheus.py index 77975b2606..fe486c534b 100644 --- a/projects/rdc/python_binding/rdc_prometheus.py +++ b/projects/rdc/python_binding/rdc_prometheus.py @@ -85,7 +85,7 @@ def get_field_ids(args): if len(field_id_str)> 0 : for f in field_id_str: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." 
% (f)) else: diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index fbbccdd45a..6f48376281 100644 --- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -364,12 +364,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } break; case RDC_FI_POWER_USAGE: - value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64); + { + RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER; + // below call should handle both socket power and regular power + value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type); value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = static_cast<int64_t>(i64); } break; + } case RDC_FI_GPU_CLOCK: case RDC_FI_MEM_CLOCK: rsmi_frequencies_t f; @@ -404,6 +408,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64); + // fallback to hotspot temperature as some card may not have edge temperature. + if (sensor_type == RSMI_TEMP_TYPE_EDGE + && value->status == RSMI_STATUS_NOT_SUPPORTED) { + sensor_type = RSMI_TEMP_TYPE_JUNCTION; + value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, + RSMI_TEMP_CURRENT, &val_i64); + } + value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = val_i64;