diff --git a/python_binding/rdc_collectd.py b/python_binding/rdc_collectd.py index 346dce8e5e..9ce808993c 100644 --- a/python_binding/rdc_collectd.py +++ b/python_binding/rdc_collectd.py @@ -65,7 +65,7 @@ def config_func(config): if key == 'field_ids': field_ids = [] for f in node.values: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." % (f)) else: diff --git a/python_binding/rdc_prometheus.py b/python_binding/rdc_prometheus.py index 77975b2606..fe486c534b 100644 --- a/python_binding/rdc_prometheus.py +++ b/python_binding/rdc_prometheus.py @@ -85,7 +85,7 @@ def get_field_ids(args): if len(field_id_str)> 0 : for f in field_id_str: - field_id = rdc.get_field_id_from_name(f) + field_id = rdc.get_field_id_from_name(str.encode(f)) if field_id.value == rdc_field_t.RDC_FI_INVALID: print("Invalid field '%s' will be ignored." % (f)) else: diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index fbbccdd45a..6f48376281 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -364,12 +364,16 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } break; case RDC_FI_POWER_USAGE: - value->status = rsmi_dev_power_ave_get(gpu_index, RSMI_TEMP_CURRENT, &i64); + { + RSMI_POWER_TYPE power_type = RSMI_CURRENT_POWER; + // below call should handle both socket power and regular power + value->status = rsmi_dev_power_get(gpu_index, &i64, &power_type); value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = static_cast(i64); } break; + } case RDC_FI_GPU_CLOCK: case RDC_FI_MEM_CLOCK: rsmi_frequencies_t f; @@ -404,6 +408,14 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, RSMI_TEMP_CURRENT, &val_i64); + // fallback to hotspot temperature as some card may not have edge temperature. + if (sensor_type == RSMI_TEMP_TYPE_EDGE + && value->status == RSMI_STATUS_NOT_SUPPORTED) { + sensor_type = RSMI_TEMP_TYPE_JUNCTION; + value->status = rsmi_dev_temp_metric_get(gpu_index, sensor_type, + RSMI_TEMP_CURRENT, &val_i64); + } + value->type = INTEGER; if (value->status == RSMI_STATUS_SUCCESS) { value->value.l_int = val_i64;