diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py
index 5081734ce7..2f74afbf1b 100644
--- a/python_binding/rdc_bootstrap.py
+++ b/python_binding/rdc_bootstrap.py
@@ -72,7 +72,15 @@ class rdc_field_type_t(c_int):
     STRING = 2
     BLOB = 3
 
+class rdc_metric_type_t(c_int):
+    """Metric category assigned to an RDC field (gauge, counter or label)."""
+    INVALID = 0
+    GAUGE = 1
+    COUNTER = 2
+    LABEL = 3
+
 class rdc_field_t(c_int):
+    RDC_FI_INVALID = 0
     RDC_FI_GPU_COUNT = 1
     RDC_FI_DEV_NAME = 2
 
@@ -198,6 +206,49 @@ class rdc_field_t(c_int):
     RDC_HEALTH_POWER_THROTTLE_TIME = 3006
     RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
 
+    _rdc_metric_type_lookup = {
+        RDC_FI_INVALID: rdc_metric_type_t.INVALID,
+        RDC_FI_GPU_COUNT: rdc_metric_type_t.LABEL,
+        RDC_FI_DEV_NAME: rdc_metric_type_t.LABEL,
+        RDC_FI_OAM_ID: rdc_metric_type_t.LABEL,
+        RDC_FI_GPU_MEMORY_TOTAL: rdc_metric_type_t.COUNTER,
+        RDC_FI_ECC_CORRECT_TOTAL: rdc_metric_type_t.COUNTER,
+        RDC_FI_ECC_UNCORRECT_TOTAL: rdc_metric_type_t.COUNTER,
+        RDC_EVNT_NOTIF_VMFAULT: rdc_metric_type_t.COUNTER,
+        RDC_EVNT_NOTIF_THERMAL_THROTTLE: rdc_metric_type_t.COUNTER,
+        RDC_EVNT_NOTIF_PRE_RESET: rdc_metric_type_t.COUNTER,
+        RDC_EVNT_NOTIF_POST_RESET: rdc_metric_type_t.COUNTER,
+        RDC_EVNT_NOTIF_RING_HANG: rdc_metric_type_t.COUNTER,
+    }
+
+    @classmethod
+    def get_rdc_metric_type(cls, rdc_metric_t):
+        """Return the rdc_metric_type_t value for a field.
+
+        rdc_metric_t may be a field id, a ctypes integer wrapping one, or the
+        name of a field attribute on this class. Fields without an entry in
+        the lookup table, and names that are not attributes, map to GAUGE.
+        """
+        if isinstance(rdc_metric_t, str):
+            rdc_metric_t = getattr(cls, rdc_metric_t, None)
+        elif isinstance(rdc_metric_t, c_int):
+            # The table is keyed by plain ints, so unwrap ctypes integers first.
+            rdc_metric_t = rdc_metric_t.value
+
+        # A None key is never in the table, so unknown names fall through to GAUGE.
+        return cls._rdc_metric_type_lookup.get(rdc_metric_t, rdc_metric_type_t.GAUGE)
+
+    @classmethod
+    def get_field_name(cls, value):
+        """Return the name of the class attribute whose integer value equals value.
+
+        Returns the string "Unknown field value" when no attribute matches.
+        """
+        for attr_name, attr_value in cls.__dict__.items():
+            if isinstance(attr_value, int) and attr_value == value:
+                return attr_name
+        return "Unknown field value"
+
 rdc_handle_t = c_void_p
 rdc_gpu_group_t = c_uint32
 rdc_field_grp_t = c_uint32
diff --git a/python_binding/rdc_prometheus.py b/python_binding/rdc_prometheus.py
index 58a887ab07..eede725bf7 100644
--- a/python_binding/rdc_prometheus.py
+++ b/python_binding/rdc_prometheus.py
@@ -1,8 +1,14 @@
 import argparse
+import os
+
+# prometheus_client reads this variable once, while it is being imported, so it
+# must be set before the import below for the *_created series to be disabled.
+os.environ['PROMETHEUS_DISABLE_CREATED_SERIES'] = "True"
+
 from RdcReader import RdcReader
 from RdcUtil import RdcUtil
 from rdc_bootstrap import *
-from prometheus_client import start_http_server, Gauge, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
+from prometheus_client import start_http_server, Gauge, Counter, Info, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
 
 default_field_ids = [
         rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
@@ -35,16 +41,54 @@ class PrometheusReader(RdcReader):
         REGISTRY.unregister(PROCESS_COLLECTOR)
         REGISTRY.unregister(PLATFORM_COLLECTOR)
 
-        # Create the gauges
-        self.guages = {}
+        # Create one Prometheus metric per requested field, stored by field id
+        # in the dict that matches the field's metric type.
+        self.gauges = {}
+        self.counters = {}
+        self.infos = {}
+
         for fid in self.field_ids:
             field_name = self.rdc_util.field_id_string(fid).lower()
-            self.guages[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
+            rdc_metric_type = rdc_field_t.get_rdc_metric_type(fid)
+
+            if rdc_metric_type == rdc_metric_type_t.GAUGE:
+                self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
+            elif rdc_metric_type == rdc_metric_type_t.COUNTER:
+                self.counters[fid] = Counter(field_name, field_name, labelnames=['gpu_index'])
+            else:
+                # Everything that is neither a gauge nor a counter is exported as Info.
+                self.infos[fid] = Info(field_name, field_name, labelnames=['gpu_index'])
 
     def handle_field(self, gpu_index, value):
+        """Publish one sampled field value to its Prometheus metric.
+
+        Samples for field ids that have no registered metric are ignored.
+        """
+        fid = value.field_id.value
         gpu_label = gpu_index
-        if value.field_id.value in self.guages:
-            self.guages[value.field_id.value].labels(gpu_label).set(value.value.l_int)
+        if fid in self.gauges:
+            self.gauges[fid].labels(gpu_label).set(value.value.l_int)
+        elif fid in self.counters:
+            # NOTE(review): inc() adds the reading to the running total. If RDC
+            # reports these fields as cumulative totals this over-counts on every
+            # poll and should add only the delta since the last sample - confirm.
+            self.counters[fid].labels(gpu_label).inc(value.value.l_int)
+        elif fid in self.infos:
+            self.infos[fid].labels(gpu_label).info({'gpu_label': self.process_value(value)})
+
+    def process_value(self, value):
+        """Return the payload of a field value as a string, chosen by its type."""
+        if value.type.value == rdc_field_type_t.INTEGER:
+            return str(value.value.l_int)
+        elif value.type.value == rdc_field_type_t.DOUBLE:
+            # NOTE(review): confirm the value union really exposes ``d_float``.
+            return str(value.value.d_float)
+        elif value.type.value == rdc_field_type_t.STRING:
+            return value.value.str.decode('utf-8', 'ignore')
+        elif value.type.value == rdc_field_type_t.BLOB:
+            return value.value.str.hex()
+        else:
+            return "unknown"
 
 def get_field_ids(args):
     field_ids = []