default to gauge

Change-Id: Ia0428e61f023f10b02b3ebe103870d40c057abe3

Change values in question to gauges

Change-Id: I81c91c880246342a0ad0586f6dbe50b247a01117

fixes

Change-Id: I949438d3d3b511c22649640e082b59a3fb7696e0

Fix info handling

Change-Id: I8091fbfa55ba5a9c21c4569dd40e37fb432924f3

fix default

Change-Id: Ia449fed18730a06a858107e9218dc7b443a681fb
Этот коммит содержится в:
adapryor
2024-10-04 08:15:40 -05:00
коммит произвёл Galantsev, Dmitrii
родитель 9571dad23d
Коммит e847f74f78
2 изменённых файлов: 81 добавлений и 7 удалений
+39
Просмотреть файл
@@ -72,7 +72,14 @@ class rdc_field_type_t(c_int):
STRING = 2
BLOB = 3
class rdc_metric_type_t(c_int):
    """Integer codes naming the metric type of an RDC field.

    These are the values returned by ``rdc_field_t.get_rdc_metric_type``.
    """
    # Placeholder for "no valid metric type".
    INVALID = 0
    # Default type for any field without an explicit table entry.
    GAUGE = 1
    COUNTER = 2
    LABEL = 3
class rdc_field_t(c_int):
RDC_FI_INVALID = 0
RDC_FI_GPU_COUNT = 1
RDC_FI_DEV_NAME = 2
@@ -198,6 +205,38 @@ class rdc_field_t(c_int):
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
# Fields whose metric type is something other than GAUGE.
# get_rdc_metric_type() falls back to GAUGE for every field that is not
# listed here, so only the exceptions need an entry.
_rdc_metric_type_lookup = {
    RDC_FI_INVALID: rdc_metric_type_t.INVALID,
    RDC_FI_GPU_COUNT: rdc_metric_type_t.LABEL,
    RDC_FI_DEV_NAME: rdc_metric_type_t.LABEL,
    RDC_FI_OAM_ID: rdc_metric_type_t.LABEL,
    RDC_FI_GPU_MEMORY_TOTAL: rdc_metric_type_t.COUNTER,
    RDC_FI_ECC_CORRECT_TOTAL: rdc_metric_type_t.COUNTER,
    RDC_FI_ECC_UNCORRECT_TOTAL: rdc_metric_type_t.COUNTER,
    RDC_EVNT_NOTIF_VMFAULT: rdc_metric_type_t.COUNTER,
    RDC_EVNT_NOTIF_THERMAL_THROTTLE: rdc_metric_type_t.COUNTER,
    RDC_EVNT_NOTIF_PRE_RESET: rdc_metric_type_t.COUNTER,
    RDC_EVNT_NOTIF_POST_RESET: rdc_metric_type_t.COUNTER,
    RDC_EVNT_NOTIF_RING_HANG: rdc_metric_type_t.COUNTER,
}
@classmethod
def get_rdc_metric_type(cls, rdc_metric_t):
    """Return the metric type registered for a field, defaulting to GAUGE.

    Args:
        rdc_metric_t: Either a field id, or the attribute name of a field
            on this class (``str``), which is first resolved with
            ``getattr``.

    Returns:
        The ``rdc_metric_type_t`` code from ``_rdc_metric_type_lookup``, or
        ``rdc_metric_type_t.GAUGE`` when the name does not resolve or the
        field has no table entry.
    """
    field = rdc_metric_t
    if isinstance(field, str):
        # Names are resolved to the class attribute of the same name.
        field = getattr(cls, field, None)

    # Unresolvable name (or an explicit None): fall back to the default.
    if field is None:
        return rdc_metric_type_t.GAUGE

    return cls._rdc_metric_type_lookup.get(field, rdc_metric_type_t.GAUGE)
@classmethod
def get_field_name(cls, value):
    """Return the name of the first integer class attribute equal to *value*.

    Only attributes defined directly on ``cls`` (its own ``__dict__``) are
    searched, in definition order.

    Returns:
        The matching attribute name, or the string ``"Unknown field value"``
        when no integer attribute equals *value*.
    """
    matching_names = (
        name
        for name, candidate in cls.__dict__.items()
        if isinstance(candidate, int) and candidate == value
    )
    return next(matching_names, "Unknown field value")
rdc_handle_t = c_void_p
rdc_gpu_group_t = c_uint32
rdc_field_grp_t = c_uint32
+42 -7
Просмотреть файл
@@ -1,8 +1,11 @@
import argparse
import os
from RdcReader import RdcReader
from RdcUtil import RdcUtil
from rdc_bootstrap import *
from prometheus_client import start_http_server, Gauge, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
from prometheus_client import start_http_server, Gauge, Counter, Info, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
os.environ['PROMETHEUS_DISABLE_CREATED_SERIES'] = "True"
default_field_ids = [
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
@@ -35,16 +38,48 @@ class PrometheusReader(RdcReader):
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
# Create the gauges
self.guages = {}
# Create the metrics
self.gauges = {}
self.counters = {}
self.infos = {}
for fid in self.field_ids:
field_name = self.rdc_util.field_id_string(fid).lower()
self.guages[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
field_name = self.rdc_util.field_id_string(fid)
rdc_metric_type = rdc_field_t.get_rdc_metric_type(rdc_field_t.get_field_name(fid))
field_name = field_name.lower()
if rdc_metric_type == 1:
self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
elif rdc_metric_type == 2:
self.counters[fid] = Counter(field_name, field_name, labelnames=['gpu_index'])
else:
self.infos[fid] = Info(field_name, field_name, labelnames=['gpu_index'])
def handle_field(self, gpu_index, value):
    """Publish one RDC field sample to the Prometheus metric registered for it.

    The metric is looked up by ``value.field_id.value`` in ``self.gauges``,
    ``self.counters`` and ``self.infos``, in that order, and updated for the
    given GPU label. A sample whose field id is in none of the three dicts
    is ignored rather than raising ``KeyError``.

    Args:
        gpu_index: Label value passed to the metric's ``labels()`` call.
        value: Field sample. ``value.field_id.value`` selects the metric;
            ``value.value.l_int`` is the number written to gauges and
            counters; info metrics receive ``self.process_value(value)``.
    """
    field_id = value.field_id.value
    gpu_label = gpu_index

    if field_id in self.gauges:
        self.gauges[field_id].labels(gpu_label).set(value.value.l_int)
    elif field_id in self.counters:
        # NOTE(review): inc() adds the sample to the counter on every call.
        # If a field already reports a running total this will over-count;
        # confirm the semantics of the fields routed here.
        self.counters[field_id].labels(gpu_label).inc(value.value.l_int)
    elif field_id in self.infos:
        self.infos[field_id].labels(gpu_label).info({'gpu_label': self.process_value(value)})
    # Any other field id was never registered as a metric: nothing to update.
def process_value(self, value):
    """Render the payload of an RDC field sample as a string.

    The branch is chosen by comparing ``value.type.value`` against the
    ``rdc_field_type_t`` codes.

    Returns:
        ``str()`` of the integer or double payload, the string payload
        decoded as UTF-8, the hex form of a blob payload, or ``"unknown"``
        for any other type code.
    """
    kind = value.type.value

    if kind == rdc_field_type_t.INTEGER:
        return str(value.value.l_int)
    if kind == rdc_field_type_t.DOUBLE:
        return str(value.value.d_float)
    if kind == rdc_field_type_t.STRING:
        # Bytes that are not valid UTF-8 are dropped instead of raising.
        return value.value.str.decode('utf-8', 'ignore')
    if kind == rdc_field_type_t.BLOB:
        return value.value.str.hex()
    return "unknown"
def get_field_ids(args):
field_ids = []