Fix Prometheus counters
default to gauge Change-Id: Ia0428e61f023f10b02b3ebe103870d40c057abe3 Change values in question to gauges Change-Id: I81c91c880246342a0ad0586f6dbe50b247a01117 fixes Change-Id: I949438d3d3b511c22649640e082b59a3fb7696e0 Fix info handling Change-Id: I8091fbfa55ba5a9c21c4569dd40e37fb432924f3 fix default Change-Id: Ia449fed18730a06a858107e9218dc7b443a681fb
Этот коммит содержится в:
коммит произвёл
Galantsev, Dmitrii
родитель
9571dad23d
Коммит
e847f74f78
@@ -72,7 +72,14 @@ class rdc_field_type_t(c_int):
|
||||
STRING = 2
|
||||
BLOB = 3
|
||||
|
||||
class rdc_metric_type_t(c_int):
    """Integer codes naming the kind of metric a field is exported as.

    The codes are plain class-level ints (0 through 3) attached to a
    ``c_int`` subclass.
    """

    # Kept as explicit literals so each name's numeric code stays visible.
    INVALID, GAUGE, COUNTER, LABEL = 0, 1, 2, 3
|
||||
|
||||
class rdc_field_t(c_int):
|
||||
|
||||
RDC_FI_INVALID = 0
|
||||
RDC_FI_GPU_COUNT = 1
|
||||
RDC_FI_DEV_NAME = 2
|
||||
@@ -198,6 +205,38 @@ class rdc_field_t(c_int):
|
||||
RDC_HEALTH_POWER_THROTTLE_TIME = 3006
|
||||
RDC_HEALTH_THERMAL_THROTTLE_TIME = 3007
|
||||
|
||||
_rdc_metric_type_lookup = {
|
||||
RDC_FI_INVALID: rdc_metric_type_t.INVALID,
|
||||
RDC_FI_GPU_COUNT: rdc_metric_type_t.LABEL,
|
||||
RDC_FI_DEV_NAME: rdc_metric_type_t.LABEL,
|
||||
RDC_FI_OAM_ID: rdc_metric_type_t.LABEL,
|
||||
RDC_FI_GPU_MEMORY_TOTAL: rdc_metric_type_t.COUNTER,
|
||||
RDC_FI_ECC_CORRECT_TOTAL: rdc_metric_type_t.COUNTER,
|
||||
RDC_FI_ECC_UNCORRECT_TOTAL: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_VMFAULT: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_THERMAL_THROTTLE: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_PRE_RESET: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_POST_RESET: rdc_metric_type_t.COUNTER,
|
||||
RDC_EVNT_NOTIF_RING_HANG: rdc_metric_type_t.COUNTER,
|
||||
}
|
||||
|
||||
@classmethod
def get_rdc_metric_type(cls, rdc_metric_t):
    """Return the metric type registered for a field, defaulting to GAUGE.

    Args:
        rdc_metric_t: Either a field value usable as a key of
            ``cls._rdc_metric_type_lookup``, or a string naming an
            attribute of ``cls`` whose value is such a key.

    Returns:
        The entry from ``cls._rdc_metric_type_lookup`` for the field, or
        ``rdc_metric_type_t.GAUGE`` when the name does not resolve to an
        attribute, the argument is ``None``, or the field has no entry.
    """
    if isinstance(rdc_metric_t, str):
        # Resolve an attribute name to its value; unknown names become None.
        field = getattr(cls, rdc_metric_t, None)
    else:
        field = rdc_metric_t

    if field is None:
        return rdc_metric_type_t.GAUGE
    return cls._rdc_metric_type_lookup.get(field, rdc_metric_type_t.GAUGE)
|
||||
|
||||
@classmethod
def get_field_name(cls, value):
    """Return the name of the first int attribute of ``cls`` equal to ``value``.

    Only attributes defined directly on ``cls`` (its own ``__dict__``) are
    searched, in definition order.

    Args:
        value: The value to look up.

    Returns:
        The matching attribute name, or the string ``"Unknown field value"``
        when no int attribute compares equal to ``value``.
    """
    matching_names = (
        name
        for name, member in cls.__dict__.items()
        if isinstance(member, int) and member == value
    )
    return next(matching_names, "Unknown field value")
|
||||
|
||||
rdc_handle_t = c_void_p
|
||||
rdc_gpu_group_t = c_uint32
|
||||
rdc_field_grp_t = c_uint32
|
||||
|
||||
@@ -1,8 +1,11 @@
|
||||
import argparse
|
||||
import os
|
||||
from RdcReader import RdcReader
|
||||
from RdcUtil import RdcUtil
|
||||
from rdc_bootstrap import *
|
||||
from prometheus_client import start_http_server, Gauge, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
|
||||
from prometheus_client import start_http_server, Gauge, Counter, Info, REGISTRY, PROCESS_COLLECTOR, PLATFORM_COLLECTOR
|
||||
|
||||
os.environ['PROMETHEUS_DISABLE_CREATED_SERIES'] = "True"
|
||||
|
||||
default_field_ids = [
|
||||
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
|
||||
@@ -35,16 +38,48 @@ class PrometheusReader(RdcReader):
|
||||
REGISTRY.unregister(PROCESS_COLLECTOR)
|
||||
REGISTRY.unregister(PLATFORM_COLLECTOR)
|
||||
|
||||
# Create the gauges
|
||||
self.guages = {}
|
||||
# Create the metrics
|
||||
self.gauges = {}
|
||||
self.counters = {}
|
||||
self.infos = {}
|
||||
|
||||
for fid in self.field_ids:
|
||||
field_name = self.rdc_util.field_id_string(fid).lower()
|
||||
self.guages[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
|
||||
field_name = self.rdc_util.field_id_string(fid)
|
||||
|
||||
rdc_metric_type = rdc_field_t.get_rdc_metric_type(rdc_field_t.get_field_name(fid))
|
||||
|
||||
field_name = field_name.lower()
|
||||
|
||||
|
||||
if rdc_metric_type == 1:
|
||||
self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
|
||||
elif rdc_metric_type == 2:
|
||||
self.counters[fid] = Counter(field_name, field_name, labelnames=['gpu_index'])
|
||||
else:
|
||||
self.infos[fid] = Info(field_name, field_name, labelnames=['gpu_index'])
|
||||
|
||||
|
||||
|
||||
def handle_field(self, gpu_index, value):
    """Publish one sampled field value to the metric registered for it.

    The field id selects which collection the sample goes to:
    ``self.gauges`` entries are set to the integer reading,
    ``self.counters`` entries are incremented by it, and ``self.infos``
    entries receive the reading rendered by ``self.process_value``.

    Args:
        gpu_index: Passed to ``labels()`` as the metric's label value.
        value: Sample exposing ``field_id.value`` and ``value.l_int``.

    Field ids that are not present in any of the three collections are
    ignored rather than raising ``KeyError``.
    """
    field_id = value.field_id.value

    if field_id in self.gauges:
        self.gauges[field_id].labels(gpu_index).set(value.value.l_int)
    elif field_id in self.counters:
        # NOTE(review): inc() adds the raw reading on every sample. If the
        # source reports a running total, the exported counter will grow by
        # that total each time -- confirm whether a delta is intended.
        self.counters[field_id].labels(gpu_index).inc(value.value.l_int)
    elif field_id in self.infos:
        self.infos[field_id].labels(gpu_index).info(
            {'gpu_label': self.process_value(value)}
        )
|
||||
|
||||
def process_value(self, value):
    """Render a sampled field value as a string according to its type tag.

    Args:
        value: Sample exposing ``type.value`` (compared against the
            ``rdc_field_type_t`` codes) and a ``value`` payload with
            ``l_int``, ``d_float`` and ``str`` members.

    Returns:
        The integer or double reading converted with ``str()``, the string
        payload decoded as UTF-8 (undecodable bytes dropped), the blob
        payload as a hex string, or ``"unknown"`` for any other type tag.
    """
    kind = value.type.value
    payload = value.value

    if kind == rdc_field_type_t.INTEGER:
        return str(payload.l_int)
    if kind == rdc_field_type_t.DOUBLE:
        return str(payload.d_float)
    if kind == rdc_field_type_t.STRING:
        return payload.str.decode('utf-8', 'ignore')
    if kind == rdc_field_type_t.BLOB:
        return payload.str.hex()
    return "unknown"
|
||||
|
||||
def get_field_ids(args):
|
||||
field_ids = []
|
||||
|
||||
Ссылка в новой задаче
Block a user