2020-08-13 09:01:12 -04:00
import argparse
2024-10-04 08:15:40 -05:00
import os
2020-08-13 09:01:12 -04:00
from RdcReader import RdcReader
from RdcUtil import RdcUtil
from rdc_bootstrap import *
2024-10-04 08:15:40 -05:00
from prometheus_client import start_http_server , Gauge , Counter , Info , REGISTRY , PROCESS_COLLECTOR , PLATFORM_COLLECTOR
os . environ [ 'PROMETHEUS_DISABLE_CREATED_SERIES' ] = "True"
2020-08-13 09:01:12 -04:00
# Fields watched when neither --rdc_fields nor --rdc_fields_file yields any
# field name (see get_field_ids).
default_field_ids = [
    # Memory
    rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
    rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL,
    # Power, clock, utilization and temperature
    rdc_field_t.RDC_FI_POWER_USAGE,
    rdc_field_t.RDC_FI_GPU_CLOCK,
    rdc_field_t.RDC_FI_GPU_UTIL,
    rdc_field_t.RDC_FI_GPU_TEMP,
    # Profiler fields
    rdc_field_t.RDC_FI_PROF_ACTIVE_CYCLES,
    rdc_field_t.RDC_FI_PROF_ACTIVE_WAVES,
    rdc_field_t.RDC_FI_PROF_OCCUPANCY_PERCENT,
]
class PrometheusReader(RdcReader):
    """RdcReader subclass that publishes watched RDC fields as Prometheus metrics.

    One metric, labelled by ``gpu_index``, is created per watched field id.
    ``handle_field`` then updates the matching metric with every field value
    it is given.
    """

    def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples,
                 gpu_indexes, rdc_unauth, enable_plugin_monitoring):
        """Set up the underlying RdcReader and create one metric per field.

        Args:
            rdc_ip_port: Forwarded to RdcReader as ``ip_port``.
            field_ids: Field ids to watch; forwarded to RdcReader.
            update_freq: Forwarded unchanged to RdcReader.
            max_keep_age: Forwarded unchanged to RdcReader.
            max_keep_samples: Forwarded unchanged to RdcReader.
            gpu_indexes: Forwarded unchanged to RdcReader.
            rdc_unauth: When truthy, RdcReader is initialised with
                ``root_ca=None``; otherwise RdcReader's own default is used.
            enable_plugin_monitoring: When falsy, prometheus_client's process
                and platform collectors are removed from the default registry.
        """
        reader_kwargs = {
            'ip_port': rdc_ip_port,
            'field_ids': field_ids,
            'update_freq': update_freq,
            'max_keep_age': max_keep_age,
            'max_keep_samples': max_keep_samples,
            'gpu_indexes': gpu_indexes,
            'field_group_name': "rdc_prometheus_plugin_fieldgroup",
            'gpu_group_name': "rdc_prometheus_plugin_group",
        }
        if rdc_unauth:
            # Only override root_ca for unauthenticated mode so that the
            # authenticated path keeps whatever default RdcReader defines.
            reader_kwargs['root_ca'] = None
        RdcReader.__init__(self, **reader_kwargs)

        # Suppress internal metrics from prometheus_client
        if not enable_plugin_monitoring:
            REGISTRY.unregister(PROCESS_COLLECTOR)
            REGISTRY.unregister(PLATFORM_COLLECTOR)

        # Create the metrics, keyed by field id.
        # NOTE(review): this module sets PROMETHEUS_DISABLE_CREATED_SERIES
        # after importing prometheus_client; confirm the library still honours
        # it at that point, otherwise counters may export *_created series.
        self.gauges = {}
        self.counters = {}
        self.infos = {}
        for fid in self.field_ids:
            field_name = self.rdc_util.field_id_string(fid).lower()
            rdc_metric_type = rdc_field_t.get_rdc_metric_type(rdc_field_t.get_field_name(fid))
            # Metric type code 1 maps to a Gauge, 2 to a Counter, and anything
            # else is exported as an Info metric.
            if rdc_metric_type == 1:
                self.gauges[fid] = Gauge(field_name, field_name, labelnames=['gpu_index'])
            elif rdc_metric_type == 2:
                self.counters[fid] = Counter(field_name, field_name, labelnames=['gpu_index'])
            else:
                self.infos[fid] = Info(field_name, field_name, labelnames=['gpu_index'])

    @staticmethod
    def _numeric_value(value):
        """Return the numeric payload of *value* from the member matching its type.

        DOUBLE values are read from ``d_float``; every other type falls back
        to ``l_int``.
        """
        if value.type.value == rdc_field_type_t.DOUBLE:
            return value.value.d_float
        return value.value.l_int

    def handle_field(self, gpu_index, value):
        """Update the metric registered for ``value.field_id`` with *value*.

        Args:
            gpu_index: Used as the ``gpu_index`` label of the metric.
            value: Field value object exposing ``field_id``, ``type`` and
                ``value`` (presumably supplied by RdcReader for each sample;
                confirm against RdcReader).

        Raises:
            KeyError: If the field id has no gauge, counter or info metric.
        """
        gpu_label = gpu_index
        field_id = value.field_id.value
        if field_id in self.gauges:
            self.gauges[field_id].labels(gpu_label).set(self._numeric_value(value))
        elif field_id in self.counters:
            # NOTE(review): the reported value is added to the counter on
            # every call; if RDC reports running totals rather than deltas
            # this over-counts. Confirm the intended semantics.
            self.counters[field_id].labels(gpu_label).inc(self._numeric_value(value))
        else:
            self.infos[field_id].labels(gpu_label).info({'gpu_label': self.process_value(value)})

    def process_value(self, value):
        """Render *value* as a string according to its field type.

        Integers and doubles are stringified, strings are decoded as UTF-8
        (undecodable bytes are dropped), blobs are hex-encoded, and any other
        type yields ``"unknown"``.
        """
        value_type = value.type.value
        if value_type == rdc_field_type_t.INTEGER:
            return str(value.value.l_int)
        elif value_type == rdc_field_type_t.DOUBLE:
            return str(value.value.d_float)
        elif value_type == rdc_field_type_t.STRING:
            return value.value.str.decode('utf-8', 'ignore')
        elif value_type == rdc_field_type_t.BLOB:
            return value.value.str.hex()
        else:
            return "unknown"
2020-08-13 09:01:12 -04:00
def get_field_ids(args):
    """Resolve the RDC field ids to watch from the parsed command-line args.

    ``args.rdc_fields`` takes precedence over ``args.rdc_fields_file``. The
    file is expected to hold one field name per line; blank lines are skipped.
    When no field name is obtained (nothing requested, the file is empty, or
    it cannot be read) ``default_field_ids`` is returned.

    Names that RDC does not recognise are reported on stdout and skipped, so
    the returned list can be empty if every requested name is invalid.

    Args:
        args: Parsed arguments exposing ``rdc_fields`` (list of names or
            None) and ``rdc_fields_file`` (path or None).

    Returns:
        A list of field ids.
    """
    field_names = []
    if args.rdc_fields:
        field_names = args.rdc_fields
    elif args.rdc_fields_file:
        try:
            with open(args.rdc_fields_file) as fi:
                # Drop blank lines so they are not looked up and reported as
                # invalid field names.
                stripped = (line.strip() for line in fi)
                field_names = [name for name in stripped if name]
        except (OSError, ValueError) as e:
            # Best effort: report the problem and fall back to the defaults.
            # ValueError also covers undecodable file content.
            print("Fail to read " + args.rdc_fields_file + ":" + str(e))

    if not field_names:
        return default_field_ids

    field_ids = []
    for name in field_names:
        field_id = rdc.get_field_id_from_name(name.encode())
        if field_id.value == rdc_field_t.RDC_FI_INVALID:
            print("Invalid field ' %s ' will be ignored." % (name))
        else:
            field_ids.append(field_id.value)
    return field_ids
if __name__ == '__main__':
    # Script entry point: parse options, start the RDC reader and the
    # Prometheus HTTP endpoint, then poll RDC forever.

    # `time` is not imported explicitly at module level; import it here rather
    # than relying on a star import to provide it.
    import time

    parser = argparse.ArgumentParser(description='RDC Prometheus plugin.')
    parser.add_argument('--listen_port', default=5000, type=int,
                        help='The listen port of the plugin (default: 5000)')
    parser.add_argument('--rdc_embedded', default=False, action='store_true',
                        help='Run RDC in embedded mode (default: standalone mode)')
    parser.add_argument('--rdc_ip_port', default='localhost:50051',
                        help='The rdcd IP and port in standalone mode (default: localhost:50051)')
    parser.add_argument('--rdc_unauth', default=False, action='store_true',
                        help='Set this option if the rdcd is running with unauth in standalone mode (default: false)')
    # The numeric options need type=int: without it a user-supplied value is a
    # str, and e.g. `rdc_update_freq * 1000000` below would repeat the string
    # instead of multiplying.
    parser.add_argument('--rdc_update_freq', default=10, type=int,
                        help='The fields update frequency in seconds (default: 10)')
    parser.add_argument('--rdc_max_keep_age', default=3600, type=int,
                        help='The max keep age of the fields in seconds (default: 3600)')
    parser.add_argument('--rdc_max_keep_samples', default=1000, type=int,
                        help='The max samples to keep for each field in the cache (default: 1000)')
    parser.add_argument('--rdc_fields', default=None, nargs='+',
                        help='The list of fields name needs to be watched, for example, " --rdc_fields RDC_FI_GPU_TEMP RDC_FI_POWER_USAGE " (default: predefined fields in the plugin)')
    parser.add_argument('--rdc_fields_file', default=None,
                        help='The list of fields name can also be read from a file with each field name in a separated line (default: None)')
    # type=int makes argparse convert (and validate) every GPU index.
    parser.add_argument('--rdc_gpu_indexes', default=None, nargs='+', type=int,
                        help='The list of GPUs to be watched (default: All GPUs)')
    parser.add_argument('--enable_plugin_monitoring', default=False, action='store_true',
                        help='Set this option to collect process metrics of the plugin itself (default: false)')

    args = parser.parse_args()

    field_ids = get_field_ids(args)

    # Embedded mode is selected by passing no rdcd address to the reader.
    rdc_ip_port = None if args.rdc_embedded else args.rdc_ip_port

    # The update frequency is given in seconds on the command line and scaled
    # by 1,000,000 for the reader (presumably microseconds; confirm against
    # RdcReader).
    reader = PrometheusReader(rdc_ip_port, field_ids, args.rdc_update_freq * 1000000,
                              args.rdc_max_keep_age, args.rdc_max_keep_samples,
                              args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring)

    start_http_server(args.listen_port)
    print("The RDC Prometheus plugin listen at port %d " % (args.listen_port))

    time.sleep(3)
    while True:
        reader.process()
        time.sleep(1)