2020-08-13 09:01:12 -04:00
import argparse
from RdcReader import RdcReader
from RdcUtil import RdcUtil
from rdc_bootstrap import *
from prometheus_client import start_http_server , Gauge , REGISTRY , PROCESS_COLLECTOR , PLATFORM_COLLECTOR
default_field_ids = [
rdc_field_t . RDC_FI_GPU_MEMORY_USAGE ,
rdc_field_t . RDC_FI_GPU_MEMORY_TOTAL ,
rdc_field_t . RDC_FI_POWER_USAGE ,
rdc_field_t . RDC_FI_GPU_CLOCK ,
rdc_field_t . RDC_FI_GPU_UTIL ,
rdc_field_t . RDC_FI_GPU_TEMP
]
class PrometheusReader ( RdcReader ):
def __init__ ( self , rdc_ip_port , field_ids , update_freq , max_keep_age , max_keep_samples ,
2022-05-05 09:16:05 -04:00
gpu_indexes , rdc_unauth , enable_pci_id , enable_plugin_monitoring ):
2020-08-13 09:01:12 -04:00
group_name = "rdc_prometheus_plugin_group"
field_group_name = "rdc_prometheus_plugin_fieldgroup"
if rdc_unauth :
RdcReader . __init__ ( self , ip_port = rdc_ip_port , field_ids = field_ids , update_freq = update_freq ,
max_keep_age = max_keep_age , max_keep_samples = max_keep_samples ,
gpu_indexes = gpu_indexes , field_group_name = field_group_name , gpu_group_name = group_name , root_ca = None )
else :
RdcReader . __init__ ( self , ip_port = rdc_ip_port , field_ids = field_ids , update_freq = update_freq ,
max_keep_age = max_keep_age , max_keep_samples = max_keep_samples ,
gpu_indexes = gpu_indexes , field_group_name = field_group_name , gpu_group_name = group_name )
# Supress internal metrics from prometheus_client
if enable_plugin_monitoring == False :
REGISTRY . unregister ( PROCESS_COLLECTOR )
REGISTRY . unregister ( PLATFORM_COLLECTOR )
2022-05-05 09:16:05 -04:00
# Use the PCI id as gpu_index if enabled
self . enable_pci_id = False
if enable_pci_id == True :
try :
import sys , os
2024-01-22 18:56:42 -06:00
# Relaive path of amd_smi to map gpu index to PCI id
# change smi_lib_path if the amd_smi is installed in different folder
2022-05-05 09:16:05 -04:00
smi_lib_relative_path = "../../bin"
smi_lib_path = os . path . join ( sys . path [ 0 ], smi_lib_relative_path )
2024-01-22 18:56:42 -06:00
if os . path . exists ( smi_lib_path + "/amd_smi.py" ):
2022-05-05 09:16:05 -04:00
sys . path . append ( smi_lib_path )
2024-01-22 18:56:42 -06:00
from amd_smi import getBus , initializeRsmi
2022-05-05 09:16:05 -04:00
initializeRsmi ()
# Map between gpu indexes and PCIe bus addresses
self . index_to_bus_addr = {}
for item in self . gpu_indexes :
self . index_to_bus_addr [ item ] = getBus ( item )
self . enable_pci_id = True
else :
print ( "cannot find smi_lib to map the PCI id" )
except Exception as error :
print ( "Fail to get the PCI id" , error )
2020-08-13 09:01:12 -04:00
# Create the guages
self . guages = {}
for fid in self . field_ids :
field_name = self . rdc_util . field_id_string ( fid ) . lower ()
self . guages [ fid ] = Gauge ( field_name , field_name , labelnames = [ 'gpu_index' ])
def handle_field ( self , gpu_index , value ):
2022-05-05 09:16:05 -04:00
gpu_label = gpu_index
if self . enable_pci_id :
gpu_label = self . index_to_bus_addr [ gpu_index ]
2020-08-13 09:01:12 -04:00
if value . field_id . value in self . guages :
2022-05-05 09:16:05 -04:00
self . guages [ value . field_id . value ] . labels ( gpu_label ) . set ( value . value . l_int )
2020-08-13 09:01:12 -04:00
def get_field_ids ( args ):
field_ids = []
field_id_str = []
if args . rdc_fields :
field_id_str = args . rdc_fields
elif args . rdc_fields_file :
try :
with open ( args . rdc_fields_file ) as fi :
content = fi . readlines ()
field_id_str = [ x . strip () for x in content ]
except Exception as e :
print ( "Fail to read " + args . rdc_fields_file + ":" + str ( e ))
if len ( field_id_str ) > 0 :
for f in field_id_str :
2024-02-05 10:06:31 -06:00
field_id = rdc . get_field_id_from_name ( str . encode ( f ))
2020-08-13 09:01:12 -04:00
if field_id . value == rdc_field_t . RDC_FI_INVALID :
print ( "Invalid field ' %s ' will be ignored." % ( f ))
else :
field_ids . append ( field_id . value )
return field_ids
return default_field_ids
if __name__ == '__main__' :
parser = argparse . ArgumentParser ( description = 'RDC Prometheus plugin.' )
parser . add_argument ( '--listen_port' , default = 5000 , type = int , help = 'The listen port of the plugin (default: 5000)' )
parser . add_argument ( '--rdc_embedded' , default = False , action = 'store_true' , help = 'Run RDC in embedded mode (default: standalone mode)' )
parser . add_argument ( '--rdc_ip_port' , default = 'localhost:50051' , help = 'The rdcd IP and port in standalone mode (default: localhost:50051)' )
parser . add_argument ( '--rdc_unauth' , default = False , action = 'store_true' , help = 'Set this option if the rdcd is running with unauth in standalone mode (default: false)' )
parser . add_argument ( '--rdc_update_freq' , default = 10 , help = 'The fields update frequency in seconds (default: 10)' )
parser . add_argument ( '--rdc_max_keep_age' , default = 3600 , help = 'The max keep age of the fields in seconds (default: 3600)' )
parser . add_argument ( '--rdc_max_keep_samples' , default = 1000 , help = 'The max samples to keep for each field in the cache (default: 1000)' )
parser . add_argument ( '--rdc_fields' , default = None , nargs = '+' , help = 'The list of fields name needs to be watched, for example, " --rdc_fields RDC_FI_GPU_TEMP RDC_FI_POWER_USAGE " (default: predefined fields in the plugin)' )
parser . add_argument ( '--rdc_fields_file' , default = None , help = 'The list of fields name can also be read from a file with each field name in a separated line (default: None)' )
parser . add_argument ( '--rdc_gpu_indexes' , default = None , nargs = '+' , help = 'The list of GPUs to be watched (default: All GPUs)' )
2022-05-05 09:16:05 -04:00
parser . add_argument ( '--enable_pci_id' , default = False , action = 'store_true' , help = 'Use the PCI Device Identifier to identify GPU (default: false)' )
2020-08-13 09:01:12 -04:00
parser . add_argument ( '--enable_plugin_monitoring' , default = False , action = 'store_true' , help = 'Set this option to collect process metrics of the plugin itself (default: false)' )
args = parser . parse_args ()
field_ids = get_field_ids ( args )
rdc_ip_port = args . rdc_ip_port
if args . rdc_embedded :
rdc_ip_port = None
2021-02-09 13:36:57 -05:00
if args . rdc_gpu_indexes != None :
for i in range ( 0 , len ( args . rdc_gpu_indexes )):
args . rdc_gpu_indexes [ i ] = int ( args . rdc_gpu_indexes [ i ])
2020-08-13 09:01:12 -04:00
reader = PrometheusReader ( rdc_ip_port , field_ids , args . rdc_update_freq * 1000000 ,
args . rdc_max_keep_age , args . rdc_max_keep_samples ,
2022-05-05 09:16:05 -04:00
args . rdc_gpu_indexes , args . rdc_unauth , args . enable_pci_id , args . enable_plugin_monitoring )
2020-08-13 09:01:12 -04:00
start_http_server ( args . listen_port )
print ( "The RDC Prometheus plugin listen at port %d " % ( args . listen_port ))
time . sleep ( 3 )
while True :
reader . process ()
time . sleep ( 1 )