The collectd plugin for RDC
Two files are added to the python_binding folder:
* rdc_collectd.py is a collectd plugin that stores the RDC
metrics in the collectd round-robin database.
* rdc_collectd.conf is a configuration file that controls
which fields to collect, how frequently the fields are collected,
and whether the plugin runs in embedded mode.
Change-Id: Ief44d004376ca8a82ed0d8ad36805243acb47080
[ROCm/rdc commit: bb6d98b036]
Este commit está contenido en:
@@ -0,0 +1,24 @@
|
||||
<Plugin python>
|
||||
ModulePath "/opt/rocm/rdc/python_binding"
|
||||
LogTraces true
|
||||
Interactive false
|
||||
Import "rdc_collectd"
|
||||
<Module rdc_collectd>
|
||||
# Run RDC in embedded mode (default: standalone mode)
|
||||
embedded false
|
||||
# The rdcd IP and port in standalone mode (default: localhost:50051)
|
||||
rdc_ip_port "localhost:50051"
|
||||
# Set this option to true if rdcd is running without authentication (unauth) in standalone mode (default: false)
|
||||
unauth false
|
||||
# The list of field names to watch (default: the fields built into the plugin), for example
|
||||
# field_ids "RDC_FI_GPU_TEMP" "RDC_FI_GPU_CLOCK"
|
||||
# The fields update frequency in seconds (default: 10)
|
||||
update_freq 10
|
||||
# The max keep age of the fields in seconds (default: 3600)
|
||||
max_keep_age 3600
|
||||
# The max samples to keep for each field in the cache (default: 1000)
|
||||
max_keep_samples 1000
|
||||
# The list of GPUs to be watched (default: All GPUs), for example
|
||||
# gpu_indexes 0 1
|
||||
</Module>
|
||||
</Plugin>
|
||||
@@ -0,0 +1,92 @@
|
||||
from RdcReader import RdcReader
|
||||
from rdc_bootstrap import *
|
||||
import collectd
|
||||
|
||||
# Fields watched when the plugin configuration has no "field_ids" option.
default_field_ids = [
    rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
    rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL,
    rdc_field_t.RDC_FI_POWER_USAGE,
    rdc_field_t.RDC_FI_GPU_CLOCK,
    rdc_field_t.RDC_FI_GPU_UTIL,
    rdc_field_t.RDC_FI_GPU_TEMP,
]
|
||||
|
||||
|
||||
class CollectdReader(RdcReader):
    """RdcReader subclass that dispatches each RDC field value to collectd."""

    def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples,
                 gpu_indexes, rdc_unauth):
        """Initialise the underlying RdcReader with this plugin's group names.

        Args:
            rdc_ip_port: Forwarded to RdcReader as ``ip_port``.
            field_ids: Field ids to watch.
            update_freq: Forwarded unchanged to RdcReader.
            max_keep_age: Forwarded unchanged to RdcReader.
            max_keep_samples: Forwarded unchanged to RdcReader.
            gpu_indexes: GPU indexes to watch.
            rdc_unauth: When truthy, ``root_ca=None`` is passed explicitly;
                otherwise ``root_ca`` is left at RdcReader's own default.
        """
        reader_kwargs = {
            "ip_port": rdc_ip_port,
            "field_ids": field_ids,
            "update_freq": update_freq,
            "max_keep_age": max_keep_age,
            "max_keep_samples": max_keep_samples,
            "gpu_indexes": gpu_indexes,
            "field_group_name": "rdc_collectd_plugin_fieldgroup",
            "gpu_group_name": "rdc_collectd_plugin_group",
        }
        if rdc_unauth:
            # Only override root_ca for unauthenticated connections.
            reader_kwargs["root_ca"] = None
        RdcReader.__init__(self, **reader_kwargs)

    def handle_field(self, gpu_index, value):
        """Dispatch one field sample to collectd as a gauge.

        Args:
            gpu_index: Index of the GPU the sample belongs to. It is used as
                the collectd plugin instance so that samples from different
                GPUs do not share one identifier.
            value: Sample object; ``value.field_id`` names the field and
                ``value.value.l_int`` is the number that is dispatched.
        """
        PLUGIN_NAME = "rdc_collectd"
        field_name = self.rdc_util.field_id_string(value.field_id).lower()
        collectd.Values(plugin=PLUGIN_NAME,
                        plugin_instance=str(gpu_index),
                        type_instance=field_name,
                        type="gauge",
                        values=[value.value.l_int]).dispatch()
|
||||
|
||||
# Module-level reader instance; assigned by config_func and used by read_callback.
g_reader = None
|
||||
|
||||
def config_func(config):
    """Parse the plugin configuration and create the global CollectdReader.

    Recognised keys (case-insensitive): ``embedded``, ``rdc_ip_port``,
    ``unauth``, ``field_ids``, ``update_freq``, ``max_keep_age``,
    ``max_keep_samples`` and ``gpu_indexes``. Keys without a value are
    reported and skipped; unknown keys are ignored.

    Args:
        config: Configuration node; each entry of ``config.children`` must
            provide ``key`` and ``values``.
    """
    global g_reader

    embedded = False                 # enable embedded if no rdcd
    rdc_ip_port = "localhost:50051"  # rdcd listen address
    field_ids = default_field_ids    # The fields to watch
    update_freq = 10                 # 10 seconds
    max_keep_age = 3600              # 1 hour
    max_keep_samples = 1000          # The max samples to keep for each field
    gpu_indexes = None               # All GPUs
    unauth = False                   # Enable auth by default

    # Parse configure parameters
    for node in config.children:
        key = node.key.lower()
        if not node.values:
            print("Missing value in configure " + key)
            continue

        val = node.values[0]
        if key == 'embedded':
            # Only an explicit true value switches to embedded mode.
            if val == True:
                embedded = True
        elif key == 'rdc_ip_port':
            rdc_ip_port = val
        elif key == 'unauth':
            unauth = val
        elif key == 'field_ids':
            field_ids = []
            for f in node.values:
                field_id = rdc.get_field_id_from_name(f)
                if field_id.value == rdc_field_t.RDC_FI_INVALID:
                    print("Invalid field '%s' will be ignored." % (f))
                else:
                    field_ids.append(field_id.value)
        elif key == 'update_freq':
            update_freq = int(val)
        elif key == 'max_keep_age':
            # Convert the configured value, not the default.
            max_keep_age = int(val)
        elif key == 'max_keep_samples':
            # Convert the configured value, not the default.
            max_keep_samples = int(val)
        elif key == 'gpu_indexes':
            gpu_indexes = [int(x) for x in node.values]

    if embedded:
        # Embedded mode passes no rdcd address to the reader.
        rdc_ip_port = None

    # update_freq is configured in seconds and scaled by 1000000 for the reader.
    g_reader = CollectdReader(rdc_ip_port, field_ids, update_freq * 1000000,
                              max_keep_age, max_keep_samples, gpu_indexes, unauth)
|
||||
|
||||
def read_callback(data=None):
    """Run one processing pass of the module-level reader.

    Registered with collectd as the read callback. ``data`` is accepted
    but not used.
    """
    reader = g_reader
    reader.process()
|
||||
|
||||
# Hook the plugin into collectd: config_func receives the plugin
# configuration, read_callback is registered as the read callback.
collectd.register_config(config_func)
collectd.register_read(read_callback)
|
||||
Referencia en una nueva incidencia
Block a user