From 121f5617fe20263d6fd8d9e7d6ffc1576fec2f67 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Mon, 26 Oct 2020 15:51:48 -0400 Subject: [PATCH] The collectd plugin for RDC Two files are added to the python_binding folder: * The rdc_collectd.py is a collectd plugin to store the RDC metrics to the collectd round robin database. * The rdc_collectd.conf is a configuration file that controls which fields to collect, how frequently the fields are collected, and whether the plugin runs in embedded mode. Change-Id: Ief44d004376ca8a82ed0d8ad36805243acb47080 [ROCm/rdc commit: bb6d98b036129a6ed767d4e07255fdbafa867cbf] --- projects/rdc/python_binding/rdc_collectd.conf | 24 +++++ projects/rdc/python_binding/rdc_collectd.py | 92 +++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 projects/rdc/python_binding/rdc_collectd.conf create mode 100644 projects/rdc/python_binding/rdc_collectd.py diff --git a/projects/rdc/python_binding/rdc_collectd.conf b/projects/rdc/python_binding/rdc_collectd.conf new file mode 100644 index 0000000000..a75c77e91a --- /dev/null +++ b/projects/rdc/python_binding/rdc_collectd.conf @@ -0,0 +1,24 @@ + + ModulePath "/opt/rocm/rdc/python_binding" + LogTraces true + Interactive false + Import "rdc_collectd" + + # Run RDC in embedded mode (default: standalone mode) + embedded false + # The rdcd IP and port in standalone mode (default: localhost:50051) + rdc_ip_port "localhost:50051" + # Set this option if the rdcd is running with unauth in standalone mode (default: false) + unauth false + # The list of field names to be watched (default: fields in the plugin), for example + # field_ids "RDC_FI_GPU_TEMP" "RDC_FI_GPU_CLOCK" + # The field update frequency in seconds (default: 10) + update_freq 10 + # The max keep age of the fields in seconds (default: 3600) + max_keep_age 3600 + # The max samples to keep for each field in the cache (default: 1000) + max_keep_samples 1000 + # The list of GPUs to be watched (default: All GPUs), for 
"""collectd Python plugin that publishes RDC GPU metrics.

The plugin builds a ``CollectdReader`` (an ``RdcReader`` subclass) from the
plugin configuration and, on every collectd read cycle, asks it to process
the watched fields.  Each value handed to ``handle_field`` is dispatched to
collectd as a ``gauge``.

Options recognised in the plugin's configuration block (keys are matched
case-insensitively):

    embedded          true to run RDC in embedded mode (default: false,
                      i.e. connect to a standalone rdcd)
    rdc_ip_port       rdcd address used in standalone mode
                      (default: "localhost:50051")
    unauth            true if rdcd is running unauthenticated
                      (default: false)
    field_ids         names of the fields to watch, for example
                          field_ids "RDC_FI_GPU_TEMP" "RDC_FI_GPU_CLOCK"
                      (default: the fields in ``default_field_ids``)
    update_freq       field update frequency in seconds (default: 10)
    max_keep_age      maximum age of kept samples in seconds (default: 3600)
    max_keep_samples  maximum samples kept per field (default: 1000)
    gpu_indexes       GPUs to watch, for example
                          gpu_indexes 0 1
                      (default: all GPUs)
"""

from RdcReader import RdcReader
from rdc_bootstrap import *
import collectd

# Fields watched when the configuration does not provide ``field_ids``.
default_field_ids = [
    rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
    rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL,
    rdc_field_t.RDC_FI_POWER_USAGE,
    rdc_field_t.RDC_FI_GPU_CLOCK,
    rdc_field_t.RDC_FI_GPU_UTIL,
    rdc_field_t.RDC_FI_GPU_TEMP
]

# Name reported to collectd as the ``plugin`` part of every value.
_PLUGIN_NAME = "rdc_collectd"

# ``update_freq`` is configured in seconds and scaled by this factor before
# being handed to RdcReader (presumably microseconds -- TODO confirm against
# RdcReader).
_USEC_PER_SEC = 1000000


class CollectdReader(RdcReader):
    """RdcReader that forwards every handled field value to collectd."""

    def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age,
                 max_keep_samples, gpu_indexes, rdc_unauth):
        """Initialise the underlying RdcReader with the plugin's group names.

        All arguments except ``rdc_unauth`` are passed straight through to
        ``RdcReader.__init__`` under the keyword of the same meaning.

        Args:
            rdc_ip_port: rdcd address, or None (set by ``config_func`` when
                embedded mode is requested).
            field_ids: list of field ids to watch.
            update_freq: update frequency, already scaled by the caller.
            max_keep_age: maximum age of kept samples.
            max_keep_samples: maximum samples kept per field.
            gpu_indexes: list of GPU indexes to watch, or None.
            rdc_unauth: when truthy, ``root_ca=None`` is passed to RdcReader;
                otherwise RdcReader's own default for ``root_ca`` is used.
        """
        reader_args = dict(
            ip_port=rdc_ip_port,
            field_ids=field_ids,
            update_freq=update_freq,
            max_keep_age=max_keep_age,
            max_keep_samples=max_keep_samples,
            gpu_indexes=gpu_indexes,
            field_group_name="rdc_collectd_plugin_fieldgroup",
            gpu_group_name="rdc_collectd_plugin_group",
        )
        if rdc_unauth:
            # Only override root_ca when authentication is disabled.
            reader_args["root_ca"] = None
        RdcReader.__init__(self, **reader_args)

    def handle_field(self, gpu_index, value):
        """Dispatch one field value to collectd as a gauge.

        The GPU index is reported as the collectd ``plugin_instance`` so that
        the same field read from different GPUs does not end up under one
        identifier, and the lower-cased field name is the ``type_instance``.

        Args:
            gpu_index: index of the GPU the value belongs to.
            value: field value object; ``value.field_id`` and
                ``value.value.l_int`` are read.
        """
        # NOTE(review): only the integer member (l_int) is dispatched, as in
        # the original code -- confirm that every watched field is integral.
        field_name = self.rdc_util.field_id_string(value.field_id).lower()
        collectd.Values(plugin=_PLUGIN_NAME,
                        plugin_instance=str(gpu_index),
                        type_instance=field_name,
                        type="gauge",
                        values=[value.value.l_int]).dispatch()


# Reader created by config_func() and used by read_callback().
g_reader = None


def _to_bool(val):
    """Interpret a configuration value as a boolean.

    Strings are compared case-insensitively against the usual "true"
    spellings so that a quoted "false" is not treated as truthy; any other
    value falls back to Python truthiness.
    """
    if isinstance(val, str):
        return val.strip().lower() in ("true", "yes", "on", "1")
    return bool(val)


def config_func(config):
    """collectd config callback: parse the options and create the reader.

    Args:
        config: collectd configuration node; each child's ``key`` and
            ``values`` are read.  Children without values are reported and
            skipped; unknown keys are ignored.
    """
    global g_reader

    embedded = False                  # enable embedded if no rdcd
    rdc_ip_port = "localhost:50051"   # rdcd listen address
    field_ids = default_field_ids     # The fields to watch
    update_freq = 10                  # 10 seconds
    max_keep_age = 3600               # 1 hour
    max_keep_samples = 1000           # The max samples to keep for each field
    gpu_indexes = None                # All GPUs
    unauth = False                    # Enable auth by default

    # Parse configure parameters
    for node in config.children:
        key = node.key.lower()
        if not node.values:
            collectd.warning("Missing value in configure " + key)
            continue

        val = node.values[0]
        if key == 'embedded':
            embedded = _to_bool(val)
        elif key == 'rdc_ip_port':
            rdc_ip_port = val
        elif key == 'unauth':
            unauth = _to_bool(val)
        elif key == 'field_ids':
            field_ids = []
            for f in node.values:
                field_id = rdc.get_field_id_from_name(f)
                if field_id.value == rdc_field_t.RDC_FI_INVALID:
                    collectd.warning(
                        "Invalid field '%s' will be ignored." % (f))
                else:
                    field_ids.append(field_id.value)
        elif key == 'update_freq':
            update_freq = int(val)
        elif key == 'max_keep_age':
            # Convert the configured value (not the default).
            max_keep_age = int(val)
        elif key == 'max_keep_samples':
            # Convert the configured value (not the default).
            max_keep_samples = int(val)
        elif key == 'gpu_indexes':
            gpu_indexes = [int(x) for x in node.values]

    if embedded:
        # Embedded mode is requested by passing no rdcd address.
        rdc_ip_port = None
    g_reader = CollectdReader(rdc_ip_port, field_ids,
                              update_freq * _USEC_PER_SEC,
                              max_keep_age, max_keep_samples,
                              gpu_indexes, unauth)


def read_callback(data=None):
    """collectd read callback: let the reader process the watched fields.

    Args:
        data: unused; accepted because collectd may pass user data.
    """
    if g_reader is None:
        # The config callback never ran, so there is no reader to poll.
        collectd.error("rdc_collectd: plugin is not configured; "
                       "no RDC reader available")
        return
    g_reader.process()


collectd.register_config(config_func)
collectd.register_read(read_callback)