RDC python binding

A new folder python_binding is created for RDC python binding:
* The rdc_bootstrap.py is a python ctypes wrapper for the librdc_bootstrap.so
* The RdcUtil.py defines common utilities for RDC to manage group/fieldgroup
* The RdcReader.py is a class to simplify the usage of the RDC:
  - The user only needs to specify which fields to monitor.
     RdcReader will create groups and fieldgroups, watch the fields, and fetch the fields.
  - The RdcReader can support embedded and standalone mode.
  - Standalone mode can be used with or without authentication.
  - In standalone mode, the RdcReader can automatically reconnect to the rdcd when the connection is lost.
  - When rdcd is restarted, the previously created group and fieldgroup may be lost.
    The RdcReader can re-create them and watch the fields again after reconnecting.
  - If the client is restarted, RdcReader can detect the groups and fieldgroups
    created before and avoid re-creating them.
  - The user can pass a unit converter to override the RDC default units.

Change-Id: I109ec86012f37162eb13f7d3e921115b7dd82369


[ROCm/rdc commit: 9209c6c516]
Este commit está contenido en:
Bill(Shuzhou) Liu
2020-07-29 11:59:20 -04:00
cometido por Chris Freehill
padre 6b246dcf4b
commit 14c9c17292
Se han modificado 7 ficheros con 495 adiciones y 0 borrados
+3
Ver fichero
@@ -164,6 +164,9 @@ install(TARGETS ${CLIENT_LIB}
install(DIRECTORY ${SOURCE_DIR}/authentication
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}
COMPONENT ${CLIENT_COMPONENT})
# Install the python_binding directory next to the other client files
# (same destination and component as the authentication directory).
install(DIRECTORY ${SOURCE_DIR}/python_binding
DESTINATION ${RDC_CLIENT_INSTALL_PREFIX}/${RDC}
COMPONENT ${CLIENT_COMPONENT})
# Generate Doxygen documentation for client api manual
find_package(Doxygen)
+11
Ver fichero
@@ -777,6 +777,17 @@ const char* rdc_status_string(rdc_status_t status);
*/
const char* field_id_string(rdc_field_t field_id);
/**
 *  @brief Get the field id from the field name
 *
 *  @details Look up the field id that corresponds to a field name.
 *
 *  @param[in] name The field name.
 *
 *  @retval The matching field id, or RDC_FI_INVALID if the field name is
 *  invalid.
 */
rdc_field_t get_field_id_from_name(const char* name);
#ifdef __cplusplus
}
#endif // __cplusplus
+14
Ver fichero
@@ -0,0 +1,14 @@
* Quick start
If you do not have RDC installed, specify the RDC library path using:
export LD_LIBRARY_PATH=<rdc_libs_path>
Then you can run RdcReader from the python_binding folder:
python RdcReader.py
* Prometheus plugin
* Collectd plugin
+142
Ver fichero
@@ -0,0 +1,142 @@
import os,time
from rdc_bootstrap import *
from RdcUtil import RdcUtil
# Fields that RdcReader watches when the caller does not pass field_ids.
default_field_ids = [
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE,
rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL,
rdc_field_t.RDC_FI_POWER_USAGE,
rdc_field_t.RDC_FI_GPU_CLOCK,
rdc_field_t.RDC_FI_GPU_UTIL,
rdc_field_t.RDC_FI_GPU_TEMP
]
# Per-field multiplicative factors that RdcReader.process() applies to each
# fetched value; fields without an entry are passed through unchanged.
# The trailing comment on each entry names the unit of the converted value.
default_unit_coverter = {
rdc_field_t.RDC_FI_GPU_MEMORY_USAGE: 0.000001, # MegaBytes
rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL: 0.000001, # MegaBytes
rdc_field_t.RDC_FI_POWER_USAGE: 0.000001, # Watts
rdc_field_t.RDC_FI_GPU_CLOCK: 0.000001, # MHz
rdc_field_t.RDC_FI_GPU_TEMP: 0.001 # degree
}
class RdcReader:
    """Watch a set of RDC fields on a set of GPUs and report their values.

    The constructor initializes RDC, either starts it in embedded mode or
    connects to a running rdcd, creates (or reuses) the GPU group and the
    field group, and starts watching the fields.  Call ``process()``
    periodically to fetch the latest values; every successfully fetched
    value is passed to ``handle_field()``.  Subclasses customize the
    behavior by overriding ``handle_field()`` and ``process_other_fields()``.
    """

    # To run the RDC in embedded mode, set the ip_port = None
    def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids,
            unit_converter = default_unit_coverter,
            update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000,
            field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group",
            gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem",
            client_cert = "/etc/rdc/client/certs/rdc_client_cert.pem",
            client_key = "/etc/rdc/client/private/rdc_client_cert.key"):
        """Initialize RDC, set up the groups and start watching the fields.

        Args:
            ip_port: "host:port" of rdcd for standalone mode; a falsy value
                (e.g. None) runs RDC in embedded mode.
            field_ids: field ids to watch.
            unit_converter: dict mapping a field id to a multiplicative
                factor, or None to disable conversion.
            update_freq, max_keep_age, max_keep_samples: passed unchanged to
                rdc_field_watch.
            field_group_name, gpu_group_name: names of the groups to create
                or reuse.
            gpu_indexes: GPU indexes to monitor; None selects every GPU
                reported by RDC.
            root_ca, client_cert, client_key: certificate/key file paths for
                an authenticated connection.  If any of them is None the
                connection is made without authentication.

        Raises:
            Exception: if RDC cannot be initialized, started/connected, or
                the fields cannot be watched.
        """
        result = rdc.rdc_init(0)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("RdcReader init fail: " + str(result))

        self.rdc_util = RdcUtil()
        self.unit_converter = unit_converter
        self.rdc_handle = c_void_p()
        self.is_standalone = True

        if not ip_port: # embedded
            self.is_standalone = False
            result = rdc.rdc_start_embedded(rdc_operation_mode_t.RDC_OPERATION_MODE_AUTO, self.rdc_handle)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("RdcReader start as embedded fail: " + str(result))
        else: # standalone
            if root_ca is None or client_cert is None or client_key is None:
                with_auth = False
                root_ca_str = client_cert_str = client_key_str = None
            else:
                with_auth = True
                root_ca_str = self.rdc_util.read_file(root_ca)
                client_cert_str = self.rdc_util.read_file(client_cert)
                client_key_str = self.rdc_util.read_file(client_key)
            result = rdc.rdc_connect(ip_port.encode('utf-8'), self.rdc_handle, root_ca_str, client_cert_str, client_key_str)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("RdcReader standalone auth(" + str(with_auth) + ") connect to " + ip_port+ " fail: " + str(result))

        # Create the GPU group
        self.gpu_group_name = gpu_group_name.encode()
        if gpu_indexes is None:
            self.gpu_indexes = self.rdc_util.get_all_gpu_indexes(self.rdc_handle)
        else:
            self.gpu_indexes = gpu_indexes
        self.gpu_group_id, _ = self.rdc_util.create_gpu_group(self.rdc_handle, self.gpu_group_name, self.gpu_indexes)

        # Create the field group
        self.field_ids = field_ids
        self.field_group_name = field_group_name.encode()
        self.field_group_id, _ = self.rdc_util.create_field_group(self.rdc_handle, self.field_group_name, self.field_ids)

        # Watch the fields
        self.update_freq = update_freq
        self.max_keep_age = max_keep_age
        self.max_keep_samples = max_keep_samples

        # Unwatch first to clean up what left from last run
        # (best effort: the result is intentionally not checked).
        rdc.rdc_field_unwatch(self.rdc_handle, self.gpu_group_id, self.field_group_id)
        result = rdc.rdc_field_watch(self.rdc_handle, self.gpu_group_id,
                self.field_group_id, self.update_freq, self.max_keep_age, self.max_keep_samples)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("RdcReader fail to watch group " + str(self.gpu_group_id) + ", field group " + str(self.field_group_id) + ":" + str(result))

    # Process the fields periodically
    def process(self):
        """Fetch the latest value of every watched field on every GPU.

        Each successfully fetched value is unit-converted (when a factor is
        configured for the field) and passed to ``handle_field()``.  If no
        value at all could be fetched, ``try_reconnect()`` is called.
        """
        has_succeed = False
        for gindex in self.gpu_indexes:
            for fid in self.field_ids:
                value = rdc_field_value()
                result = rdc.rdc_field_get_latest_value(self.rdc_handle,
                        gindex, fid, value)
                if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                    continue

                # Convert the unit
                if self.unit_converter is not None and fid in self.unit_converter:
                    factor = self.unit_converter[fid]
                    if value.type.value == rdc_field_type_t.INTEGER:
                        value.value.l_int = int(value.value.l_int * factor)
                    elif value.type.value == rdc_field_type_t.DOUBLE:
                        # Scale the double member itself; l_int is the
                        # integer view of the same union storage.
                        value.value.dbl = value.value.dbl * factor
                self.handle_field(gindex, value)
                has_succeed = True

        self.process_other_fields()

        # Nothing could be read although there was something to read:
        # the groups may be gone (e.g. rdcd was restarted).
        if len(self.gpu_indexes) != 0 and len(self.field_ids) != 0 and not has_succeed:
            self.try_reconnect()

    def process_other_fields(self):
        """Hook called once per ``process()``; does nothing by default."""
        pass

    def try_reconnect(self):
        """Re-create the groups and re-watch the fields in standalone mode.

        Does nothing in embedded mode.  Failures are printed, not raised,
        so the next ``process()`` call can retry.
        """
        if not self.is_standalone:
            return
        try:
            # When rdcd restart, the GPU and field group need to be re-created.
            self.gpu_group_id, gpu_group_created = self.rdc_util.create_gpu_group(self.rdc_handle, self.gpu_group_name, self.gpu_indexes)
            self.field_group_id, field_group_created = self.rdc_util.create_field_group(self.rdc_handle, self.field_group_name, self.field_ids)

            # rdcd restart requires to watch the group again
            if gpu_group_created or field_group_created:
                result = rdc.rdc_field_watch(self.rdc_handle, self.gpu_group_id,
                        self.field_group_id, self.update_freq, self.max_keep_age, self.max_keep_samples)
                if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                    raise Exception("RdcReader fail to watch group " + str(self.gpu_group_id) + ", field group " + str(self.field_group_id) + ":" + str(result))
        except Exception as e:
            print(e)

    def handle_field(self, gpu_index, value):
        """Print one fetched value; override to consume values differently.

        Args:
            gpu_index: index of the GPU the value was read from.
            value: the rdc_field_value filled in by RDC (after conversion).
        """
        field_name = self.rdc_util.field_id_string(value.field_id)
        if value.type.value == rdc_field_type_t.DOUBLE:
            # Print the double member; l_int would show its raw bit pattern.
            print("%d %d:%d %s:%f" % (value.ts, gpu_index, value.field_id.value, field_name, value.value.dbl))
        else:
            print("%d %d:%d %s:%d" % (value.ts, gpu_index, value.field_id.value, field_name, value.value.l_int))
if __name__ == '__main__':
    # Demo driver: embedded mode (no rdcd needed), polling once per second.
    reader = RdcReader(update_freq=1000000, ip_port=None)
    while True:
        time.sleep(1)
        reader.process()
+112
Ver fichero
@@ -0,0 +1,112 @@
from rdc_bootstrap import *
class RdcUtil:
    """Common helpers for managing RDC GPU groups and field groups."""

    def __init__(self):
        pass

    @staticmethod
    def _to_text(name):
        """Return *name* as str, decoding UTF-8 bytes when necessary."""
        if isinstance(name, bytes):
            return name.decode('utf-8')
        return str(name)

    @staticmethod
    def _to_bytes(name):
        """Return *name* as UTF-8 bytes, as required by c_char_p arguments."""
        if isinstance(name, bytes):
            return name
        return str(name).encode('utf-8')

    def get_all_gpu_indexes(self, rdc_handle):
        """Return the list of GPU indexes reported by rdc_device_get_all.

        Raises:
            Exception: if the RDC call fails.
        """
        gpu_count = c_uint32()
        gpu_index_list = (c_uint32 * RDC_MAX_NUM_DEVICES)()
        result = rdc.rdc_device_get_all(rdc_handle, gpu_index_list, gpu_count)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("Fail to get all GPus")
        return list(gpu_index_list[:gpu_count.value])

    def get_all_gpu_groups(self, rdc_handle):
        """Return a dict mapping each GPU group id to its rdc_group_info_t.

        Groups whose info cannot be fetched are left out of the result.

        Raises:
            Exception: if the list of group ids cannot be fetched.
        """
        all_groups = {}
        group_count = c_uint32()
        gpu_group_list = (c_uint32 * RDC_MAX_NUM_GROUPS)()
        result = rdc.rdc_group_get_all_ids(rdc_handle, gpu_group_list, group_count)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("Fail to get all groups")

        for group_id in gpu_group_list[:group_count.value]:
            group_info = rdc_group_info_t()
            result = rdc.rdc_group_gpu_get_info(rdc_handle, group_id, group_info)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                # Do not expose a structure RDC did not fill in.
                continue
            all_groups[group_id] = group_info
        return all_groups

    # Create gpu group if not exists
    # Return <gpu_group_id, is_created>
    def create_gpu_group(self, rdc_handle, gpu_group_name, gpu_indexes):
        """Create the GPU group, or reuse an identical existing one.

        An existing group with the same name is reused when it holds exactly
        *gpu_indexes*; otherwise it is destroyed and a new group is created.

        Args:
            rdc_handle: the RDC handle.
            gpu_group_name: group name, as bytes or str.
            gpu_indexes: GPU indexes the group must contain.

        Returns:
            (gpu_group_id, is_created) where gpu_group_id is an int.

        Raises:
            Exception: if any RDC call fails.
        """
        # Compare names as text: callers pass bytes while the C structure is
        # decoded, and bytes never compare equal to str on Python 3.
        wanted_name = self._to_text(gpu_group_name)
        wanted_indexes = list(gpu_indexes)

        # Can we reuse the exists one?
        for group_id, group_info in self.get_all_gpu_groups(rdc_handle).items():
            if self._to_text(group_info.group_name) != wanted_name:
                continue
            # Reuse existing group
            if list(group_info.entity_ids[:group_info.count]) == wanted_indexes:
                return group_id, False
            # delete old group
            result = rdc.rdc_group_gpu_destroy(rdc_handle, group_id)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("Fail to delete the GPU group")

        #Create new gpu group
        gpu_group_id = c_uint32()
        result = rdc.rdc_group_gpu_create(rdc_handle, rdc_group_type_t.RDC_GROUP_EMPTY,
                self._to_bytes(gpu_group_name), gpu_group_id)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("Fail to create the GPU group " + wanted_name)

        #Add GPU index to the group
        for gpu in wanted_indexes:
            result = rdc.rdc_group_gpu_add(rdc_handle, gpu_group_id, gpu)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("Fail to add GPU index " + str(gpu) + " to group " + str(gpu_group_id.value))

        # Return a plain int, like the reuse path above.
        return gpu_group_id.value, True

    def create_field_group(self, rdc_handle, field_group_name, field_ids):
        """Create the field group, or reuse an identical existing one.

        An existing field group with the same name is reused when it holds
        exactly *field_ids*; otherwise it is destroyed and a new one is
        created.

        Args:
            rdc_handle: the RDC handle.
            field_group_name: field group name, as bytes or str.
            field_ids: integer field ids the group must contain.

        Returns:
            (field_group_id, is_created) where field_group_id is an int.

        Raises:
            Exception: if any RDC call fails.
        """
        wanted_name = self._to_text(field_group_name)
        wanted_ids = list(field_ids)

        # Do we need to recreate the field group?
        # NOTE(review): the buffer is sized with
        # RDC_MAX_FIELD_IDS_PER_FIELD_GROUP although it holds group ids;
        # RDC_MAX_NUM_FIELD_GROUPS looks like the intended bound - confirm.
        field_group_id_list = (rdc_field_grp_t * RDC_MAX_FIELD_IDS_PER_FIELD_GROUP)()
        field_group_count = c_uint32()
        result = rdc.rdc_group_field_get_all_ids(rdc_handle, field_group_id_list, field_group_count)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("Fail to get all field group")

        for existing_id in field_group_id_list[:field_group_count.value]:
            group_info = rdc_field_group_info_t()
            result = rdc.rdc_group_field_get_info(rdc_handle, existing_id, pointer(group_info))
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("Fail to get field group " + str(existing_id) + " info")
            if self._to_text(group_info.group_name) != wanted_name:
                continue

            field_ids_ori = [e.value for e in group_info.field_ids[:group_info.count]]
            # reuse the old field group
            if wanted_ids == field_ids_ori:
                return existing_id, False
            result = rdc.rdc_group_field_destroy(rdc_handle, existing_id)
            if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
                raise Exception("Fail to delete field group " + str(existing_id))

        #Create new field group
        c_ids = (rdc_field_t * len(wanted_ids))(*[rdc_field_t(f) for f in wanted_ids])
        field_group_id = c_uint32()
        result = rdc.rdc_group_field_create(rdc_handle, len(wanted_ids), c_ids,
                self._to_bytes(field_group_name), field_group_id)
        if rdc_status_t(result) != rdc_status_t.RDC_ST_OK:
            raise Exception("Fail to create field group " + wanted_name + ": " + str(result))

        # Return a plain int, like the reuse path above.
        return field_group_id.value, True

    def field_id_string(self, field_id):
        """Return the string RDC reports for *field_id*, decoded as UTF-8."""
        return rdc.field_id_string(field_id).decode("utf-8")

    def read_file(self, file_name):
        """Return the content of *file_name* as UTF-8 bytes.

        Best effort: on any failure the error is printed and None is
        returned instead of raising.
        """
        try:
            with open(file_name, 'r') as file:
                return file.read().encode('utf-8')
        except Exception as e:
            print("Fail to read " + file_name + ":" + str(e))
            return None
+205
Ver fichero
@@ -0,0 +1,205 @@
import os, time
import ctypes.util
from ctypes import *
from enum import Enum
# The library is named by file name only, so the dynamic loader must be able
# to locate it (for example through LD_LIBRARY_PATH).
librdc = "librdc_bootstrap.so"
# The python ctypes wrapper for "librdc_bootstrap.so"
rdc = CDLL(librdc)
# Sentinel values and size limits used by this binding.
# NOTE(review): presumably these mirror the definitions in the RDC C header -
# confirm when the header changes.
GPU_ID_INVALID = -1
RDC_GROUP_ALL_GPUS = -1000
RDC_JOB_STATS_FIELDS = -1000
# The array sizes below match the literal lengths used in the ctypes
# structures of this module (c_char*256, c_uint32*64, rdc_field_t*128, ...).
RDC_MAX_STR_LENGTH = 256
RDC_GROUP_MAX_ENTITIES = 64
RDC_MAX_NUM_DEVICES = 16
RDC_MAX_FIELD_IDS_PER_FIELD_GROUP = 128
RDC_MAX_NUM_GROUPS = 64
RDC_MAX_NUM_FIELD_GROUPS = 64
class rdc_status_t(Enum):
    """Status codes returned by the RDC functions.

    The class is used both as a ctypes ``restype`` (ctypes calls it with the
    returned C int) and inside ``argtypes`` (ctypes calls ``from_param``).
    """

    @classmethod
    def from_param(cls, obj):
        """Convert *obj* to the plain int that ctypes passes to C.

        ctypes invokes ``from_param(obj)`` on the class, so this has to be a
        classmethod.  Enum members are unwrapped to their value; anything
        else is converted with ``int()``.
        """
        if isinstance(obj, cls):
            return obj.value
        return int(obj)

    RDC_ST_OK = 0
    RDC_ST_NOT_SUPPORTED = 1
    RDC_ST_MSI_ERROR = 2
    RDC_ST_FAIL_LOAD_MODULE = 3
    RDC_ST_INVALID_HANDLER = 4
    RDC_ST_BAD_PARAMETER = 5
    RDC_ST_NOT_FOUND = 6
    RDC_ST_CONFLICT = 7
    RDC_ST_CLIENT_ERROR = 8
    RDC_ST_ALREADY_EXIST = 9
    RDC_ST_MAX_LIMIT = 10
class rdc_operation_mode_t(c_int):
    """C int carrying an RDC operation mode; constants are plain ints."""

    RDC_OPERATION_MODE_AUTO = 0
    RDC_OPERATION_MODE_MANUAL = 1
class rdc_group_type_t(c_int):
    """C int carrying an RDC GPU group type; constants are plain ints."""

    RDC_GROUP_DEFAULT = 0
    RDC_GROUP_EMPTY = 1
class rdc_field_type_t(c_int):
    """C int carrying the data type of a field value; constants are plain ints."""

    INTEGER = 0
    DOUBLE = 1
    STRING = 2
    BLOB = 3
class rdc_field_t(c_int):
    """C int carrying an RDC field id; constants are plain ints."""

    RDC_FI_INVALID = 0

    # Identification
    RDC_FI_GPU_COUNT = 1
    RDC_FI_DEV_NAME = 2

    # Clocks
    RDC_FI_GPU_CLOCK = 100
    RDC_FI_MEM_CLOCK = 101

    # Temperatures
    RDC_FI_MEMORY_TEMP = 200
    RDC_FI_GPU_TEMP = 201

    # Power
    RDC_FI_POWER_USAGE = 300

    # PCIe
    RDC_FI_PCIE_TX = 400
    RDC_FI_PCIE_RX = 401

    # Utilization and memory
    RDC_FI_GPU_UTIL = 500
    RDC_FI_GPU_MEMORY_USAGE = 501
    RDC_FI_GPU_MEMORY_TOTAL = 502

    # ECC
    RDC_FI_ECC_CORRECT_TOTAL = 600
    RDC_FI_ECC_UNCORRECT_TOTAL = 601
# ctypes aliases for the handle and id types used in the prototypes below.
rdc_handle_t = c_void_p
rdc_gpu_group_t = c_uint32
rdc_field_grp_t = c_uint32
class rdc_device_attributes_t(Structure):
    """ctypes layout of the device attributes (the device name only)."""

    _fields_ = [
        ("device_name", c_char * 256),
    ]
class rdc_group_info_t(Structure):
    """ctypes layout of a GPU group: entry count, group name and entity ids."""

    _fields_ = [
        ("count", c_uint32),
        ("group_name", c_char * 256),
        ("entity_ids", c_uint32 * 64),
    ]
class rdc_stats_summary_t(Structure):
    """ctypes layout of a statistics summary (max, min, average, std dev)."""

    _fields_ = [
        ("max_value", c_uint64),
        ("min_value", c_uint64),
        ("average", c_uint64),
        ("standard_deviation", c_double),
    ]
class rdc_gpu_usage_info_t(Structure):
    """ctypes layout of the usage information collected for one GPU."""

    _fields_ = [
        ("gpu_id", c_uint32),
        ("start_time", c_uint64),
        ("end_time", c_uint64),
        ("energy_consumed", c_uint64),
        ("ecc_correct", c_uint64),
        ("ecc_uncorrect", c_uint64),
        ("pcie_tx", rdc_stats_summary_t),
        ("pcie_rx", rdc_stats_summary_t),
        ("power_usage", rdc_stats_summary_t),
        ("gpu_clock", rdc_stats_summary_t),
        ("memory_clock", rdc_stats_summary_t),
        ("gpu_utilization", rdc_stats_summary_t),
        ("gpu_temperature", rdc_stats_summary_t),
        ("max_gpu_memory_used", c_uint64),
        ("memory_utilization", rdc_stats_summary_t),
    ]
class rdc_job_info_t(Structure):
    """ctypes layout of job statistics: GPU count, summary and per-GPU usage."""

    _fields_ = [
        ("num_gpus", c_uint32),
        ("summary", rdc_gpu_usage_info_t),
        ("gpus", rdc_gpu_usage_info_t * 16),
    ]
class rdc_anonymous_0(ctypes.Union):
    """Union holding a field value as an integer, a double or a string."""

    _fields_ = [
        ("l_int", c_int64),
        ("dbl", c_double),
        ("str", c_char * 256),
    ]
class rdc_field_value(Structure):
    """ctypes layout of one field sample: id, status, timestamp, type, value."""

    _fields_ = [
        ("field_id", rdc_field_t),
        ("status", c_int),
        ("ts", c_uint64),
        ("type", rdc_field_type_t),
        ("value", rdc_anonymous_0),
    ]
class rdc_field_group_info_t(Structure):
    """ctypes layout of a field group: entry count, group name and field ids."""

    _fields_ = [
        ("count", c_uint32),
        ("group_name", c_char * 256),
        ("field_ids", rdc_field_t * 128),
    ]
class rdc_job_group_info_t(Structure):
    """ctypes layout of a job record: job id, GPU group id, start/stop times."""

    _fields_ = [
        ("job_id", c_char * 256),
        ("group_id", rdc_gpu_group_t),
        ("start_time", c_uint64),
        ("stop_time", c_uint64),
    ]
# ctypes prototypes for the functions exported by the loaded library.
# restype = rdc_status_t is a callable that is not a ctypes type, so ctypes
# calls it with the returned C int and hands the resulting rdc_status_t
# member to the Python caller.

# Library lifetime and connection
rdc.rdc_init.restype = rdc_status_t
rdc.rdc_init.argtypes = [ c_uint64 ]
rdc.rdc_shutdown.restype = rdc_status_t
rdc.rdc_shutdown.argtypes = [ ]
rdc.rdc_start_embedded.restype = rdc_status_t
rdc.rdc_start_embedded.argtypes = [ rdc_operation_mode_t,POINTER(rdc_handle_t) ]
rdc.rdc_stop_embedded.restype = rdc_status_t
rdc.rdc_stop_embedded.argtypes = [ rdc_handle_t ]
rdc.rdc_connect.restype = rdc_status_t
rdc.rdc_connect.argtypes = [ c_char_p,POINTER(rdc_handle_t),c_char_p,c_char_p,c_char_p ]
rdc.rdc_disconnect.restype = rdc_status_t
rdc.rdc_disconnect.argtypes = [ rdc_handle_t ]
# Job statistics
rdc.rdc_job_start_stats.restype = rdc_status_t
rdc.rdc_job_start_stats.argtypes = [ rdc_handle_t,rdc_gpu_group_t,POINTER(c_char),c_uint64 ]
rdc.rdc_job_get_stats.restype = rdc_status_t
rdc.rdc_job_get_stats.argtypes = [ rdc_handle_t,POINTER(c_char),POINTER(rdc_job_info_t) ]
rdc.rdc_job_stop_stats.restype = rdc_status_t
rdc.rdc_job_stop_stats.argtypes = [ rdc_handle_t,POINTER(c_char) ]
rdc.rdc_job_remove.restype = rdc_status_t
rdc.rdc_job_remove.argtypes = [ rdc_handle_t,POINTER(c_char) ]
rdc.rdc_job_remove_all.restype = rdc_status_t
rdc.rdc_job_remove_all.argtypes = [ rdc_handle_t ]
rdc.rdc_field_update_all.restype = rdc_status_t
rdc.rdc_field_update_all.argtypes = [ rdc_handle_t,c_uint32 ]
# Device discovery
rdc.rdc_device_get_all.restype = rdc_status_t
rdc.rdc_device_get_all.argtypes = [ rdc_handle_t,POINTER(c_uint32),POINTER(c_uint32) ]
rdc.rdc_device_get_attributes.restype = rdc_status_t
rdc.rdc_device_get_attributes.argtypes = [ rdc_handle_t,c_uint32,POINTER(rdc_device_attributes_t) ]
# GPU groups
rdc.rdc_group_gpu_create.restype = rdc_status_t
rdc.rdc_group_gpu_create.argtypes = [ rdc_handle_t,rdc_group_type_t,c_char_p,POINTER(rdc_gpu_group_t) ]
rdc.rdc_group_gpu_add.restype = rdc_status_t
rdc.rdc_group_gpu_add.argtypes = [ rdc_handle_t,rdc_gpu_group_t,c_uint32 ]
rdc.rdc_group_gpu_get_info.restype = rdc_status_t
rdc.rdc_group_gpu_get_info.argtypes = [ rdc_handle_t,rdc_gpu_group_t,POINTER(rdc_group_info_t) ]
rdc.rdc_group_get_all_ids.restype = rdc_status_t
rdc.rdc_group_get_all_ids.argtypes = [ rdc_handle_t,POINTER(rdc_gpu_group_t),POINTER(c_uint32) ]
rdc.rdc_group_gpu_destroy.restype = rdc_status_t
rdc.rdc_group_gpu_destroy.argtypes = [ rdc_handle_t,rdc_gpu_group_t ]
# Field groups
rdc.rdc_group_field_create.restype = rdc_status_t
rdc.rdc_group_field_create.argtypes = [ rdc_handle_t,c_uint32,POINTER(rdc_field_t),c_char_p,POINTER(rdc_field_grp_t) ]
rdc.rdc_group_field_get_info.restype = rdc_status_t
rdc.rdc_group_field_get_info.argtypes = [ rdc_handle_t,rdc_field_grp_t,POINTER(rdc_field_group_info_t) ]
rdc.rdc_group_field_get_all_ids.restype = rdc_status_t
rdc.rdc_group_field_get_all_ids.argtypes = [ rdc_handle_t,POINTER(rdc_field_grp_t),POINTER(c_uint32) ]
rdc.rdc_group_field_destroy.restype = rdc_status_t
rdc.rdc_group_field_destroy.argtypes = [ rdc_handle_t,rdc_field_grp_t ]
# Watching and reading fields
rdc.rdc_field_watch.restype = rdc_status_t
rdc.rdc_field_watch.argtypes = [ rdc_handle_t,rdc_gpu_group_t,rdc_field_grp_t,c_uint64,c_double,c_uint32 ]
rdc.rdc_field_get_latest_value.restype = rdc_status_t
rdc.rdc_field_get_latest_value.argtypes = [ rdc_handle_t,c_uint32,rdc_field_t,POINTER(rdc_field_value) ]
rdc.rdc_field_get_value_since.restype = rdc_status_t
rdc.rdc_field_get_value_since.argtypes = [ rdc_handle_t,c_uint32,rdc_field_t,c_uint64,POINTER(c_uint64),POINTER(rdc_field_value) ]
rdc.rdc_field_unwatch.restype = rdc_status_t
rdc.rdc_field_unwatch.argtypes = [ rdc_handle_t,rdc_gpu_group_t,rdc_field_grp_t ]
# String helpers
rdc.rdc_status_string.restype = c_char_p
rdc.rdc_status_string.argtypes = [ rdc_status_t ]
rdc.field_id_string.restype = c_char_p
rdc.field_id_string.argtypes = [ rdc_field_t ]
rdc.get_field_id_from_name.restype = rdc_field_t
rdc.get_field_id_from_name.argtypes = [ c_char_p ]
@@ -357,6 +357,14 @@ const char* field_id_string(rdc_field_t field_id) {
return field_id_to_descript.find(field_id)->second.label.c_str();
}
// Map a field name to its field id; returns RDC_FI_INVALID when the name is
// null or is not recognized by the lookup helper.
rdc_field_t get_field_id_from_name(const char* name) {
  // A null pointer can never name a field; reject it here rather than
  // forwarding it to the lookup helper.
  if (name == nullptr) {
    return RDC_FI_INVALID;
  }

  // Initialized so the variable never holds an indeterminate value.
  rdc_field_t value = RDC_FI_INVALID;
  if (amd::rdc::get_field_id_from_name(name, &value)) {
    return value;
  }
  return RDC_FI_INVALID;
}
char *strncpy_with_null(char *dest, const char *src, size_t n) {
if (n == 0) {
return dest;