diff --git a/projects/rdc/python_binding/RdcReader.py b/projects/rdc/python_binding/RdcReader.py index 2dc69c66bb..12ba34a6f2 100644 --- a/projects/rdc/python_binding/RdcReader.py +++ b/projects/rdc/python_binding/RdcReader.py @@ -16,13 +16,13 @@ default_unit_coverter = { rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL: 0.000001, # MegaBytes rdc_field_t.RDC_FI_POWER_USAGE: 0.000001, # Watts rdc_field_t.RDC_FI_GPU_CLOCK: 0.000001, # MHz - rdc_field_t.RDC_FI_GPU_TEMP: 0.001 # degree + rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree } class RdcReader: # To run the RDC in embedded mode, set the ip_port = None def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids, - unit_converter = default_unit_coverter, + unit_converter: dict[int, float] = default_unit_coverter, update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000, field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group", gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem", @@ -98,7 +98,10 @@ class RdcReader: if value.type.value == rdc_field_type_t.INTEGER: value.value.l_int = int(value.value.l_int * self.unit_converter[fid]) if value.type.value == rdc_field_type_t.DOUBLE: - value.value.dbl = value.value.l_int * self.unit_converter[fid] + value.value.dbl = int(value.value.dbl * self.unit_converter[fid]) + # convert from double to l_int + if value.type.value == rdc_field_type_t.DOUBLE: + value.value.l_int = int(value.value.dbl) self.handle_field(gindex, value) has_succeed = True diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index 7c94576961..c2e48c02d7 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -132,6 +132,16 @@ class rdc_field_t(c_int): RDC_FI_XGMI_5_WRITE_KB = 713 RDC_FI_XGMI_6_WRITE_KB = 714 RDC_FI_XGMI_7_WRITE_KB = 715 + RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800 + RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU = 801 + RDC_FI_PROF_ACTIVE_CYCLES = 802 + RDC_FI_PROF_ACTIVE_WAVES = 803 + RDC_FI_PROF_ELAPSED_CYCLES = 804 + RDC_FI_PROF_EVAL_MEM_R_BW = 805 + RDC_FI_PROF_EVAL_MEM_W_BW = 806 + RDC_FI_PROF_EVAL_FLOPS_16 = 807 + RDC_FI_PROF_EVAL_FLOPS_32 = 808 + RDC_FI_PROF_EVAL_FLOPS_64 = 809 RDC_EVNT_XGMI_0_NOP_TX = 1000 RDC_EVNT_XGMI_0_REQ_TX = 1001 RDC_EVNT_XGMI_0_RESP_TX = 1002 diff --git a/projects/rdc/python_binding/rdc_prometheus.py b/projects/rdc/python_binding/rdc_prometheus.py index 00dd558602..d75aa7708c 100644 --- a/projects/rdc/python_binding/rdc_prometheus.py +++ b/projects/rdc/python_binding/rdc_prometheus.py @@ -10,12 +10,14 @@ default_field_ids = [ rdc_field_t.RDC_FI_POWER_USAGE, rdc_field_t.RDC_FI_GPU_CLOCK, rdc_field_t.RDC_FI_GPU_UTIL, - rdc_field_t.RDC_FI_GPU_TEMP + rdc_field_t.RDC_FI_GPU_TEMP, + rdc_field_t.RDC_FI_PROF_ACTIVE_CYCLES, + rdc_field_t.RDC_FI_PROF_ACTIVE_WAVES, ] class PrometheusReader(RdcReader): def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples, - gpu_indexes, rdc_unauth, enable_pci_id, enable_plugin_monitoring): + gpu_indexes, rdc_unauth, enable_plugin_monitoring): group_name = "rdc_prometheus_plugin_group" field_group_name = "rdc_prometheus_plugin_fieldgroup" if rdc_unauth: @@ -32,31 +34,7 @@ class PrometheusReader(RdcReader): REGISTRY.unregister(PROCESS_COLLECTOR) REGISTRY.unregister(PLATFORM_COLLECTOR) - # Use the PCI id as gpu_index if enabled - self.enable_pci_id = False - if enable_pci_id == True: - try: - import sys, os - # Relaive path of amd_smi to map gpu index to PCI id - # change smi_lib_path if the amd_smi is installed in different folder - smi_lib_relative_path = "../../bin" - smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path) - if os.path.exists(smi_lib_path+"/amd_smi.py"): - sys.path.append(smi_lib_path) - from amd_smi import getBus, initializeRsmi - initializeRsmi() - # Map between gpu indexes and PCIe bus addresses - self.index_to_bus_addr = {} - for item in self.gpu_indexes: - self.index_to_bus_addr[item] = getBus(item) - self.enable_pci_id = True - else: - print("cannot find smi_lib to map the PCI id") - except Exception as error: - print("Fail to get the PCI id", error) - - - # Create the guages + # Create the gauges self.guages = {} for fid in self.field_ids: field_name = self.rdc_util.field_id_string(fid).lower() @@ -64,8 +42,6 @@ class PrometheusReader(RdcReader): def handle_field(self, gpu_index, value): gpu_label = gpu_index - if self.enable_pci_id: - gpu_label = self.index_to_bus_addr[gpu_index] if value.field_id.value in self.guages: self.guages[value.field_id.value].labels(gpu_label).set(value.value.l_int) @@ -107,7 +83,6 @@ if __name__ == '__main__': parser.add_argument('--rdc_fields', default=None, nargs='+', help='The list of fields name needs to be watched, for example, " --rdc_fields RDC_FI_GPU_TEMP RDC_FI_POWER_USAGE " (default: predefined fields in the plugin)') parser.add_argument('--rdc_fields_file', default=None, help='The list of fields name can also be read from a file with each field name in a separated line (default: None)') parser.add_argument('--rdc_gpu_indexes', default=None, nargs='+', help='The list of GPUs to be watched (default: All GPUs)') - parser.add_argument('--enable_pci_id', default=False, action='store_true', help = 'Use the PCI Device Identifier to identify GPU (default: false)') parser.add_argument('--enable_plugin_monitoring', default=False, action='store_true', help = 'Set this option to collect process metrics of the plugin itself (default: false)') args = parser.parse_args() @@ -122,7 +97,7 @@ if __name__ == '__main__': reader = PrometheusReader(rdc_ip_port, field_ids, args.rdc_update_freq*1000000, args.rdc_max_keep_age, args.rdc_max_keep_samples, - args.rdc_gpu_indexes, args.rdc_unauth, args.enable_pci_id, args.enable_plugin_monitoring) + args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring) start_http_server(args.listen_port) print("The RDC Prometheus plugin listen at port %d" % (args.listen_port)) time.sleep(3)