Update python_interface and remove --enable_pci_id

Change-Id: Ie5d511f3da25221bf60bc669ab172323703a1c45
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: bbe0b3573c]
Cette révision appartient à :
Galantsev, Dmitrii
2024-07-15 18:50:15 -05:00
révisé par Dmitrii Galantsev
Parent 98ba530267
révision 3dd90a6ff2
3 fichiers modifiés avec 22 ajouts et 34 suppressions
+6 -3
Voir le fichier
@@ -16,13 +16,13 @@ default_unit_coverter = {
rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL: 0.000001, # MegaBytes
rdc_field_t.RDC_FI_POWER_USAGE: 0.000001, # Watts
rdc_field_t.RDC_FI_GPU_CLOCK: 0.000001, # MHz
rdc_field_t.RDC_FI_GPU_TEMP: 0.001 # degree
rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree
}
class RdcReader:
# To run the RDC in embedded mode, set the ip_port = None
def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids,
unit_converter = default_unit_coverter,
unit_converter: dict[int, float] = default_unit_coverter,
update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000,
field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group",
gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem",
@@ -98,7 +98,10 @@ class RdcReader:
if value.type.value == rdc_field_type_t.INTEGER:
value.value.l_int = int(value.value.l_int * self.unit_converter[fid])
if value.type.value == rdc_field_type_t.DOUBLE:
value.value.dbl = value.value.l_int * self.unit_converter[fid]
value.value.dbl = int(value.value.dbl * self.unit_converter[fid])
# convert from double to l_int
if value.type.value == rdc_field_type_t.DOUBLE:
value.value.l_int = int(value.value.dbl)
self.handle_field(gindex, value)
has_succeed = True
+10
Voir le fichier
@@ -132,6 +132,16 @@ class rdc_field_t(c_int):
RDC_FI_XGMI_5_WRITE_KB = 713
RDC_FI_XGMI_6_WRITE_KB = 714
RDC_FI_XGMI_7_WRITE_KB = 715
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU = 801
RDC_FI_PROF_ACTIVE_CYCLES = 802
RDC_FI_PROF_ACTIVE_WAVES = 803
RDC_FI_PROF_ELAPSED_CYCLES = 804
RDC_FI_PROF_EVAL_MEM_R_BW = 805
RDC_FI_PROF_EVAL_MEM_W_BW = 806
RDC_FI_PROF_EVAL_FLOPS_16 = 807
RDC_FI_PROF_EVAL_FLOPS_32 = 808
RDC_FI_PROF_EVAL_FLOPS_64 = 809
RDC_EVNT_XGMI_0_NOP_TX = 1000
RDC_EVNT_XGMI_0_REQ_TX = 1001
RDC_EVNT_XGMI_0_RESP_TX = 1002
+6 -31
Voir le fichier
@@ -10,12 +10,14 @@ default_field_ids = [
rdc_field_t.RDC_FI_POWER_USAGE,
rdc_field_t.RDC_FI_GPU_CLOCK,
rdc_field_t.RDC_FI_GPU_UTIL,
rdc_field_t.RDC_FI_GPU_TEMP
rdc_field_t.RDC_FI_GPU_TEMP,
rdc_field_t.RDC_FI_PROF_ACTIVE_CYCLES,
rdc_field_t.RDC_FI_PROF_ACTIVE_WAVES,
]
class PrometheusReader(RdcReader):
def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples,
gpu_indexes, rdc_unauth, enable_pci_id, enable_plugin_monitoring):
gpu_indexes, rdc_unauth, enable_plugin_monitoring):
group_name = "rdc_prometheus_plugin_group"
field_group_name = "rdc_prometheus_plugin_fieldgroup"
if rdc_unauth:
@@ -32,31 +34,7 @@ class PrometheusReader(RdcReader):
REGISTRY.unregister(PROCESS_COLLECTOR)
REGISTRY.unregister(PLATFORM_COLLECTOR)
# Use the PCI id as gpu_index if enabled
self.enable_pci_id = False
if enable_pci_id == True:
try:
import sys, os
# Relative path of amd_smi to map gpu index to PCI id
# change smi_lib_path if the amd_smi is installed in different folder
smi_lib_relative_path = "../../bin"
smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path)
if os.path.exists(smi_lib_path+"/amd_smi.py"):
sys.path.append(smi_lib_path)
from amd_smi import getBus, initializeRsmi
initializeRsmi()
# Map between gpu indexes and PCIe bus addresses
self.index_to_bus_addr = {}
for item in self.gpu_indexes:
self.index_to_bus_addr[item] = getBus(item)
self.enable_pci_id = True
else:
print("cannot find smi_lib to map the PCI id")
except Exception as error:
print("Fail to get the PCI id", error)
# Create the guages
# Create the gauges
self.guages = {}
for fid in self.field_ids:
field_name = self.rdc_util.field_id_string(fid).lower()
@@ -64,8 +42,6 @@ class PrometheusReader(RdcReader):
def handle_field(self, gpu_index, value):
gpu_label = gpu_index
if self.enable_pci_id:
gpu_label = self.index_to_bus_addr[gpu_index]
if value.field_id.value in self.guages:
self.guages[value.field_id.value].labels(gpu_label).set(value.value.l_int)
@@ -107,7 +83,6 @@ if __name__ == '__main__':
parser.add_argument('--rdc_fields', default=None, nargs='+', help='The list of fields name needs to be watched, for example, " --rdc_fields RDC_FI_GPU_TEMP RDC_FI_POWER_USAGE " (default: predefined fields in the plugin)')
parser.add_argument('--rdc_fields_file', default=None, help='The list of fields name can also be read from a file with each field name in a separated line (default: None)')
parser.add_argument('--rdc_gpu_indexes', default=None, nargs='+', help='The list of GPUs to be watched (default: All GPUs)')
parser.add_argument('--enable_pci_id', default=False, action='store_true', help = 'Use the PCI Device Identifier to identify GPU (default: false)')
parser.add_argument('--enable_plugin_monitoring', default=False, action='store_true', help = 'Set this option to collect process metrics of the plugin itself (default: false)')
args = parser.parse_args()
@@ -122,7 +97,7 @@ if __name__ == '__main__':
reader = PrometheusReader(rdc_ip_port, field_ids, args.rdc_update_freq*1000000,
args.rdc_max_keep_age, args.rdc_max_keep_samples,
args.rdc_gpu_indexes, args.rdc_unauth, args.enable_pci_id, args.enable_plugin_monitoring)
args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring)
start_http_server(args.listen_port)
print("The RDC Prometheus plugin listen at port %d" % (args.listen_port))
time.sleep(3)