Update python_interface and remove --enable_pci_id
Change-Id: Ie5d511f3da25221bf60bc669ab172323703a1c45
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: bbe0b3573c]
Cette révision appartient à :
révisé par
Dmitrii Galantsev
Parent
98ba530267
révision
3dd90a6ff2
@@ -16,13 +16,13 @@ default_unit_coverter = {
|
||||
rdc_field_t.RDC_FI_GPU_MEMORY_TOTAL: 0.000001, # MegaBytes
|
||||
rdc_field_t.RDC_FI_POWER_USAGE: 0.000001, # Watts
|
||||
rdc_field_t.RDC_FI_GPU_CLOCK: 0.000001, # MHz
|
||||
rdc_field_t.RDC_FI_GPU_TEMP: 0.001 # degree
|
||||
rdc_field_t.RDC_FI_GPU_TEMP: 0.001, # degree
|
||||
}
|
||||
|
||||
class RdcReader:
|
||||
# To run RDC in embedded mode, set ip_port = None
|
||||
def __init__(self, ip_port = "localhost:50051", field_ids = default_field_ids,
|
||||
unit_converter = default_unit_coverter,
|
||||
unit_converter: dict[int, float] = default_unit_coverter,
|
||||
update_freq = 10000000, max_keep_age = 3600.0 , max_keep_samples = 1000,
|
||||
field_group_name = "rdc_reader_field_group", gpu_group_name = "rdc_reader_gpu_group",
|
||||
gpu_indexes = None, root_ca = "/etc/rdc/client/certs/rdc_cacert.pem",
|
||||
@@ -98,7 +98,10 @@ class RdcReader:
|
||||
if value.type.value == rdc_field_type_t.INTEGER:
|
||||
value.value.l_int = int(value.value.l_int * self.unit_converter[fid])
|
||||
if value.type.value == rdc_field_type_t.DOUBLE:
|
||||
value.value.dbl = value.value.l_int * self.unit_converter[fid]
|
||||
value.value.dbl = int(value.value.dbl * self.unit_converter[fid])
|
||||
# convert from double to l_int
|
||||
if value.type.value == rdc_field_type_t.DOUBLE:
|
||||
value.value.l_int = int(value.value.dbl)
|
||||
self.handle_field(gindex, value)
|
||||
has_succeed = True
|
||||
|
||||
|
||||
@@ -132,6 +132,16 @@ class rdc_field_t(c_int):
|
||||
RDC_FI_XGMI_5_WRITE_KB = 713
|
||||
RDC_FI_XGMI_6_WRITE_KB = 714
|
||||
RDC_FI_XGMI_7_WRITE_KB = 715
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU = 801
|
||||
RDC_FI_PROF_ACTIVE_CYCLES = 802
|
||||
RDC_FI_PROF_ACTIVE_WAVES = 803
|
||||
RDC_FI_PROF_ELAPSED_CYCLES = 804
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW = 805
|
||||
RDC_FI_PROF_EVAL_MEM_W_BW = 806
|
||||
RDC_FI_PROF_EVAL_FLOPS_16 = 807
|
||||
RDC_FI_PROF_EVAL_FLOPS_32 = 808
|
||||
RDC_FI_PROF_EVAL_FLOPS_64 = 809
|
||||
RDC_EVNT_XGMI_0_NOP_TX = 1000
|
||||
RDC_EVNT_XGMI_0_REQ_TX = 1001
|
||||
RDC_EVNT_XGMI_0_RESP_TX = 1002
|
||||
|
||||
@@ -10,12 +10,14 @@ default_field_ids = [
|
||||
rdc_field_t.RDC_FI_POWER_USAGE,
|
||||
rdc_field_t.RDC_FI_GPU_CLOCK,
|
||||
rdc_field_t.RDC_FI_GPU_UTIL,
|
||||
rdc_field_t.RDC_FI_GPU_TEMP
|
||||
rdc_field_t.RDC_FI_GPU_TEMP,
|
||||
rdc_field_t.RDC_FI_PROF_ACTIVE_CYCLES,
|
||||
rdc_field_t.RDC_FI_PROF_ACTIVE_WAVES,
|
||||
]
|
||||
|
||||
class PrometheusReader(RdcReader):
|
||||
def __init__(self, rdc_ip_port, field_ids, update_freq, max_keep_age, max_keep_samples,
|
||||
gpu_indexes, rdc_unauth, enable_pci_id, enable_plugin_monitoring):
|
||||
gpu_indexes, rdc_unauth, enable_plugin_monitoring):
|
||||
group_name = "rdc_prometheus_plugin_group"
|
||||
field_group_name = "rdc_prometheus_plugin_fieldgroup"
|
||||
if rdc_unauth:
|
||||
@@ -32,31 +34,7 @@ class PrometheusReader(RdcReader):
|
||||
REGISTRY.unregister(PROCESS_COLLECTOR)
|
||||
REGISTRY.unregister(PLATFORM_COLLECTOR)
|
||||
|
||||
# Use the PCI id as gpu_index if enabled
|
||||
self.enable_pci_id = False
|
||||
if enable_pci_id == True:
|
||||
try:
|
||||
import sys, os
|
||||
# Relative path of amd_smi to map gpu index to PCI id
|
||||
# change smi_lib_path if amd_smi is installed in a different folder
|
||||
smi_lib_relative_path = "../../bin"
|
||||
smi_lib_path = os.path.join(sys.path[0], smi_lib_relative_path)
|
||||
if os.path.exists(smi_lib_path+"/amd_smi.py"):
|
||||
sys.path.append(smi_lib_path)
|
||||
from amd_smi import getBus, initializeRsmi
|
||||
initializeRsmi()
|
||||
# Map between gpu indexes and PCIe bus addresses
|
||||
self.index_to_bus_addr = {}
|
||||
for item in self.gpu_indexes:
|
||||
self.index_to_bus_addr[item] = getBus(item)
|
||||
self.enable_pci_id = True
|
||||
else:
|
||||
print("cannot find smi_lib to map the PCI id")
|
||||
except Exception as error:
|
||||
print("Fail to get the PCI id", error)
|
||||
|
||||
|
||||
# Create the guages
|
||||
# Create the gauges
|
||||
self.guages = {}
|
||||
for fid in self.field_ids:
|
||||
field_name = self.rdc_util.field_id_string(fid).lower()
|
||||
@@ -64,8 +42,6 @@ class PrometheusReader(RdcReader):
|
||||
|
||||
def handle_field(self, gpu_index, value):
|
||||
gpu_label = gpu_index
|
||||
if self.enable_pci_id:
|
||||
gpu_label = self.index_to_bus_addr[gpu_index]
|
||||
if value.field_id.value in self.guages:
|
||||
self.guages[value.field_id.value].labels(gpu_label).set(value.value.l_int)
|
||||
|
||||
@@ -107,7 +83,6 @@ if __name__ == '__main__':
|
||||
parser.add_argument('--rdc_fields', default=None, nargs='+', help='The list of fields name needs to be watched, for example, " --rdc_fields RDC_FI_GPU_TEMP RDC_FI_POWER_USAGE " (default: predefined fields in the plugin)')
|
||||
parser.add_argument('--rdc_fields_file', default=None, help='The list of fields name can also be read from a file with each field name in a separated line (default: None)')
|
||||
parser.add_argument('--rdc_gpu_indexes', default=None, nargs='+', help='The list of GPUs to be watched (default: All GPUs)')
|
||||
parser.add_argument('--enable_pci_id', default=False, action='store_true', help = 'Use the PCI Device Identifier to identify GPU (default: false)')
|
||||
parser.add_argument('--enable_plugin_monitoring', default=False, action='store_true', help = 'Set this option to collect process metrics of the plugin itself (default: false)')
|
||||
|
||||
args = parser.parse_args()
|
||||
@@ -122,7 +97,7 @@ if __name__ == '__main__':
|
||||
|
||||
reader = PrometheusReader(rdc_ip_port, field_ids, args.rdc_update_freq*1000000,
|
||||
args.rdc_max_keep_age, args.rdc_max_keep_samples,
|
||||
args.rdc_gpu_indexes, args.rdc_unauth, args.enable_pci_id, args.enable_plugin_monitoring)
|
||||
args.rdc_gpu_indexes, args.rdc_unauth, args.enable_plugin_monitoring)
|
||||
start_http_server(args.listen_port)
|
||||
print("The RDC Prometheus plugin listen at port %d" % (args.listen_port))
|
||||
time.sleep(3)
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur