diff --git a/common/rdc_field.data b/common/rdc_field.data index 49998ecf7b..58756bea9f 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -43,6 +43,8 @@ FLD_DESC_ENT(RDC_FI_GPU_TEMP, "GPU temperature in millidegrees Celsiu FLD_DESC_ENT(RDC_FI_POWER_USAGE, "Power usage in microwatts", "POWER_USAGE", true) FLD_DESC_ENT(RDC_FI_PCIE_TX, "PCIe Tx utilization in bytes/second", "PCIE_TX", true) FLD_DESC_ENT(RDC_FI_PCIE_RX, "PCIe Rx utilization in bytes/second", "PCIE_RX", true) +FLD_DESC_ENT(RDC_FI_PCIE_BANDWIDTH, "PCIe bandwidth in GB/sec", "PCIE_BANDWIDTH", true) + FLD_DESC_ENT(RDC_FI_GPU_UTIL, "GPU busy percentage", "GPU_UTIL", true) FLD_DESC_ENT(RDC_FI_GPU_MEMORY_USAGE, "Memory usage of the GPU instance in bytes", "GPU_MEMORY_USAGE", true) FLD_DESC_ENT(RDC_FI_GPU_MEMORY_TOTAL, "Total memory of the GPU instance", "GPU_MEMORY_TOTAL", true) @@ -76,6 +78,25 @@ FLD_DESC_ENT(RDC_FI_ECC_FUSE_SEC, "FUSE Single Error Correction", FLD_DESC_ENT(RDC_FI_ECC_FUSE_DED, "FUSE Double Error Detection", "ECC_FUSE_DED", true) FLD_DESC_ENT(RDC_FI_ECC_UMC_SEC, "UMC Single Error Correction", "ECC_UMC_SEC", true) FLD_DESC_ENT(RDC_FI_ECC_UMC_DED, "UMC Double Error Detection", "ECC_UMC_DED", true) +FLD_DESC_ENT(RDC_FI_XGMI_0_READ_KB, "XGMI0 accumulated data read size (KB)", "XGMI_0_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_1_READ_KB, "XGMI1 accumulated data read size (KB)", "XGMI_1_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_2_READ_KB, "XGMI2 accumulated data read size (KB)", "XGMI_2_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_3_READ_KB, "XGMI3 accumulated data read size (KB)", "XGMI_3_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_4_READ_KB, "XGMI4 accumulated data read size (KB)", "XGMI_4_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_5_READ_KB, "XGMI5 accumulated data read size (KB)", "XGMI_5_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_6_READ_KB, "XGMI6 accumulated data read size (KB)", "XGMI_6_READ", true) +FLD_DESC_ENT(RDC_FI_XGMI_7_READ_KB, "XGMI7 accumulated data read size (KB)", "XGMI_7_READ", true) + 
+FLD_DESC_ENT(RDC_FI_XGMI_0_WRITE_KB, "XGMI0 accumulated data write size (KB)", "XGMI_0_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_1_WRITE_KB, "XGMI1 accumulated data write size (KB)", "XGMI_1_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_2_WRITE_KB, "XGMI2 accumulated data write size (KB)", "XGMI_2_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_3_WRITE_KB, "XGMI3 accumulated data write size (KB)", "XGMI_3_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_4_WRITE_KB, "XGMI4 accumulated data write size (KB)", "XGMI_4_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_5_WRITE_KB, "XGMI5 accumulated data write size (KB)", "XGMI_5_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_6_WRITE_KB, "XGMI6 accumulated data write size (KB)", "XGMI_6_WRITE", true) +FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)", "XGMI_7_WRITE", true) + + // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index e25cf901ef..882d360299 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -168,6 +168,9 @@ typedef enum { */ RDC_FI_PCIE_TX = 400, //!< PCIe Tx utilization information RDC_FI_PCIE_RX, //!< PCIe Rx utilization information + // RDC_FI_PCIE_TX and RDC_FI_PCIE_RX are not supported on newer ASICs; + // use RDC_FI_PCIE_BANDWIDTH instead. + RDC_FI_PCIE_BANDWIDTH, //!< PCIe bandwidth in GB/sec /* * @brief GPU usage related fields @@ -224,6 +227,26 @@ typedef enum { RDC_FI_ECC_UMC_SEC, //!< UMC Single Error Correction RDC_FI_ECC_UMC_DED, //!< UMC Double Error Detection + // On newer ASICs, such as MI300, the XGMI events are not supported. + // Use the XGMI fields below to calculate the bandwidth. 
+ RDC_FI_XGMI_0_READ_KB = 700, //!< XGMI_0 accumulated data read size (KB) + RDC_FI_XGMI_1_READ_KB, //!< XGMI_1 accumulated data read size (KB) + RDC_FI_XGMI_2_READ_KB, //!< XGMI_2 accumulated data read size (KB) + RDC_FI_XGMI_3_READ_KB, //!< XGMI_3 accumulated data read size (KB) + RDC_FI_XGMI_4_READ_KB, //!< XGMI_4 accumulated data read size (KB) + RDC_FI_XGMI_5_READ_KB, //!< XGMI_5 accumulated data read size (KB) + RDC_FI_XGMI_6_READ_KB, //!< XGMI_6 accumulated data read size (KB) + RDC_FI_XGMI_7_READ_KB, //!< XGMI_7 accumulated data read size (KB) + + RDC_FI_XGMI_0_WRITE_KB, //!< XGMI_0 accumulated data write size (KB) + RDC_FI_XGMI_1_WRITE_KB, //!< XGMI_1 accumulated data write size (KB) + RDC_FI_XGMI_2_WRITE_KB, //!< XGMI_2 accumulated data write size (KB) + RDC_FI_XGMI_3_WRITE_KB, //!< XGMI_3 accumulated data write size (KB) + RDC_FI_XGMI_4_WRITE_KB, //!< XGMI_4 accumulated data write size (KB) + RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB) + RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB) + RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB) + /* * @brief Raw XGMI counter events */ diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index e6d9fca8fd..1fc2fea9fd 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -82,6 +82,7 @@ class rdc_field_t(c_int): RDC_FI_POWER_USAGE = 300 RDC_FI_PCIE_TX = 400 RDC_FI_PCIE_RX = 401 + RDC_FI_PCIE_BANDWIDTH = 402 RDC_FI_GPU_UTIL = 500 RDC_FI_GPU_MEMORY_USAGE = 501 RDC_FI_GPU_MEMORY_TOTAL = 502 @@ -115,6 +116,22 @@ class rdc_field_t(c_int): RDC_FI_ECC_FUSE_DED = 627 RDC_FI_ECC_UMC_SEC = 628 RDC_FI_ECC_UMC_DED = 629 + RDC_FI_XGMI_0_READ_KB = 700 + RDC_FI_XGMI_1_READ_KB = 701 + RDC_FI_XGMI_2_READ_KB = 702 + RDC_FI_XGMI_3_READ_KB = 703 + RDC_FI_XGMI_4_READ_KB = 704 + RDC_FI_XGMI_5_READ_KB = 705 + RDC_FI_XGMI_6_READ_KB = 706 + RDC_FI_XGMI_7_READ_KB = 707 + RDC_FI_XGMI_0_WRITE_KB = 708 + 
RDC_FI_XGMI_1_WRITE_KB = 709 + RDC_FI_XGMI_2_WRITE_KB = 710 + RDC_FI_XGMI_3_WRITE_KB = 711 + RDC_FI_XGMI_4_WRITE_KB = 712 + RDC_FI_XGMI_5_WRITE_KB = 713 + RDC_FI_XGMI_6_WRITE_KB = 714 + RDC_FI_XGMI_7_WRITE_KB = 715 RDC_EVNT_XGMI_0_NOP_TX = 1000 RDC_EVNT_XGMI_0_REQ_TX = 1001 RDC_EVNT_XGMI_0_RESP_TX = 1002 diff --git a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc index 74e519a64c..560ded900a 100644 --- a/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc +++ b/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc @@ -359,6 +359,50 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field value->type = INTEGER; }; + auto read_gpu_metrics_uint64_t = [&](void) { // Serve field_id from a single gpu_metrics read + amdsmi_gpu_metrics_t gpu_metrics; + value->status = amdsmi_get_gpu_metrics_info(processor_handle, &gpu_metrics); + RDC_LOG(RDC_DEBUG, "Read the gpu metrics:" << value->status); + if (value->status != AMDSMI_STATUS_SUCCESS) { + return; + } + + const std::unordered_map<rdc_field_t, uint64_t> rdc_field_to_gpu_metrics = { + {RDC_FI_XGMI_0_READ_KB, gpu_metrics.xgmi_read_data_acc[0]}, + {RDC_FI_XGMI_1_READ_KB, gpu_metrics.xgmi_read_data_acc[1]}, + {RDC_FI_XGMI_2_READ_KB, gpu_metrics.xgmi_read_data_acc[2]}, + {RDC_FI_XGMI_3_READ_KB, gpu_metrics.xgmi_read_data_acc[3]}, + {RDC_FI_XGMI_4_READ_KB, gpu_metrics.xgmi_read_data_acc[4]}, + {RDC_FI_XGMI_5_READ_KB, gpu_metrics.xgmi_read_data_acc[5]}, + {RDC_FI_XGMI_6_READ_KB, gpu_metrics.xgmi_read_data_acc[6]}, + {RDC_FI_XGMI_7_READ_KB, gpu_metrics.xgmi_read_data_acc[7]}, + {RDC_FI_XGMI_0_WRITE_KB, gpu_metrics.xgmi_write_data_acc[0]}, + {RDC_FI_XGMI_1_WRITE_KB, gpu_metrics.xgmi_write_data_acc[1]}, + {RDC_FI_XGMI_2_WRITE_KB, gpu_metrics.xgmi_write_data_acc[2]}, + {RDC_FI_XGMI_3_WRITE_KB, gpu_metrics.xgmi_write_data_acc[3]}, + {RDC_FI_XGMI_4_WRITE_KB, gpu_metrics.xgmi_write_data_acc[4]}, + {RDC_FI_XGMI_5_WRITE_KB, gpu_metrics.xgmi_write_data_acc[5]}, + {RDC_FI_XGMI_6_WRITE_KB, gpu_metrics.xgmi_write_data_acc[6]}, + {RDC_FI_XGMI_7_WRITE_KB, 
gpu_metrics.xgmi_write_data_acc[7]}, + {RDC_FI_PCIE_BANDWIDTH, gpu_metrics.pcie_bandwidth_inst}, + }; + + // In gpu_metrics, the maximum uint64_t value means the metric is not supported + const auto not_supported_metrics_data = std::numeric_limits<uint64_t>::max(); + auto gpu_metrics_value_ite = rdc_field_to_gpu_metrics.find(field_id); + if (gpu_metrics_value_ite != rdc_field_to_gpu_metrics.end()) { + if (gpu_metrics_value_ite->second != not_supported_metrics_data) { + value->value.l_int = gpu_metrics_value_ite->second; + value->type = INTEGER; + return; + } else { + RDC_LOG(RDC_DEBUG, "The gpu metrics return max value which indicate not supported:" + << gpu_metrics_value_ite->second); + } + } + value->status = AMDSMI_STATUS_NOT_SUPPORTED; + }; + switch (field_id) { case RDC_FI_GPU_MEMORY_USAGE: { uint64_t u64 = 0; @@ -495,6 +539,25 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field } break; } + case RDC_FI_XGMI_0_READ_KB: + case RDC_FI_XGMI_1_READ_KB: + case RDC_FI_XGMI_2_READ_KB: + case RDC_FI_XGMI_3_READ_KB: + case RDC_FI_XGMI_4_READ_KB: + case RDC_FI_XGMI_5_READ_KB: + case RDC_FI_XGMI_6_READ_KB: + case RDC_FI_XGMI_7_READ_KB: + case RDC_FI_XGMI_0_WRITE_KB: + case RDC_FI_XGMI_1_WRITE_KB: + case RDC_FI_XGMI_2_WRITE_KB: + case RDC_FI_XGMI_3_WRITE_KB: + case RDC_FI_XGMI_4_WRITE_KB: + case RDC_FI_XGMI_5_WRITE_KB: + case RDC_FI_XGMI_6_WRITE_KB: + case RDC_FI_XGMI_7_WRITE_KB: + case RDC_FI_PCIE_BANDWIDTH: + read_gpu_metrics_uint64_t(); + break; default: break; diff --git a/rdc_libs/rdc/src/RdcSmiLib.cc b/rdc_libs/rdc/src/RdcSmiLib.cc index d6c9861440..8ede43e270 100644 --- a/rdc_libs/rdc/src/RdcSmiLib.cc +++ b/rdc_libs/rdc/src/RdcSmiLib.cc @@ -152,9 +152,18 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI RDC_FI_GPU_CLOCK, RDC_FI_MEM_CLOCK, RDC_FI_MEMORY_TEMP, RDC_FI_GPU_TEMP, RDC_FI_POWER_USAGE, RDC_FI_PCIE_TX, - RDC_FI_PCIE_RX, RDC_FI_GPU_UTIL, + RDC_FI_PCIE_RX, RDC_FI_PCIE_BANDWIDTH, + RDC_FI_GPU_UTIL, RDC_FI_GPU_MEMORY_USAGE, 
RDC_FI_GPU_MEMORY_TOTAL, RDC_FI_ECC_CORRECT_TOTAL, RDC_FI_ECC_UNCORRECT_TOTAL, + RDC_FI_XGMI_0_READ_KB, RDC_FI_XGMI_1_READ_KB, + RDC_FI_XGMI_2_READ_KB, RDC_FI_XGMI_3_READ_KB, + RDC_FI_XGMI_4_READ_KB, RDC_FI_XGMI_5_READ_KB, + RDC_FI_XGMI_6_READ_KB, RDC_FI_XGMI_7_READ_KB, + RDC_FI_XGMI_0_WRITE_KB, RDC_FI_XGMI_1_WRITE_KB, + RDC_FI_XGMI_2_WRITE_KB, RDC_FI_XGMI_3_WRITE_KB, + RDC_FI_XGMI_4_WRITE_KB, RDC_FI_XGMI_5_WRITE_KB, + RDC_FI_XGMI_6_WRITE_KB, RDC_FI_XGMI_7_WRITE_KB, RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, RDC_EVNT_XGMI_0_RESP_TX, RDC_EVNT_XGMI_0_BEATS_TX, RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX,