diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data
index 53bf261030..17b0e6337d 100644
--- a/projects/rdc/common/rdc_field.data
+++ b/projects/rdc/common/rdc_field.data
@@ -103,6 +103,7 @@ FLD_DESC_ENT(RDC_FI_XGMI_4_READ_KB, "XGMI4 accumulated data read size (KB)"
 FLD_DESC_ENT(RDC_FI_XGMI_5_READ_KB, "XGMI5 accumulated data read size (KB)", "XGMI_5_READ", true)
 FLD_DESC_ENT(RDC_FI_XGMI_6_READ_KB, "XGMI6 accumulated data read size (KB)", "XGMI_6_READ", true)
 FLD_DESC_ENT(RDC_FI_XGMI_7_READ_KB, "XGMI7 accumulated data read size (KB)", "XGMI_7_READ", true)
+FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_READ_KB, "XGMI accumulated data read size across all lanes (KB)", "XGMI_TOTAL_READ", true)
 
 FLD_DESC_ENT(RDC_FI_XGMI_0_WRITE_KB, "XGMI0 accumulated data write size (KB)", "XGMI_0_WRITE", true)
 FLD_DESC_ENT(RDC_FI_XGMI_1_WRITE_KB, "XGMI1 accumulated data write size (KB)", "XGMI_1_WRITE", true)
@@ -112,7 +113,7 @@ FLD_DESC_ENT(RDC_FI_XGMI_4_WRITE_KB, "XGMI4 accumulated data write size (KB)
 FLD_DESC_ENT(RDC_FI_XGMI_5_WRITE_KB, "XGMI5 accumulated data write size (KB)", "XGMI_5_WRITE", true)
 FLD_DESC_ENT(RDC_FI_XGMI_6_WRITE_KB, "XGMI6 accumulated data write size (KB)", "XGMI_6_WRITE", true)
 FLD_DESC_ENT(RDC_FI_XGMI_7_WRITE_KB, "XGMI7 accumulated data write size (KB)", "XGMI_7_WRITE", true)
-
+FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumulated data write size across all lanes (KB)", "XGMI_TOTAL_WRITE", true)
 
 // ROCProfiler fields
 
diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h
index f9cde1b788..60a52c10e5 100644
--- a/projects/rdc/include/rdc/rdc.h
+++ b/projects/rdc/include/rdc/rdc.h
@@ -252,6 +252,8 @@ typedef enum {
   RDC_FI_XGMI_5_WRITE_KB, //!< XGMI_5 accumulated data write size (KB)
   RDC_FI_XGMI_6_WRITE_KB, //!< XGMI_6 accumulated data write size (KB)
   RDC_FI_XGMI_7_WRITE_KB, //!< XGMI_7 accumulated data write size (KB)
+  RDC_FI_XGMI_TOTAL_READ_KB, //!< XGMI accumulated data read size, total across links (KB)
+  RDC_FI_XGMI_TOTAL_WRITE_KB, //!< XGMI accumulated data write size, total across links (KB)
 
   /**
    * @brief ROC-profiler related fields
diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py
index de0a982516..37ba7e53ba 100644
--- a/projects/rdc/python_binding/rdc_bootstrap.py
+++ b/projects/rdc/python_binding/rdc_bootstrap.py
@@ -133,6 +133,8 @@ class rdc_field_t(c_int):
     RDC_FI_XGMI_5_WRITE_KB = 713
     RDC_FI_XGMI_6_WRITE_KB = 714
     RDC_FI_XGMI_7_WRITE_KB = 715
+    RDC_FI_XGMI_TOTAL_READ_KB = 716
+    RDC_FI_XGMI_TOTAL_WRITE_KB = 717
     RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800
     RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU = 801
     RDC_FI_PROF_ACTIVE_CYCLES = 802
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
index c2a85ec820..da26294690 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcMetricFetcherImpl.cc
@@ -431,6 +431,36 @@ rdc_status_t RdcMetricFetcherImpl::bulk_fetch_smi_fields(
 
 constexpr double kGig = 1000000000.0;
 
+// Adds up the per-link XGMI accumulators in `acc`, which must hold
+// AMDSMI_MAX_NUM_XGMI_LINKS entries. Links whose counter equals the uint64_t
+// maximum (used here as the "metric not supported" sentinel) are skipped.
+// The sentinel is returned only when every link is unsupported, so a supported
+// link that has moved no data yet yields a real total of 0 instead of being
+// reported as unsupported.
+static uint64_t sum_xgmi_links(const uint64_t* acc) {
+  constexpr uint64_t kNotSupported = std::numeric_limits<uint64_t>::max();
+  uint64_t total = 0;
+  bool any_supported = false;
+  for (int i = 0; i < AMDSMI_MAX_NUM_XGMI_LINKS; ++i) {
+    if (acc[i] == kNotSupported) {
+      continue;
+    }
+    any_supported = true;
+    total += acc[i];
+  }
+  return any_supported ? total : kNotSupported;
+}
+
+// Total of xgmi_read_data_acc over all supported XGMI links.
+static uint64_t sum_xgmi_read(const amdsmi_gpu_metrics_t& gpu_metrics) {
+  return sum_xgmi_links(gpu_metrics.xgmi_read_data_acc);
+}
+
+// Total of xgmi_write_data_acc over all supported XGMI links.
+static uint64_t sum_xgmi_write(const amdsmi_gpu_metrics_t& gpu_metrics) {
+  return sum_xgmi_links(gpu_metrics.xgmi_write_data_acc);
+}
+
 rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field_t field_id,
                                                    rdc_field_value* value) {
   if (!value) {
@@ -486,6 +516,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
       {RDC_FI_XGMI_5_READ_KB, gpu_metrics.xgmi_read_data_acc[5]},
       {RDC_FI_XGMI_6_READ_KB, gpu_metrics.xgmi_read_data_acc[6]},
       {RDC_FI_XGMI_7_READ_KB, gpu_metrics.xgmi_read_data_acc[7]},
+      {RDC_FI_XGMI_TOTAL_READ_KB, sum_xgmi_read(gpu_metrics)},
       {RDC_FI_XGMI_0_WRITE_KB, gpu_metrics.xgmi_write_data_acc[0]},
       {RDC_FI_XGMI_1_WRITE_KB, gpu_metrics.xgmi_write_data_acc[1]},
       {RDC_FI_XGMI_2_WRITE_KB, gpu_metrics.xgmi_write_data_acc[2]},
@@ -494,6 +525,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
       {RDC_FI_XGMI_5_WRITE_KB, gpu_metrics.xgmi_write_data_acc[5]},
       {RDC_FI_XGMI_6_WRITE_KB, gpu_metrics.xgmi_write_data_acc[6]},
       {RDC_FI_XGMI_7_WRITE_KB, gpu_metrics.xgmi_write_data_acc[7]},
+      {RDC_FI_XGMI_TOTAL_WRITE_KB, sum_xgmi_write(gpu_metrics)},
       {RDC_FI_PCIE_BANDWIDTH, gpu_metrics.pcie_bandwidth_inst},
   };
 
@@ -721,6 +753,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
     case RDC_FI_XGMI_5_READ_KB:
     case RDC_FI_XGMI_6_READ_KB:
     case RDC_FI_XGMI_7_READ_KB:
+    case RDC_FI_XGMI_TOTAL_READ_KB:
     case RDC_FI_XGMI_0_WRITE_KB:
     case RDC_FI_XGMI_1_WRITE_KB:
     case RDC_FI_XGMI_2_WRITE_KB:
@@ -729,6 +762,7 @@ rdc_status_t RdcMetricFetcherImpl::fetch_smi_field(uint32_t gpu_index, rdc_field
     case RDC_FI_XGMI_5_WRITE_KB:
     case RDC_FI_XGMI_6_WRITE_KB:
     case RDC_FI_XGMI_7_WRITE_KB:
+    case RDC_FI_XGMI_TOTAL_WRITE_KB:
     case RDC_FI_PCIE_BANDWIDTH:
       read_gpu_metrics_uint64_t();
       break;
diff --git a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
index 79891c71ef..4ca9ffbd22 100644
--- a/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
+++ b/projects/rdc/rdc_libs/rdc/src/RdcSmiLib.cc
@@ -169,9 +169,10 @@ rdc_status_t RdcSmiLib::rdc_telemetry_fields_query(uint32_t field_ids[MAX_NUM_FI
       RDC_FI_ECC_MPIO_CE, RDC_FI_ECC_MPIO_UE, RDC_FI_XGMI_0_READ_KB,
       RDC_FI_XGMI_1_READ_KB, RDC_FI_XGMI_2_READ_KB, RDC_FI_XGMI_3_READ_KB,
       RDC_FI_XGMI_4_READ_KB, RDC_FI_XGMI_5_READ_KB, RDC_FI_XGMI_6_READ_KB,
-      RDC_FI_XGMI_7_READ_KB, RDC_FI_XGMI_0_WRITE_KB, RDC_FI_XGMI_1_WRITE_KB,
-      RDC_FI_XGMI_2_WRITE_KB, RDC_FI_XGMI_3_WRITE_KB, RDC_FI_XGMI_4_WRITE_KB,
-      RDC_FI_XGMI_5_WRITE_KB, RDC_FI_XGMI_6_WRITE_KB, RDC_FI_XGMI_7_WRITE_KB,
+      RDC_FI_XGMI_7_READ_KB, RDC_FI_XGMI_TOTAL_READ_KB, RDC_FI_XGMI_0_WRITE_KB,
+      RDC_FI_XGMI_1_WRITE_KB, RDC_FI_XGMI_2_WRITE_KB, RDC_FI_XGMI_3_WRITE_KB,
+      RDC_FI_XGMI_4_WRITE_KB, RDC_FI_XGMI_5_WRITE_KB, RDC_FI_XGMI_6_WRITE_KB,
+      RDC_FI_XGMI_7_WRITE_KB, RDC_FI_XGMI_TOTAL_WRITE_KB,
       RDC_EVNT_XGMI_0_NOP_TX, RDC_EVNT_XGMI_0_REQ_TX, RDC_EVNT_XGMI_0_RESP_TX,
       RDC_EVNT_XGMI_0_BEATS_TX, RDC_EVNT_XGMI_1_NOP_TX, RDC_EVNT_XGMI_1_REQ_TX,
       RDC_EVNT_XGMI_1_RESP_TX, RDC_EVNT_XGMI_1_BEATS_TX, RDC_EVNT_XGMI_0_THRPUT,