From 51de344be71ef9c8ae0edc28fb53ec6a306de319 Mon Sep 17 00:00:00 2001 From: "Galantsev, Dmitrii" Date: Thu, 27 Mar 2025 18:07:13 +0000 Subject: [PATCH] Profiler - Add CPC and CPF metrics Change-Id: I27fd725e9e1868c9afe7624d6e4aafad2a42d47e Signed-off-by: Galantsev, Dmitrii --- common/rdc_field.data | 41 ++++++++++++++++++-- include/rdc/rdc.h | 41 ++++++++++++++++++-- python_binding/rdc_bootstrap.py | 35 +++++++++++++++++ rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc | 36 +++++++++++++++++ 4 files changed, 147 insertions(+), 6 deletions(-) diff --git a/common/rdc_field.data b/common/rdc_field.data index 7dd40ed07a..f4eecac54a 100644 --- a/common/rdc_field.data +++ b/common/rdc_field.data @@ -133,21 +133,56 @@ FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false) FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false) FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false) -// metrics below are divided by time passed +// metrics with EVAL are divided by time passed FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false) -// metrics below are not divided by time passed FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", "VALU_UTILIZATION", false) FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false) FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false) FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false) -// metrics below are divided by time passed FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false) FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false) +// CPC +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_BUSY, "", "CPC_CPC_STAT_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_IDLE, "", "CPC_CPC_STAT_IDLE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_STAT_STALL, "", "CPC_CPC_STAT_STALL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_TCIU_BUSY, "", "CPC_CPC_TCIU_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_TCIU_IDLE, "", "CPC_CPC_TCIU_IDLE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY, "", "CPC_CPC_UTCL2IU_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE, "", "CPC_CPC_UTCL2IU_IDLE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL, "", "CPC_CPC_UTCL2IU_STALL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE, "", "CPC_ME1_BUSY_FOR_PACKET_DECODE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY, "", "CPC_ME1_DC0_SPI_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION, "", "CPC_UTCL1_STALL_ON_TRANSLATION", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ALWAYS_COUNT, "", "CPC_ALWAYS_COUNT", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL, "", "CPC_ADC_VALID_CHUNK_NOT_AVAIL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE, "", "CPC_ADC_DISPATCH_ALLOC_DONE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END, "", "CPC_ADC_VALID_CHUNK_END", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL, "", "CPC_SYNC_FIFO_FULL_LEVEL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_SYNC_FIFO_FULL, "", "CPC_SYNC_FIFO_FULL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_GD_BUSY, "", "CPC_GD_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_TG_SEND, "", "CPC_TG_SEND", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_WALK_NEXT_CHUNK, "", "CPC_WALK_NEXT_CHUNK", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI, "", "CPC_STALLED_BY_SE0_SPI", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI, "", "CPC_STALLED_BY_SE1_SPI", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI, "", "CPC_STALLED_BY_SE2_SPI", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI, "", "CPC_STALLED_BY_SE3_SPI", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_LTE_ALL, "", "CPC_LTE_ALL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY, "", "CPC_SYNC_WRREQ_FIFO_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CANE_BUSY, "", "CPC_CANE_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPC_CANE_STALL, "", "CPC_CANE_STALL", false) +// CPF +FLD_DESC_ENT(RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION, "", "CPF_CMP_UTCL1_STALL_ON_TRANSLATION", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_STAT_BUSY, "", "CPF_CPF_STAT_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_STAT_IDLE, "", "CPF_CPF_STAT_IDLE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_STAT_STALL, "", "CPF_CPF_STAT_STALL", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_BUSY, "", "CPF_CPF_TCIU_BUSY", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "", "CPF_CPF_TCIU_IDLE", false) +FLD_DESC_ENT(RDC_FI_PROF_CPF_CPF_TCIU_STALL, "", "CPF_CPF_TCIU_STALL", false) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/include/rdc/rdc.h b/include/rdc/rdc.h index a5625dd61b..d22a0bc2f1 100644 --- a/include/rdc/rdc.h +++ b/include/rdc/rdc.h @@ -290,21 +290,56 @@ typedef enum { RDC_FI_PROF_ELAPSED_CYCLES, RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, RDC_FI_PROF_GPU_UTIL_PERCENT, - // metrics below are divided by time passed + // metrics with EVAL are divided by time passed RDC_FI_PROF_EVAL_MEM_R_BW, RDC_FI_PROF_EVAL_MEM_W_BW, RDC_FI_PROF_EVAL_FLOPS_16, RDC_FI_PROF_EVAL_FLOPS_32, RDC_FI_PROF_EVAL_FLOPS_64, - // metrics below are not divided by time passed RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, RDC_FI_PROF_SM_ACTIVE, RDC_FI_PROF_OCC_PER_ACTIVE_CU, RDC_FI_PROF_OCC_ELAPSED, - // metrics below are divided by time passed RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, + // CPC + RDC_FI_PROF_CPC_CPC_STAT_BUSY, + RDC_FI_PROF_CPC_CPC_STAT_IDLE, + RDC_FI_PROF_CPC_CPC_STAT_STALL, + RDC_FI_PROF_CPC_CPC_TCIU_BUSY, + RDC_FI_PROF_CPC_CPC_TCIU_IDLE, + RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY, + RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE, + RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL, + RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE, + RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY, + RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION, + RDC_FI_PROF_CPC_ALWAYS_COUNT, + RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL, + RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE, + RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END, + RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL, + RDC_FI_PROF_CPC_SYNC_FIFO_FULL, + RDC_FI_PROF_CPC_GD_BUSY, + RDC_FI_PROF_CPC_TG_SEND, + RDC_FI_PROF_CPC_WALK_NEXT_CHUNK, + RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI, + RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI, + RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI, + RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI, + RDC_FI_PROF_CPC_LTE_ALL, + RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY, + RDC_FI_PROF_CPC_CANE_BUSY, + RDC_FI_PROF_CPC_CANE_STALL, + // CPF + RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION, + RDC_FI_PROF_CPF_CPF_STAT_BUSY, + RDC_FI_PROF_CPF_CPF_STAT_IDLE, + RDC_FI_PROF_CPF_CPF_STAT_STALL, + RDC_FI_PROF_CPF_CPF_TCIU_BUSY, + RDC_FI_PROF_CPF_CPF_TCIU_IDLE, + RDC_FI_PROF_CPF_CPF_TCIU_STALL, /** * @brief Raw XGMI counter events diff --git a/python_binding/rdc_bootstrap.py b/python_binding/rdc_bootstrap.py index 2f74afbf1b..514bf54cf8 100644 --- a/python_binding/rdc_bootstrap.py +++ b/python_binding/rdc_bootstrap.py @@ -177,6 +177,41 @@ class rdc_field_t(c_int): RDC_FI_PROF_EVAL_FLOPS_16_PERCENT = 815 RDC_FI_PROF_EVAL_FLOPS_32_PERCENT = 816 RDC_FI_PROF_EVAL_FLOPS_64_PERCENT = 817 + RDC_FI_PROF_CPC_CPC_STAT_BUSY = 818 + RDC_FI_PROF_CPC_CPC_STAT_IDLE = 819 + RDC_FI_PROF_CPC_CPC_STAT_STALL = 820 + RDC_FI_PROF_CPC_CPC_TCIU_BUSY = 821 + RDC_FI_PROF_CPC_CPC_TCIU_IDLE = 822 + RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY = 823 + RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE = 824 + RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL = 825 + RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE = 826 + RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY = 827 + RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION = 828 + RDC_FI_PROF_CPC_ALWAYS_COUNT = 829 + RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL = 830 + RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE = 831 + RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END = 832 + RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL = 833 + RDC_FI_PROF_CPC_SYNC_FIFO_FULL = 834 + RDC_FI_PROF_CPC_GD_BUSY = 835 + RDC_FI_PROF_CPC_TG_SEND = 836 + RDC_FI_PROF_CPC_WALK_NEXT_CHUNK = 837 + RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI = 838 + RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI = 839 + RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI = 840 + RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI = 841 + RDC_FI_PROF_CPC_LTE_ALL = 842 + RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY = 843 + RDC_FI_PROF_CPC_CANE_BUSY = 844 + RDC_FI_PROF_CPC_CANE_STALL = 845 + RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION = 846 + RDC_FI_PROF_CPF_CPF_STAT_BUSY = 847 + RDC_FI_PROF_CPF_CPF_STAT_IDLE = 848 + RDC_FI_PROF_CPF_CPF_STAT_STALL = 849 + RDC_FI_PROF_CPF_CPF_TCIU_BUSY = 850 + RDC_FI_PROF_CPF_CPF_TCIU_IDLE = 851 + RDC_FI_PROF_CPF_CPF_TCIU_STALL = 852 RDC_EVNT_XGMI_0_NOP_TX = 1000 RDC_EVNT_XGMI_0_REQ_TX = 1001 RDC_EVNT_XGMI_0_RESP_TX = 1002 diff --git a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index a0ca1c0d0f..3e1e31b0bc 100644 --- a/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -126,6 +126,42 @@ RdcRocpBase::RdcRocpBase() { {RDC_FI_PROF_SM_ACTIVE, "VALUBusy"}, {RDC_FI_PROF_OCC_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"}, // RDC_FI_PROF_OCC_ELAPSED is derived from OCC_PER_ACTIVE_CU and ACTIVE_CYCLES + {RDC_FI_PROF_CPC_CPC_STAT_BUSY, "CPC_CPC_STAT_BUSY"}, + {RDC_FI_PROF_CPC_CPC_STAT_IDLE, "CPC_CPC_STAT_IDLE"}, + {RDC_FI_PROF_CPC_CPC_STAT_STALL, "CPC_CPC_STAT_STALL"}, + {RDC_FI_PROF_CPC_CPC_TCIU_BUSY, "CPC_CPC_TCIU_BUSY"}, + {RDC_FI_PROF_CPC_CPC_TCIU_IDLE, "CPC_CPC_TCIU_IDLE"}, + {RDC_FI_PROF_CPC_CPC_UTCL2IU_BUSY, "CPC_CPC_UTCL2IU_BUSY"}, + {RDC_FI_PROF_CPC_CPC_UTCL2IU_IDLE, "CPC_CPC_UTCL2IU_IDLE"}, + {RDC_FI_PROF_CPC_CPC_UTCL2IU_STALL, "CPC_CPC_UTCL2IU_STALL"}, + {RDC_FI_PROF_CPC_ME1_BUSY_FOR_PACKET_DECODE, "CPC_ME1_BUSY_FOR_PACKET_DECODE"}, + {RDC_FI_PROF_CPC_ME1_DC0_SPI_BUSY, "CPC_ME1_DC0_SPI_BUSY"}, + {RDC_FI_PROF_CPC_UTCL1_STALL_ON_TRANSLATION, "CPC_UTCL1_STALL_ON_TRANSLATION"}, + {RDC_FI_PROF_CPC_ALWAYS_COUNT, "CPC_ALWAYS_COUNT"}, + {RDC_FI_PROF_CPC_ADC_VALID_CHUNK_NOT_AVAIL, "CPC_ADC_VALID_CHUNK_NOT_AVAIL"}, + {RDC_FI_PROF_CPC_ADC_DISPATCH_ALLOC_DONE, "CPC_ADC_DISPATCH_ALLOC_DONE"}, + {RDC_FI_PROF_CPC_ADC_VALID_CHUNK_END, "CPC_ADC_VALID_CHUNK_END"}, + {RDC_FI_PROF_CPC_SYNC_FIFO_FULL_LEVEL, "CPC_SYNC_FIFO_FULL_LEVEL"}, + {RDC_FI_PROF_CPC_SYNC_FIFO_FULL, "CPC_SYNC_FIFO_FULL"}, + {RDC_FI_PROF_CPC_GD_BUSY, "CPC_GD_BUSY"}, + {RDC_FI_PROF_CPC_TG_SEND, "CPC_TG_SEND"}, + {RDC_FI_PROF_CPC_WALK_NEXT_CHUNK, "CPC_WALK_NEXT_CHUNK"}, + {RDC_FI_PROF_CPC_STALLED_BY_SE0_SPI, "CPC_STALLED_BY_SE0_SPI"}, + {RDC_FI_PROF_CPC_STALLED_BY_SE1_SPI, "CPC_STALLED_BY_SE1_SPI"}, + {RDC_FI_PROF_CPC_STALLED_BY_SE2_SPI, "CPC_STALLED_BY_SE2_SPI"}, + {RDC_FI_PROF_CPC_STALLED_BY_SE3_SPI, "CPC_STALLED_BY_SE3_SPI"}, + {RDC_FI_PROF_CPC_LTE_ALL, "CPC_LTE_ALL"}, + {RDC_FI_PROF_CPC_SYNC_WRREQ_FIFO_BUSY, "CPC_SYNC_WRREQ_FIFO_BUSY"}, + {RDC_FI_PROF_CPC_CANE_BUSY, "CPC_CANE_BUSY"}, + {RDC_FI_PROF_CPC_CANE_STALL, "CPC_CANE_STALL"}, + {RDC_FI_PROF_CPF_CMP_UTCL1_STALL_ON_TRANSLATION, "CPF_CMP_UTCL1_STALL_ON_TRANSLATION"}, + {RDC_FI_PROF_CPF_CPF_STAT_BUSY, "CPF_CPF_STAT_BUSY"}, + {RDC_FI_PROF_CPF_CPF_STAT_IDLE, "CPF_CPF_STAT_IDLE"}, + {RDC_FI_PROF_CPF_CPF_STAT_STALL, "CPF_CPF_STAT_STALL"}, + {RDC_FI_PROF_CPF_CPF_TCIU_BUSY, "CPF_CPF_TCIU_BUSY"}, + {RDC_FI_PROF_CPF_CPF_TCIU_IDLE, "CPF_CPF_TCIU_IDLE"}, + {RDC_FI_PROF_CPF_CPF_TCIU_STALL, "CPF_CPF_TCIU_STALL"}, + }; hsa_status_t status = hsa_init();