From 0186fc2481f3f31145add47a70f908d511b620aa Mon Sep 17 00:00:00 2001 From: "Pryor, Adam" Date: Fri, 24 Jan 2025 10:07:32 -0600 Subject: [PATCH] SWDEV-508477 Eval Flops Percent (#85) SWDEV-508477 - Profiler add FP*_PERCENT Change-Id: Idb6250fe6b7ba3df6fe7d30861e0fbbda7e9bdce Signed-off-by: adapryor Signed-off-by: Galantsev, Dmitrii [ROCm/rdc commit: 6f358ddc9e7780c389f40134135bead9b8f97d62] --- projects/rdc/common/rdc_field.data | 4 ++ projects/rdc/include/rdc/rdc.h | 4 ++ projects/rdc/python_binding/rdc_bootstrap.py | 3 + .../rdc_modules/rdc_rocp/RdcRocpBase.cc | 60 ++++++++++++++----- 4 files changed, 55 insertions(+), 16 deletions(-) diff --git a/projects/rdc/common/rdc_field.data b/projects/rdc/common/rdc_field.data index be3510ac5b..7de8bc5d3f 100644 --- a/projects/rdc/common/rdc_field.data +++ b/projects/rdc/common/rdc_field.data @@ -144,6 +144,10 @@ FLD_DESC_ENT(RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "Percent of Active Pipe VALU", FLD_DESC_ENT(RDC_FI_PROF_SM_ACTIVE, "Ratio of Cycles with active warp on SM","VALUBusy", false) FLD_DESC_ENT(RDC_FI_PROF_OCC_PER_ACTIVE_CU, "Mean occ per active compute unit", "OCC_CU", false) FLD_DESC_ENT(RDC_FI_PROF_OCC_ELAPSED, "Mean occ per active cu over elapsed", "OCC_CU_ELAPSED", false) +// metrics below are divided by time passed +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "Number of fp16 OPS percent of max", "FLOPS_16_PERCENT", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "Number of fp32 OPS percent of max", "FLOPS_32_PERCENT", false) +FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "Number of fp64 OPS percent of max", "FLOPS_64_PERCENT", false) // Events FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false) diff --git a/projects/rdc/include/rdc/rdc.h b/projects/rdc/include/rdc/rdc.h index b0a12b7bd4..941f877260 100644 --- a/projects/rdc/include/rdc/rdc.h +++ b/projects/rdc/include/rdc/rdc.h @@ -292,6 +292,10 @@ typedef enum { RDC_FI_PROF_SM_ACTIVE, RDC_FI_PROF_OCC_PER_ACTIVE_CU, RDC_FI_PROF_OCC_ELAPSED, + // metrics below are divided by time passed + RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, + RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, + RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, /** * @brief Raw XGMI counter events diff --git a/projects/rdc/python_binding/rdc_bootstrap.py b/projects/rdc/python_binding/rdc_bootstrap.py index 64aabef021..d7b6d7a0c4 100644 --- a/projects/rdc/python_binding/rdc_bootstrap.py +++ b/projects/rdc/python_binding/rdc_bootstrap.py @@ -156,6 +156,9 @@ class rdc_field_t(c_int): RDC_FI_PROF_SM_ACTIVE = 812 RDC_FI_PROF_OCC_PER_ACTIVE_CU = 813 RDC_FI_PROF_OCC_ELAPSED = 814 + RDC_FI_PROF_EVAL_FLOPS_16_PERCENT = 815 + RDC_FI_PROF_EVAL_FLOPS_32_PERCENT = 816 + RDC_FI_PROF_EVAL_FLOPS_64_PERCENT = 817 RDC_EVNT_XGMI_0_NOP_TX = 1000 RDC_EVNT_XGMI_0_REQ_TX = 1001 RDC_EVNT_XGMI_0_RESP_TX = 1002 diff --git a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc index 8f9f473bb0..a0ca1c0d0f 100644 --- a/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc +++ b/projects/rdc/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.cc @@ -118,6 +118,9 @@ RdcRocpBase::RdcRocpBase() { {RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"}, {RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"}, {RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"}, + {RDC_FI_PROF_EVAL_FLOPS_16_PERCENT, "RDC_OPS_16_PER_SIMDCYCLE"}, + {RDC_FI_PROF_EVAL_FLOPS_32_PERCENT, "RDC_OPS_32_PER_SIMDCYCLE"}, + {RDC_FI_PROF_EVAL_FLOPS_64_PERCENT, "RDC_OPS_64_PER_SIMDCYCLE"}, // metrics below are not divided by time passed {RDC_FI_PROF_VALU_PIPE_ISSUE_UTIL, "ValuPipeIssueUtil"}, {RDC_FI_PROF_SM_ACTIVE, "VALUBusy"}, @@ -188,18 +191,6 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) return RDC_ST_BAD_PARAMETER; } - if (field == RDC_FI_PROF_OCC_ELAPSED) { - double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU); - double active_cycles_val = run_profiler(gpu_index, RDC_FI_PROF_ACTIVE_CYCLES); - - if (active_cycles_val != 0.0) { - *value = occupancy_val / active_cycles_val; - return RDC_ST_OK; - } else { - return RDC_ST_BAD_PARAMETER; - } - } - const auto start_time = std::chrono::high_resolution_clock::now(); *value = run_profiler(gpu_index, field); const auto stop_time = std::chrono::high_resolution_clock::now(); @@ -209,11 +200,48 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value) std::chrono::duration_cast(stop_time - start_time).count(); *value = *value / elapsed; } - // GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE. - // ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves - if (field == RDC_FI_PROF_GPU_UTIL_PERCENT) { - *value = *value / 100.0F; + + auto simd_to_cu = [this, gpu_index](double prof_value, double matrix_fp) { + // profiler result of RDC_OPS_*_PER_SIMDCYCLE is per SIMD, RDC needs it per CU + // + return prof_value / (matrix_fp / static_cast(agents[gpu_index].simd_per_cu)); + }; + + switch (field) { + case RDC_FI_PROF_GPU_UTIL_PERCENT: + // GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE. + // ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves + *value = *value / 100.0F; + break; + case RDC_FI_PROF_OCC_ELAPSED: { + const double occupancy_val = run_profiler(gpu_index, RDC_FI_PROF_OCC_PER_ACTIVE_CU); + const double active_cycles_val = run_profiler(gpu_index, RDC_FI_PROF_ACTIVE_CYCLES); + if (active_cycles_val != 0.0) { + *value = occupancy_val / active_cycles_val; + return RDC_ST_OK; + } else { + return RDC_ST_BAD_PARAMETER; + } + } break; + case RDC_FI_PROF_EVAL_FLOPS_16_PERCENT: { + // 1024, 2048, and 256 are taken from "INTRODUCING AMD CDNA 3 ARCHITECTURE" white paper + const std::string target_version = agents[gpu_index].name; + // TODO: Design a lookup table for other GPUs + const bool isMI200 = (target_version.find("gfx90a") != std::string::npos); + if (isMI200) { + *value = simd_to_cu(*value, 1024.0F); // FLOPS/clock/CU + } else { // Assume mi300 + *value = simd_to_cu(*value, 2048.0F); // FLOPS/clock/CU + } + } break; + case RDC_FI_PROF_EVAL_FLOPS_32_PERCENT: + case RDC_FI_PROF_EVAL_FLOPS_64_PERCENT: + *value = simd_to_cu(*value, 256.0F); // FLOPS/clock/CU + break; + default: + break; } + return RDC_ST_OK; }