Profiler - Modify metrics

Remove occupancy metrics and replace with OccupancyPercent

Add OCCUPANCY_PERCENT which uses OccupancyPercent
Add GPU_UTIL_PERCENT (the GR_ENGINE_ACTIVE equivalent) which uses GPU_UTIL/100
Add TENSOR_ACTIVE_PERCENT which uses MfmaUtil
Modify FLOPS_64 to use FP64_ACTIVE

Change-Id: I5f30d77a0c80f5ac78abd1a9e57f8a0a3c6cc00b
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>


[ROCm/rdc commit: 28acbf0436]
このコミットが含まれているのは:
Galantsev, Dmitrii
2024-10-10 13:09:29 -05:00
コミット 793b2de0cb
4個のファイルの変更26行の追加18行の削除
+3 -3
ファイルの表示
@@ -204,10 +204,10 @@ E.g. Correct output on MI300 using [gpu-burn](https://github.com/ROCm/HIP-Exampl
export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1
gpu-burn
# terminal 3
rdci dmon -u -e 800,801,803 -i 0 -c 1
rdci dmon -u -e 800,801 -i 0 -c 1
# output:
# GPU MN_OCC_PER_CU MN_OCC_PER_ACT_CU ACTIVE_WAVES
# 0 1683.422 6479.242 32640.000
# GPU OCCUPANCY_PERCENT ACTIVE_WAVES
# 0 001.000 32640.000
### `HSA_STATUS_ERROR_OUT_OF_RESOURCES`
+11 -10
ファイルの表示
@@ -120,17 +120,18 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr
// This doesn't map to rocprofiler counters directly
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
// See metrics.xml in rocprofiler
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false)
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false)
// metrics below are divided by time passed
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
// Events
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
+3 -2
ファイルの表示
@@ -258,11 +258,12 @@ typedef enum {
/**
* @brief ROC-profiler related fields
*/
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800,
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
RDC_FI_PROF_OCCUPANCY_PERCENT = 800,
RDC_FI_PROF_ACTIVE_CYCLES,
RDC_FI_PROF_ACTIVE_WAVES,
RDC_FI_PROF_ELAPSED_CYCLES,
RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
RDC_FI_PROF_GPU_UTIL_PERCENT,
// metrics below are divided by time passed
RDC_FI_PROF_EVAL_MEM_R_BW,
RDC_FI_PROF_EVAL_MEM_W_BW,
+9 -3
ファイルの表示
@@ -246,17 +246,18 @@ RdcRocpBase::RdcRocpBase() {
// all fields
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MeanOccupancyPerCU"},
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
{RDC_FI_PROF_OCCUPANCY_PERCENT, "OccupancyPercent"},
{RDC_FI_PROF_ACTIVE_CYCLES, "GRBM_GUI_ACTIVE"},
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
{RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"},
// metrics below are divided by time passed
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
{RDC_FI_PROF_EVAL_FLOPS_64, "FP64_ACTIVE"},
};
std::vector<std::string> all_fields;
@@ -393,6 +394,11 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
// << (*value / elapsed) << "]");
*value = *value / elapsed;
}
// The GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
// Since ENGINE_ACTIVE = GPU_UTIL / 100, derive it here by dividing by 100.
if (field == RDC_FI_PROF_GPU_UTIL_PERCENT) {
*value = *value / 100.0F;
}
return Rocp2RdcError(status);
}