Profiler - Modify metrics
Remove occupancy metrics and replace with OccupancyPercent
Add OCCUPANCY_PERCENT which uses OccupancyPercent
Add GPU_UTIL_PERCENT (equivalent to GR_ENGINE_ACTIVE), computed as GPU_UTIL/100
Add TENSOR_ACTIVE_PERCENT which uses MfmaUtil
Modify FLOPS_64 to use FP64_ACTIVE
Change-Id: I5f30d77a0c80f5ac78abd1a9e57f8a0a3c6cc00b
Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com>
[ROCm/rdc commit: 28acbf0436]
This commit is contained in:
@@ -204,10 +204,10 @@ E.g. Correct output on MI300 using [gpu-burn](https://github.com/ROCm/HIP-Exampl
|
||||
export HSA_TOOLS_LIB=/opt/rocm/lib/librocprofiler64.so.1
|
||||
gpu-burn
|
||||
# terminal 3
|
||||
rdci dmon -u -e 800,801,803 -i 0 -c 1
|
||||
rdci dmon -u -e 800,801 -i 0 -c 1
|
||||
# output:
|
||||
# GPU MN_OCC_PER_CU MN_OCC_PER_ACT_CU ACTIVE_WAVES
|
||||
# 0 1683.422 6479.242 32640.000
|
||||
# GPU OCCUPANCY_PERCENT ACTIVE_WAVES
|
||||
# 0 001.000 32640.000
|
||||
|
||||
### `HSA_STATUS_ERROR_OUT_OF_RESOURCES`
|
||||
|
||||
|
||||
@@ -120,17 +120,18 @@ FLD_DESC_ENT(RDC_FI_XGMI_TOTAL_WRITE_KB, "XGMI accumlated data write size acr
|
||||
// This doesn't map to rocprofiler counters directly
|
||||
// See counter_map in rdc/include/rdc_libs/rdc_modules/rdc_rocp/RdcRocpBase.h
|
||||
// See metrics.xml in rocprofiler
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "Mean occupancy per CU", "MN_OCC_PER_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "Mean occupancy per active CU", "MN_OCC_PER_ACT_CU", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_OCCUPANCY_PERCENT, "Percent of GPU occupancy", "OCCUPANCY_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_CYCLES, "Number of Active Cycles", "ACTIVE_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ACTIVE_WAVES, "Number of Active Waves", "ACTIVE_WAVES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_ELAPSED_CYCLES, "Number of Elapsed Cycles over all SMs", "ELAPSED_CYCLES", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "Percent of Active Pipe Tensors", "TENSOR_PERCENT", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_GPU_UTIL_PERCENT, "Percent of GPU Utilization", "GPU_UTIL_PERCENT", false)
|
||||
// metrics below are divided by time passed
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_R_BW, "Fetched from video memory kb / ms", "MEM_R_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_MEM_W_BW, "Written to video memory kb / ms", "MEM_W_BW", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_16, "Number of fp16 OPS / ms", "FLOPS_16", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_32, "Number of fp32 OPS / ms", "FLOPS_32", false)
|
||||
FLD_DESC_ENT(RDC_FI_PROF_EVAL_FLOPS_64, "Number of fp64 OPS / ms", "FLOPS_64", false)
|
||||
|
||||
// Events
|
||||
FLD_DESC_ENT(RDC_EVNT_XGMI_0_NOP_TX, "NOPs sent to neighbor 0", "XGMI_NOP_0", false)
|
||||
|
||||
@@ -258,11 +258,12 @@ typedef enum {
|
||||
/**
|
||||
* @brief ROC-profiler related fields
|
||||
*/
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU = 800,
|
||||
RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU,
|
||||
RDC_FI_PROF_OCCUPANCY_PERCENT = 800,
|
||||
RDC_FI_PROF_ACTIVE_CYCLES,
|
||||
RDC_FI_PROF_ACTIVE_WAVES,
|
||||
RDC_FI_PROF_ELAPSED_CYCLES,
|
||||
RDC_FI_PROF_TENSOR_ACTIVE_PERCENT,
|
||||
RDC_FI_PROF_GPU_UTIL_PERCENT,
|
||||
// metrics below are divided by time passed
|
||||
RDC_FI_PROF_EVAL_MEM_R_BW,
|
||||
RDC_FI_PROF_EVAL_MEM_W_BW,
|
||||
|
||||
@@ -246,17 +246,18 @@ RdcRocpBase::RdcRocpBase() {
|
||||
|
||||
// all fields
|
||||
static const std::map<rdc_field_t, const char*> temp_field_map_k = {
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_CU, "MeanOccupancyPerCU"},
|
||||
{RDC_FI_PROF_MEAN_OCCUPANCY_PER_ACTIVE_CU, "MeanOccupancyPerActiveCU"},
|
||||
{RDC_FI_PROF_OCCUPANCY_PERCENT, "OccupancyPercent"},
|
||||
{RDC_FI_PROF_ACTIVE_CYCLES, "GRBM_GUI_ACTIVE"},
|
||||
{RDC_FI_PROF_ACTIVE_WAVES, "SQ_WAVES"},
|
||||
{RDC_FI_PROF_ELAPSED_CYCLES, "GRBM_COUNT"},
|
||||
{RDC_FI_PROF_TENSOR_ACTIVE_PERCENT, "MfmaUtil"}, // same as TENSOR_ACTIVE but available for more GPUs
|
||||
{RDC_FI_PROF_GPU_UTIL_PERCENT, "GPU_UTIL"},
|
||||
// metrics below are divided by time passed
|
||||
{RDC_FI_PROF_EVAL_MEM_R_BW, "FETCH_SIZE"},
|
||||
{RDC_FI_PROF_EVAL_MEM_W_BW, "WRITE_SIZE"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_16, "TOTAL_16_OPS"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_32, "TOTAL_32_OPS"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_64, "TOTAL_64_OPS"},
|
||||
{RDC_FI_PROF_EVAL_FLOPS_64, "FP64_ACTIVE"},
|
||||
};
|
||||
|
||||
std::vector<std::string> all_fields;
|
||||
@@ -393,6 +394,11 @@ rdc_status_t RdcRocpBase::rocp_lookup(rdc_gpu_field_t gpu_field, double* value)
|
||||
// << (*value / elapsed) << "]");
|
||||
*value = *value / elapsed;
|
||||
}
|
||||
// GPU_UTIL metric is available on more GPUs than ENGINE_ACTIVE.
|
||||
// ENGINE_ACTIVE = GPU_UTIL/100, so do the math ourselves
|
||||
if (field == RDC_FI_PROF_GPU_UTIL_PERCENT) {
|
||||
*value = *value / 100.0F;
|
||||
}
|
||||
return Rocp2RdcError(status);
|
||||
}
|
||||
|
||||
|
||||
Reference in new issue
Block a user