diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 11d81a291b..23c73c1b7d 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -67,6 +67,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Fixed option specs-correction * Fixed kernel name and kernel dispatch filtering when using rocprof v3 * Fixed not collecting TCC channel counters in rocprof v3 +* Fixed peak FLOPS of F8, I8, F16, and BF16 on MI300 ### Known issues diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml index 5aa277a1d1..2113c375e2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0200_system-speed-of-light.yaml @@ -47,21 +47,21 @@ Panel Config: unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: 
((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) @@ -87,9 +87,9 @@ Panel Config: MFMA IOPs (Int8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: Active CUs: value: $numActiveCUs diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml index a9fe1307c5..85bc40baf2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/1100_compute-unit-compute-pipeline.yaml @@ -44,23 +44,23 @@ Panel Config: MFMA FLOPs (F8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) @@ -86,9 +86,9 @@ Panel Config: MFMA IOPs (INT8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: - metric_table: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml index 5aa277a1d1..2113c375e2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0200_system-speed-of-light.yaml @@ -47,21 +47,21 @@ Panel Config: unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) @@ -87,9 +87,9 @@ Panel Config: MFMA IOPs (Int8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: Active CUs: value: $numActiveCUs diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml index a9fe1307c5..85bc40baf2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/1100_compute-unit-compute-pipeline.yaml @@ -44,23 +44,23 @@ Panel Config: MFMA FLOPs (F8): value: 
AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) @@ -86,9 +86,9 @@ Panel Config: MFMA IOPs (INT8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: - metric_table: diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml 
b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml index 5aa277a1d1..2113c375e2 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0200_system-speed-of-light.yaml @@ -47,21 +47,21 @@ Panel Config: unit: GFLOP peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp))) @@ -87,9 +87,9 @@ Panel Config: MFMA IOPs (Int8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 
1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: Active CUs: value: $numActiveCUs diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml index a9fe1307c5..ef00cfdc81 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/1100_compute-unit-compute-pipeline.yaml @@ -44,23 +44,23 @@ Panel Config: MFMA FLOPs (F8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) tips: MFMA FLOPs (BF16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F16): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))) unit: GFLOP - peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + / ((($max_sclk * $cu_per_gpu) * 2048) / 1000)) tips: MFMA FLOPs (F32): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - 
Start_Timestamp))) @@ -86,10 +86,10 @@ Panel Config: MFMA IOPs (INT8): value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))) unit: GIOP - peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000) + peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000) pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))) - / ((($max_sclk * $cu_per_gpu) * 8192) / 1000)) - tips: + / ((($max_sclk * $cu_per_gpu) * 4096) / 1000)) + tips: All Peak FLOPS/clock/CU come from https://github.com/ROCm/amd_matrix_instruction_calculator/ - metric_table: id: 1102