Fix peak FLOPS of F8, I8, F16, and BF16 on MI300

[ROCm/rocprofiler-compute commit: ab6665d317]
This commit is contained in:
Fei Zheng
2025-06-04 12:51:46 -06:00
committed by GitHub
parent d1feafe4db
commit dd2d9cddf0
7 changed files with 47 additions and 46 deletions
@@ -67,6 +67,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
* Fixed option specs-correction
* Fixed kernel name and kernel dispatch filtering when using rocprof v3
* Fixed not collecting TCC channel counters in rocprof v3
* Fixed peak FLOPS of F8, I8, F16, and BF16 on MI300
### Known issues
@@ -47,21 +47,21 @@ Panel Config:
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -87,9 +87,9 @@ Panel Config:
MFMA IOPs (Int8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
Active CUs:
value: $numActiveCUs
@@ -44,23 +44,23 @@ Panel Config:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -86,9 +86,9 @@ Panel Config:
MFMA IOPs (INT8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
- metric_table:
@@ -47,21 +47,21 @@ Panel Config:
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -87,9 +87,9 @@ Panel Config:
MFMA IOPs (Int8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
Active CUs:
value: $numActiveCUs
@@ -44,23 +44,23 @@ Panel Config:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -86,9 +86,9 @@ Panel Config:
MFMA IOPs (INT8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
- metric_table:
@@ -47,21 +47,21 @@ Panel Config:
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -87,9 +87,9 @@ Panel Config:
MFMA IOPs (Int8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
Active CUs:
value: $numActiveCUs
@@ -44,23 +44,23 @@ Panel Config:
MFMA FLOPs (F8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips:
MFMA FLOPs (BF16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_BF16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F16):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GFLOP
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 2048) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_F16 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
/ ((($max_sclk * $cu_per_gpu) * 2048) / 1000))
tips:
MFMA FLOPs (F32):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_F32 * 512) / (End_Timestamp - Start_Timestamp)))
@@ -86,10 +86,10 @@ Panel Config:
MFMA IOPs (INT8):
value: AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp)))
unit: GIOP
peak: ((($max_sclk * $cu_per_gpu) * 8192) / 1000)
peak: ((($max_sclk * $cu_per_gpu) * 4096) / 1000)
pop: ((100 * AVG(((SQ_INSTS_VALU_MFMA_MOPS_I8 * 512) / (End_Timestamp - Start_Timestamp))))
/ ((($max_sclk * $cu_per_gpu) * 8192) / 1000))
tips:
/ ((($max_sclk * $cu_per_gpu) * 4096) / 1000))
tips: All Peak FLOPS/clock/CU come from https://github.com/ROCm/amd_matrix_instruction_calculator/
- metric_table:
id: 1102