diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml index 66c656fb4c..9c80f75e80 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml index 38af3367e9..64ed38e23e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml index 839c04fd2e..b06371c850 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG(($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) ) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml index f9f4d7cc19..c847403bb1 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG(($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) ) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml index 9ba1e6f1fa..c951110895 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml index 500c7ff805..4064a4a84b 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml +++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml @@ -3,9 +3,17 @@ Panel Config: id: 400 title: Roofline metrics_description: - VALU FLOPs: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point + VALU FLOPs (F16): 'The total 16-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F16 FLOPs + achievable on the specific accelerator. Note: this does not include any F16 + operations from MFMA instructions.' + VALU FLOPs (F32): 'The total 32-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F32 FLOPs + achievable on the specific accelerator. Note: this does not include any F32 + operations from MFMA instructions.' + VALU FLOPs (F64): 'The total 64-bit floating-point operations executed per second + on the VALU. This is presented with the value of the peak empirical F64 FLOPs + achievable on the specific accelerator. Note: this does not include any F64 operations from MFMA instructions.' MFMA FLOPs (F8): The total number of 8-bit brain floating point MFMA operations executed per second. This does not include any 16-bit brain floating point operations @@ -28,6 +36,11 @@ Panel Config: per second. Note: this does not include any 64-bit floating point operations from VALU instructions. The peak empirically measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' + MFMA FLOPs (F6F4): 'The total number of 4-bit and 6-bit floating point MFMA operations + executed per second. Note: this does not include any floating point operations + from VALU instructions. The peak empirically measured F6F4 MFMA operations achievable + on the specific accelerator is displayed alongside for comparison. It is supported + on AMD Instinct MI350 series (gfx950) and later only.' MFMA IOPs (Int8): 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions. The peak empirically measured INT8 MFMA operations achievable @@ -80,15 +93,24 @@ Panel Config: unit: Unit peak: Peak (Empirical) metric: - VALU FLOPs: - value: AVG((($wave_size * ( (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 - + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + (SQ_INSTS_VALU_ADD_F32 - + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) - + (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) - + SQ_INSTS_VALU_TRANS_F64) )) / ((End_Timestamp - Start_Timestamp) / 1e9)) - / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 )) / ((End_Timestamp + - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) diff --git a/projects/rocprofiler-compute/src/utils/parser.py b/projects/rocprofiler-compute/src/utils/parser.py index 192d5db774..a2f625f206 100755 --- a/projects/rocprofiler-compute/src/utils/parser.py +++ b/projects/rocprofiler-compute/src/utils/parser.py @@ -771,7 +771,7 @@ def build_metric_value_string(dfs, dfs_type, normal_unit, profiling_config): def init_metric_evaluator( - raw_pmc_df: Union[pd.DataFrame, dict], ammolite_vars: dict + raw_pmc_df: Union[pd.DataFrame, dict], ammolite_vars: dict, empirical_peaks: dict ) -> None: if isinstance(raw_pmc_df, dict): raw_pmc_df_keys = set(raw_pmc_df.keys()) @@ -790,6 +790,7 @@ def init_metric_evaluator( # The process-local globals are used for performance optimization. globals().update(raw_pmc_df_items) globals().update(ammolite_vars) + globals().update(empirical_peaks) def run_metric_evaluator(row_expr: str) -> str: @@ -821,6 +822,38 @@ def run_metric_evaluator(row_expr: str) -> str: console_error("analysis", str(ae)) +def create_empirical_peaks_dict(empirical_peaks_df): + """Create empirical peaks dictionary""" + empirical_peaks = {} + + if not empirical_peaks_df.empty: + peak_data_row = empirical_peaks_df.iloc[0] + for col in empirical_peaks_df.columns: + empirical_peaks[f"ammolite__{col}_empirical_peak"] = peak_data_row[col] + else: + peak_names = [ + "FP16Flops", + "FP32Flops", + "FP64Flops", + "MFMAF64Flops", + "MFMAF32Flops", + "MFMAF16Flops", + "MFMABF16Flops", + "MFMAF8Flops", + "MFMAI8Ops", + "HBMBw", + "L2Bw", + "L1Bw", + "LDSBw", + "MFMA_FLOPs_F6F4", + ] + # initialize peaks to 0 + for peak_name in peak_names: + empirical_peaks[f"ammolite__{peak_name}_empirical_peak"] = 0 + + return empirical_peaks + + @demarcate def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, config): """ @@ -927,32 +960,10 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, "wave_size is not available in sysinfo.csv, please provide the correct " "value using --specs-correction" ) - if not empirical_peaks_df.empty: - peak_data_row = empirical_peaks_df.iloc[0] - for metric_name in empirical_peaks_df.columns: - var_name = f"ammolite__{metric_name}_empirical_peak" - locals()[var_name] = peak_data_row[metric_name] - else: - default_peaks = [ - "MFMAF64Flops", - "MFMAF32Flops", - "MFMAF16Flops", - "MFMABF16Flops", - "MFMAF8Flops", - "MFMAI8Ops", - "HBMBw", - "L2Bw", - "L1Bw", - "LDSBw", - "MFMA_FLOPs_F6F4", - ] - # set values to 0 if no no empirical peaks from roofline.csv are provided - for peak_name in default_peaks: - var_name = f"ammolite__{peak_name}_empirical_peak" - exec(f"{var_name} = 0", globals(), locals()) + + empirical_peaks = create_empirical_peaks_dict(empirical_peaks_df) # TODO: fix all $normUnit in Unit column or title - # build and eval all derived build-in global variables ammolite__build_in = {} @@ -966,6 +977,8 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, ammolite__build_in[key] = eval(compile(s, "", "eval")) except TypeError: ammolite__build_in[key] = None + except NameError: + ammolite__build_in[key] = None except KeyError: ammolite__build_in[key] = None except AttributeError as ae: @@ -1022,12 +1035,32 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, ) if matched_vars: for v in matched_vars: - print( - "Var ", - v, - ":", - eval(compile(v, "", "eval")), - ) + try: + value = eval( + compile(v, "", "eval") + ) + print("Var ", v, ":", value) + except NameError: + if "_empirical_peak" in v: + if v in empirical_peaks: + print( + "Var ", + v, + ":", + empirical_peaks[v], + ) + else: + print( + "Var ", + v, + ": [empirical peak not found]", # noqa + ) + else: + print( + "Var ", + v, + ": [not available in main thread]", # noqa + ) matched_cols = re.findall( r"raw_pmc_df\['\w+'\]\['\w+'\]", row[expr] ) @@ -1063,6 +1096,21 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, eval(compile(row[expr], "", "eval")) ) print("~" * 40) + except NameError as ne: + if "empirical_peak" in str(ne): + console_warning( + "Skipping debug evaluation. Empirical peak variables " # noqa + "not available in main thread: {}".format( # noqa + str(ne) + ) + ) + else: + console_warning( + "Skipping debug evaluation. Variable not available: {}".format( # noqa + str(ne) + ) + ) + print("~" * 40) except TypeError: console_warning( "Skipping entry. Encountered a missing " @@ -1100,7 +1148,6 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, ammolite_vars = { key: val for key, val in locals().items() if key.startswith("ammolite__") } - # Empirically, 16 is about as much as we need. processes = min(16, multiprocessing.cpu_count() // 2) @@ -1108,7 +1155,7 @@ def eval_metric(dfs, dfs_type, sys_info, empirical_peaks_df, raw_pmc_df, debug, with multiprocessing.Pool( processes=processes, initializer=init_metric_evaluator, - initargs=(raw_pmc_df, ammolite_vars), + initargs=(raw_pmc_df, ammolite_vars, empirical_peaks), ) as pool: outs = pool.map(run_metric_evaluator, row_exprs) diff --git a/projects/rocprofiler-compute/utils/autogen_hash.yaml b/projects/rocprofiler-compute/utils/autogen_hash.yaml index 7079981108..7d4fc1150f 100644 --- a/projects/rocprofiler-compute/utils/autogen_hash.yaml +++ b/projects/rocprofiler-compute/utils/autogen_hash.yaml @@ -23,12 +23,12 @@ src/rocprof_compute_soc/analysis_configs/gfx940/0300_memory_chart.yaml: 249e9ae0 src/rocprof_compute_soc/analysis_configs/gfx941/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51 src/rocprof_compute_soc/analysis_configs/gfx942/0300_memory_chart.yaml: 249e9ae0445de0927827ec14d20f946a07d50d92fd56e1993bbe0c17eb65bd51 src/rocprof_compute_soc/analysis_configs/gfx950/0300_memory_chart.yaml: 643b31ffa43bc3613d6f90b0c23d95093d0d0aa5bc8e72d9a0fbc1b739a08b67 -src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: 6406ce67cd55064f0d2db2a3511c6536cc1625314ddb31366900fbf3c60ed523 -src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 100d555cf9e70b892e22f92ddd9c0a5d1f914d07077c4a8d35941e8ad62b5b30 -src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: f8bf66f43c9afede4fd1f17c279050cc27cc6fbc1cdb53a71ae8ceb0eb84dc37 -src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 6fae04dcf4bcabe4a71f5d9eefc379a38d30cdf05fbb14e2c276e1c272fdb3f6 -src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: c8dfe7df24f94dfa229ffa2035b802c6833ce98f7710e0889bc5710f2167d4c0 -src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 734fdfa818bfd8a87e01a0dd795c502a567c72158ca9b7bfe01e99451e8aa537 +src/rocprof_compute_soc/analysis_configs/gfx908/0400_roofline.yaml: f346786a63056303a0c6137f82bcea0baa3bb5bf63bfbef792df531725f64d1d +src/rocprof_compute_soc/analysis_configs/gfx90a/0400_roofline.yaml: 92aced41d421faf00ed542da38f47ad22c2fbf1f0383c550e255ed9bab95b0d8 +src/rocprof_compute_soc/analysis_configs/gfx940/0400_roofline.yaml: 867a31d84eeb5b5aa64606fcdb67732347acc43e88121fcc70af79169f44fd83 +src/rocprof_compute_soc/analysis_configs/gfx941/0400_roofline.yaml: 3f7bef2487df1ece7302de3f413a14032ba35d44fcf4e5ddee5186e2fc223797 +src/rocprof_compute_soc/analysis_configs/gfx942/0400_roofline.yaml: 58272f5d1136489255a7bf9c6ade720a0567b17ab58dc58ae796597ae4d73ce5 +src/rocprof_compute_soc/analysis_configs/gfx950/0400_roofline.yaml: 4362a8120d70cc0e62abd4367a09207897bbc3be44805092dabaefda41803391 src/rocprof_compute_soc/analysis_configs/gfx908/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb src/rocprof_compute_soc/analysis_configs/gfx90a/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb src/rocprof_compute_soc/analysis_configs/gfx940/0500_command_processor_cpc_cpf.yaml: da1c2997d42d66da2aa069caa741cf9eade124357c56e4290333de2f3e0412bb diff --git a/projects/rocprofiler-compute/utils/unified_config.yaml b/projects/rocprofiler-compute/utils/unified_config.yaml index 531afa847b..0b8e23f1ed 100644 --- a/projects/rocprofiler-compute/utils/unified_config.yaml +++ b/projects/rocprofiler-compute/utils/unified_config.yaml @@ -2802,14 +2802,24 @@ panels: peak: Peak (Empirical) metric: gfx90a: - VALU FLOPs: + VALU FLOPs (F16): value: AVG((($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -2855,14 +2865,24 @@ panels: unit: GB/s peak: $LDSBw_empirical_peak gfx908: - VALU FLOPs: + VALU FLOPs (F16): value: AVG((($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -2908,14 +2928,24 @@ panels: unit: GB/s peak: $LDSBw_empirical_peak gfx940: - VALU FLOPs: - value: AVG(($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -2965,14 +2995,24 @@ panels: unit: GB/s peak: $LDSBw_empirical_peak gfx941: - VALU FLOPs: - value: AVG(($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) - ) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + VALU FLOPs (F16): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -3022,14 +3062,24 @@ panels: unit: GB/s peak: $LDSBw_empirical_peak gfx942: - VALU FLOPs: + VALU FLOPs (F16): value: AVG((($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -3079,14 +3129,24 @@ panels: unit: GB/s peak: $LDSBw_empirical_peak gfx950: - VALU FLOPs: + VALU FLOPs (F16): value: AVG((($wave_size * ( - (SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16) + - (SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32) + - (SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64) + SQ_INSTS_VALU_ADD_F16 + SQ_INSTS_VALU_MUL_F16 + (2 * SQ_INSTS_VALU_FMA_F16) + SQ_INSTS_VALU_TRANS_F16 )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s - peak: (((($max_sclk * $cu_per_gpu) * 64) * 2) / 1000) + peak: $FP16Flops_empirical_peak + VALU FLOPs (F32): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F32 + SQ_INSTS_VALU_MUL_F32 + (2 * SQ_INSTS_VALU_FMA_F32) + SQ_INSTS_VALU_TRANS_F32 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP32Flops_empirical_peak + VALU FLOPs (F64): + value: AVG((($wave_size * ( + SQ_INSTS_VALU_ADD_F64 + SQ_INSTS_VALU_MUL_F64 + (2 * SQ_INSTS_VALU_FMA_F64) + SQ_INSTS_VALU_TRANS_F64 + )) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) + unit: GFLOP/s + peak: $FP64Flops_empirical_peak MFMA FLOPs (F64): value: AVG((((SQ_INSTS_VALU_MFMA_MOPS_F64 * 512)) / ((End_Timestamp - Start_Timestamp) / 1e9)) / 1e9) unit: GFLOP/s @@ -3578,15 +3638,35 @@ panels: ) / 1e9 unit: GFLOP/s metrics_description: - VALU FLOPs: - plain: 'The total floating-point operations executed per second on the VALU. - This is also presented as a percent of the peak theoretical FLOPs achievable - on the specific accelerator. Note: this does not include any floating-point - operations from MFMA instructions.' - rst: 'The total floating-point operations executed per second on the :ref:`VALU - `. This is also presented as a percent of the peak theoretical - FLOPs achievable on the specific accelerator. Note: this does not include - any floating-point operations from :ref:`MFMA ` instructions.' + VALU FLOPs (F16): + plain: 'The total 16-bit floating-point operations executed per second on the VALU. + This is presented with the value of the peak empirical F16 FLOPs achievable + on the specific accelerator. Note: this does not include any F16 operations + from MFMA instructions.' + rst: 'The total 16-bit floating-point operations executed per second on the :ref:`VALU + `. This is presented with the value of the peak empirical F16 FLOPs achievable + on the specific accelerator. Note: this does not include any F16 operations + from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU FLOPs (F32): + plain: 'The total 32-bit floating-point operations executed per second on the VALU. + This is presented with the value of the peak empirical F32 FLOPs achievable + on the specific accelerator. Note: this does not include any F32 operations + from MFMA instructions.' + rst: 'The total 32-bit floating-point operations executed per second on the :ref:`VALU + `. This is presented with the value of the peak empirical F32 FLOPs achievable + on the specific accelerator. Note: this does not include any F32 operations + from :ref:`MFMA ` instructions.' + unit: GFLOPs + VALU FLOPs (F64): + plain: 'The total 64-bit floating-point operations executed per second on the VALU. + This is presented with the value of the peak empirical F64 FLOPs achievable + on the specific accelerator. Note: this does not include any F64 operations + from MFMA instructions.' + rst: 'The total 64-bit floating-point operations executed per second on the :ref:`VALU + `. This is presented with the value of the peak empirical F64 FLOPs achievable + on the specific accelerator. Note: this does not include any F64 operations + from :ref:`MFMA ` instructions.' unit: GFLOPs MFMA FLOPs (F8): plain: The total number of 8-bit brain floating point MFMA operations executed @@ -3646,6 +3726,19 @@ panels: measured F64 MFMA operations achievable on the specific accelerator is displayed alongside for comparison.' unit: GFLOPs + MFMA FLOPs (F6F4): + plain: 'The total number of 4-bit and 6-bit floating point MFMA operations executed + per second. Note: this does not include any floating point operations from + VALU instructions. The peak empirically measured F6F4 MFMA operations + achievable on the specific accelerator is displayed alongside for comparison. + It is supported on AMD Instinct MI350 series (gfx950) and later only.' + rst: 'The total number of 4-bit and 6-bit floating point :ref:`MFMA ` + operations executed per second. Note: this does not include any floating point + operations from :ref:`VALU ` instructions. The peak empirically + measured F6F4 MFMA operations achievable on the specific accelerator is + displayed alongside for comparison. It is supported on AMD Instinct MI350 + series (gfx950) and later only.' + unit: GFLOPs MFMA IOPs (Int8): plain: 'The total number of 8-bit integer MFMA operations executed per second. Note: this does not include any 8-bit integer operations from VALU instructions.