diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md index 4a410947cb..50812ff9bb 100644 --- a/projects/rocprofiler-compute/CHANGELOG.md +++ b/projects/rocprofiler-compute/CHANGELOG.md @@ -20,7 +20,7 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs. * Default is FP32, but user can specify as many types as desired to overlay on the same plot output * Additional datatypes for roofline profiling - * Now supports FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture) + * Now supports FP4, FP6, FP8, FP16, BF16, FP32, FP64, I8, I32, I64 (dependent on gpu architecture) * Support host-trap PC Sampling on CLI (beta version) diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py index 774d466e87..168e8572d7 100644 --- a/projects/rocprofiler-compute/src/argparser.py +++ b/projects/rocprofiler-compute/src/argparser.py @@ -390,12 +390,12 @@ Examples: "-R", "--roofline-data-type", required=False, - choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], + choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], metavar="", nargs="+", type=str, default=["FP32"], - help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ", + help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ", ) # roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)") @@ -605,12 +605,12 @@ Examples: "-R", "--roofline-data-type", required=False, - choices=["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], + choices=["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", 
"FP64", "I8", "I32", "I64"], metavar="", nargs="+", type=str, default=["FP32"], - help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ", + help="\t\t\tChoose datatypes to view roofline PDFs for: (DEFAULT: FP32)\n\t\t\t FP4\n\t\t\t FP6\n\t\t\t FP8\n\t\t\t FP16\n\t\t\t BF16\n\t\t\t FP32\n\t\t\t FP64\n\t\t\t I8\n\t\t\t I32\n\t\t\t I64\n\t\t\t ", ) analyze_advanced_group.add_argument( diff --git a/projects/rocprofiler-compute/src/roofline.py b/projects/rocprofiler-compute/src/roofline.py index c9a179987e..82ac0a0361 100644 --- a/projects/rocprofiler-compute/src/roofline.py +++ b/projects/rocprofiler-compute/src/roofline.py @@ -337,48 +337,69 @@ class Roofline: # Plot Application AI ####################### # Plot the arithmetic intensity points for each cache level - if ops_flops == "FLOP": - fig.add_trace( - go.Scatter( - x=self.__ai_data["ai_l1"][0], - y=self.__ai_data["ai_l1"][1], - name=dtype + "_ai_l1", - mode="markers", - marker_symbol=( - SYMBOLS if self.__run_parameters["include_kernel_names"] else None - ), - ) - ) - fig.add_trace( - go.Scatter( - x=self.__ai_data["ai_l2"][0], - y=self.__ai_data["ai_l2"][1], - name=dtype + "_ai_l2", - mode="markers", - marker_symbol=( - SYMBOLS if self.__run_parameters["include_kernel_names"] else None - ), - ) - ) - fig.add_trace( - go.Scatter( - x=self.__ai_data["ai_hbm"][0], - y=self.__ai_data["ai_hbm"][1], - name=dtype + "_ai_hbm", - mode="markers", - marker_symbol=( - SYMBOLS if self.__run_parameters["include_kernel_names"] else None - ), - ) - ) - # Set layout - fig.update_layout( - xaxis_title="Arithmetic Intensity (FLOPs/Byte)", - yaxis_title="Performance (GFLOP/sec)", - hovermode="x unified", - margin=dict(l=50, r=50, b=50, t=50, pad=4), + # Check for F6F4 PC which applies to both FP4 and FP6 MFMA; avoid duplicate plotting + skipAI = False + if dtype == "FP4" or dtype == "FP6": + if (dtype == 
"FP6") and ( + "FP4" in self.__run_parameters["roofline_data_type"] + ): + skipAI = True + console_debug( + "roofline", + "Datatype {} is captured through the F6F4 perfmon event".format(dtype), ) + dtype = "F6F4" + + if ops_flops == "FLOP": + if not skipAI: + fig.add_trace( + go.Scatter( + x=self.__ai_data["ai_l1"][0], + y=self.__ai_data["ai_l1"][1], + name=dtype + "_ai_l1", + mode="markers", + marker_symbol=( + SYMBOLS + if self.__run_parameters["include_kernel_names"] + else None + ), + ) + ) + fig.add_trace( + go.Scatter( + x=self.__ai_data["ai_l2"][0], + y=self.__ai_data["ai_l2"][1], + name=dtype + "_ai_l2", + mode="markers", + marker_symbol=( + SYMBOLS + if self.__run_parameters["include_kernel_names"] + else None + ), + ) + ) + fig.add_trace( + go.Scatter( + x=self.__ai_data["ai_hbm"][0], + y=self.__ai_data["ai_hbm"][1], + name=dtype + "_ai_hbm", + mode="markers", + marker_symbol=( + SYMBOLS + if self.__run_parameters["include_kernel_names"] + else None + ), + ) + ) + + # Set layout + fig.update_layout( + xaxis_title="Arithmetic Intensity (FLOPs/Byte)", + yaxis_title="Performance (GFLOP/sec)", + hovermode="x unified", + margin=dict(l=50, r=50, b=50, t=50, pad=4), + ) else: # Set layout fig.update_layout( diff --git a/projects/rocprofiler-compute/src/utils/roofline_calc.py b/projects/rocprofiler-compute/src/utils/roofline_calc.py index 57227fd3bb..21e99e46b9 100644 --- a/projects/rocprofiler-compute/src/utils/roofline_calc.py +++ b/projects/rocprofiler-compute/src/utils/roofline_calc.py @@ -44,15 +44,61 @@ FONT_WEIGHT = "bold" # SUPPORTED_DATATYPES table is based on datatype support in rocm-amdgpu-bench repository # Indicates which datatypes per gpu arch can be generated by the roofline binary SUPPORTED_DATATYPES = { - "gfx90a": ["FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: F8 - "gfx940": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: - "gfx941": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # 
Unsupported: - "gfx942": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: - "gfx950": ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"], # Unsupported: + "gfx90a": [ + "FP16", + "BF16", + "FP32", + "FP64", + "I8", + "I32", + "I64", + ], # Unsupported: F4, F6, F8 + "gfx940": [ + "FP8", + "FP16", + "BF16", + "FP32", + "FP64", + "I8", + "I32", + "I64", + ], # Unsupported: F4, F6 + "gfx941": [ + "FP8", + "FP16", + "BF16", + "FP32", + "FP64", + "I8", + "I32", + "I64", + ], # Unsupported: F4, F6 + "gfx942": [ + "FP8", + "FP16", + "BF16", + "FP32", + "FP64", + "I8", + "I32", + "I64", + ], # Unsupported: F4, F6 + "gfx950": [ + "FP4", + "FP6", + "FP8", + "FP16", + "BF16", + "FP32", + "FP64", + "I8", + "I32", + "I64", + ], # Unsupported: } PEAK_OPS_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8", "I32", "I64"] -MFMA_DATATYPES = ["FP8", "FP16", "BF16", "FP32", "FP64", "I8"] +MFMA_DATATYPES = ["FP4", "FP6", "FP8", "FP16", "BF16", "FP32", "FP64", "I8"] TOP_N = 10 @@ -67,6 +113,7 @@ class AI_Data: total_flops: float valu_flops: float + mfma_flops_f6f4: float mfma_flops_f8: float mfma_flops_f16: float mfma_flops_bf16: float @@ -212,11 +259,11 @@ def calc_ai(mspec, sort_type, ret_df): df = df.sort_values(by=["Kernel_Name"]) df = df.reset_index(drop=True) - total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = mfma_flops_f16 = ( - mfma_iops_i8 - ) = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = L2cache_data = ( - hbm_data - ) = calls = totalDuration = avgDuration = 0.0 + total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = mfma_flops_bf16 = ( + mfma_flops_f16 + ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = L1cache_data = ( + L2cache_data + ) = hbm_data = calls = totalDuration = avgDuration = 0.0 kernelName = "" @@ -273,6 +320,8 @@ def calc_ai(mspec, sort_type, ret_df): ) if "FP8" in supported_dt: total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512 + if ("FP4" in supported_dt) or 
("FP6" in supported_dt): + total_flops += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512 except KeyError: console_debug( "roofline", @@ -313,6 +362,8 @@ def calc_ai(mspec, sort_type, ret_df): try: if "FP8" in supported_dt: mfma_flops_f8 += df["SQ_INSTS_VALU_MFMA_MOPS_F8"][idx] * 512 + if ("FP4" in supported_dt) or ("FP6" in supported_dt): + mfma_flops_f6f4 += df["SQ_INSTS_VALU_MFMA_MOPS_F6F4"][idx] * 512 mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512 mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512 mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512 @@ -413,6 +464,7 @@ def calc_ai(mspec, sort_type, ret_df): calls, total_flops / calls, valu_flops / calls, + mfma_flops_f6f4 / calls, mfma_flops_f8 / calls, mfma_flops_f16 / calls, mfma_flops_bf16 / calls, @@ -432,11 +484,13 @@ def calc_ai(mspec, sort_type, ret_df): kernelName, idx, calls ) ) - total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = ( - mfma_flops_f16 - ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = ( - L1cache_data - ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 + total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = ( + mfma_flops_bf16 + ) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = ( + lds_data + ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = ( + avgDuration + ) = 0.0 if sort_type == "dispatches": myList.append( @@ -445,6 +499,7 @@ def calc_ai(mspec, sort_type, ret_df): calls, total_flops, valu_flops, + mfma_flops_f6f4, mfma_flops_f8, mfma_flops_f16, mfma_flops_bf16, @@ -459,11 +514,13 @@ def calc_ai(mspec, sort_type, ret_df): avgDuration, ) ) - total_flops = valu_flops = mfma_flops_f8 = mfma_flops_bf16 = ( - mfma_flops_f16 - ) = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = lds_data = ( - L1cache_data - ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0 + total_flops = valu_flops = mfma_flops_f6f4 = mfma_flops_f8 = ( + mfma_flops_bf16 
+ ) = mfma_flops_f16 = mfma_iops_i8 = mfma_flops_f32 = mfma_flops_f64 = ( + lds_data + ) = L1cache_data = L2cache_data = hbm_data = calls = totalDuration = ( + avgDuration + ) = 0.0 myList.sort(key=lambda x: x.totalDuration, reverse=True) diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 index 50ff00e825..fbfeaf3fb0 100755 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 and b/projects/rocprofiler-compute/src/utils/rooflines/roofline-rhel8-rocm6 differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 index e890902115..b799c1702e 100755 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 and b/projects/rocprofiler-compute/src/utils/rooflines/roofline-sles15sp6-rocm6 differ diff --git a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 b/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 index 1f1d57b7f1..5a5ec7faac 100755 Binary files a/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 and b/projects/rocprofiler-compute/src/utils/rooflines/roofline-ubuntu22_04-rocm6 differ