diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 05cd26f8db..aeeea7f0bd 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,12 @@
 ## How to fork from us
 
-To keep our development fast and conflict free, we recommend you to [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `develop` branch in your private repository.
+To keep our development fast and conflict-free, we recommend that you [fork](https://github.com/AMDResearch/omniperf/fork) our repository and start your work from our `dev` branch in your private repository.
 
 Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original upstream repository, add it as another remote.
 
 ```
 git remote add mainline https://github.com/AMDResearch/omniperf.git
-git checkout develop
+git checkout dev
 ```
 
 As always in git, start a new branch with
@@ -31,7 +31,7 @@ and apply your changes there.
 
 - Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.
 
-- Ensure the PR is based on the `develop` branch of the Omniperf GitHub repository.
+- Ensure the PR is based on the `dev` branch of the Omniperf GitHub repository.
 
 - Omniperf requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the project's [Developer's Certificate of Origin](https://developercertificate.org/) and compatibility with the project [LICENSE](https://github.com/AMDResearch/omniperf/blob/main/LICENSE):
 
diff --git a/src/omniperf b/src/omniperf
index dd1679afe9..72839cf0a1 100755
--- a/src/omniperf
+++ b/src/omniperf
@@ -715,7 +715,7 @@ def main():
             # Setup prerequisits for roofline
             roof_setup(args, my_parser, VER)
             # Generate roofline
-            roofline_only(args.path, args.device, args.sort, args.mem_level, args.verbose)
+            roofline_only(args.path, args.device, args.sort, args.mem_level, args.kernel_names, args.verbose)
 
         # Profile only
         else:
diff --git a/src/omniperf_analyze/omniperf_analyze.py b/src/omniperf_analyze/omniperf_analyze.py
index 368e5c50c5..ebd6a92f04 100644
--- a/src/omniperf_analyze/omniperf_analyze.py
+++ b/src/omniperf_analyze/omniperf_analyze.py
@@ -212,7 +212,7 @@ def run_cli(args, runs):
         )
 
 
-def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
+def roofline_only(path_to_dir, dev_id, sort_type, mem_level, kernel_names, verbose):
     import pandas as pd
     from collections import OrderedDict
 
@@ -235,6 +235,7 @@ def roofline_only(path_to_dir, dev_id, sort_type, mem_level, verbose):
         dev_id,  # [Optional] Specify device id to collect roofline info from
         sort_type,  # [Optional] Sort AI by top kernels or dispatches
         mem_level,  # [Optional] Toggle particular level(s) of memory hierarchy
+        kernel_names,  # [Optional] Toggle overlay of kernel names in plot
         True,  # [Optional] Generate a standalone roofline analysis
     )
 
diff --git a/src/omniperf_analyze/utils/gui_components/header.py b/src/omniperf_analyze/utils/gui_components/header.py
index 3dcf34be69..dbb89982a2 100644
--- a/src/omniperf_analyze/utils/gui_components/header.py
+++ b/src/omniperf_analyze/utils/gui_components/header.py
@@ -104,7 +104,7 @@ def get_header(raw_pmc, input_filters, kernel_names):
                                             dbc.DropdownMenuItem("Cache", header=True),
                                             dbc.DropdownMenuItem(
                                                 "Local Data Share (LDS)",
-                                                href="#local_data_sharelds",
+                                                href="#local_data_share_lds",
                                                 external_link=True,
                                             ),
                                             dbc.DropdownMenuItem(
diff --git a/src/omniperf_analyze/utils/gui_components/roofline.py b/src/omniperf_analyze/utils/gui_components/roofline.py
index 8fb6e36a74..0d9e5826b3 100644
--- a/src/omniperf_analyze/utils/gui_components/roofline.py
+++ b/src/omniperf_analyze/utils/gui_components/roofline.py
@@ -25,6 +25,7 @@
 from omniperf_analyze.utils import roofline_calc
 
 import time
+import sys
 import numpy as np
 from dash import html, dash_table
 
@@ -32,6 +33,9 @@ from dash import dcc
 import plotly.graph_objects as go
 
 
+SYMBOLS = [0, 1, 2, 3, 4, 5, 13, 17, 18, 20]
+
+
 def to_int(a):
     if str(type(a)) == "<class 'NoneType'>":
         return np.nan
@@ -39,7 +43,9 @@ def to_int(a):
         return int(a)
 
 
-def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=None):
+def generate_plots(
+    roof_info, ai_data, mem_level, is_standalone, kernel_names, verbose, fig=None
+):
     if fig is None:
         fig = go.Figure()
     plotMode = "lines+text" if is_standalone else "lines"
@@ -120,6 +126,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                 y=ai_data["ai_l1"][1],
                 name="ai_l1",
                 mode="markers",
+                marker={"color": "#00CC96"},
+                marker_symbol=SYMBOLS if kernel_names else None,
             )
         )
         fig.add_trace(
@@ -128,6 +136,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                 y=ai_data["ai_l2"][1],
                 name="ai_l2",
                 mode="markers",
+                marker={"color": "#EF553B"},
+                marker_symbol=SYMBOLS if kernel_names else None,
             )
         )
         fig.add_trace(
@@ -136,6 +146,8 @@ def generate_plots(roof_info, ai_data, mem_level, is_standalone, verbose, fig=No
                 y=ai_data["ai_hbm"][1],
                 name="ai_hbm",
                 mode="markers",
+                marker={"color": "#636EFA"},
+                marker_symbol=SYMBOLS if kernel_names else None,
             )
         )
 
@@ -158,8 +170,13 @@ def get_roofline(
     dev_id=None,
     sort_type="kernels",
     mem_level="ALL",
+    kernel_names=False,
     is_standalone=False,
 ):
+    if kernel_names and (not is_standalone):
+        print("ERROR: --roof-only is required for --kernel-names")
+        sys.exit(1)
+
     # Roofline settings
     fp32_details = {
         "path": path_to_dir,
@@ -185,11 +202,33 @@ def get_roofline(
             print(i, "->", ai_data[i])
         print("\n")
 
-    fp32_fig = generate_plots(fp32_details, ai_data, mem_level, is_standalone, verbose)
-    fp16_fig = generate_plots(fp16_details, ai_data, mem_level, is_standalone, verbose)
-    ml_combo_fig = generate_plots(
-        int8_details, ai_data, mem_level, is_standalone, verbose, fp16_fig
+    fp32_fig = generate_plots(
+        fp32_details, ai_data, mem_level, is_standalone, kernel_names, verbose
     )
+    fp16_fig = generate_plots(
+        fp16_details, ai_data, mem_level, is_standalone, kernel_names, verbose
+    )
+    ml_combo_fig = generate_plots(
+        int8_details, ai_data, mem_level, is_standalone, kernel_names, verbose, fp16_fig
+    )
+    legend = go.Figure(
+        go.Scatter(
+            mode="markers",
+            x=[0] * 10,
+            y=ai_data["kernelNames"],
+            marker_symbol=SYMBOLS,
+            marker_size=15,
+        )
+    )
+    legend.update_layout(
+        title="Kernel Names and Markers",
+        margin=dict(b=0, r=0),
+        xaxis_range=[-1, 1],
+        xaxis_side="top",
+        height=400,
+        width=1000,
+    )
+    legend.update_xaxes(dtick=1)
 
     if is_standalone:
         dev_id = "ALL" if dev_id == -1 else str(dev_id)
@@ -198,12 +237,17 @@ def get_roofline(
         ml_combo_fig.write_image(
             path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
         )
+        if kernel_names:
+            # only save a legend if kernel_names option is toggled
+            legend.write_image(path_to_dir + "/kernelName_legend.pdf")
         time.sleep(1)
         # Re-save to remove loading MathJax pop up
         fp32_fig.write_image(path_to_dir + "/empirRoof_gpu-{}_fp32.pdf".format(dev_id))
         ml_combo_fig.write_image(
             path_to_dir + "/empirRoof_gpu-{}_fp8_fp16.pdf".format(dev_id)
         )
+        if kernel_names:
+            legend.write_image(path_to_dir + "/kernelName_legend.pdf")
         print("Empirical Roofline PDFs saved!")
     else:
         return html.Section(
diff --git a/src/omniperf_analyze/utils/parser.py b/src/omniperf_analyze/utils/parser.py
index cddf4960a2..ace633e35a 100644
--- a/src/omniperf_analyze/utils/parser.py
+++ b/src/omniperf_analyze/utils/parser.py
@@ -27,7 +27,6 @@ import sys
 import astunparse
 import re
 import os
-from matplotlib.pyplot import axis
 import pandas as pd
 import numpy as np
 from tabulate import tabulate
diff --git a/src/omniperf_analyze/utils/roofline_calc.py b/src/omniperf_analyze/utils/roofline_calc.py
index 987b6694de..2750052337 100644
--- a/src/omniperf_analyze/utils/roofline_calc.py
+++ b/src/omniperf_analyze/utils/roofline_calc.py
@@ -44,6 +44,8 @@ FONT_WEIGHT = "bold"
 
 SUPPORTED_SOC = ["mi200"]
 
+TOP_N = 10
+
 
 ################################################
 # Helper funcs
@@ -208,17 +210,146 @@ def plot_application(sortType, ret_df, verbose):
     kernelName = ""
 
     myList = []
-    for index, row in df.iterrows():
+    at_end = False
+    next_kernelName = ""
+
+    for idx in df.index:
         # CASE: Top kernels
         # Calculate + append AI data if
         # a) current KernelName is different than previous OR
         # b) We've reached the end of list
-        if sortType == "kernels" and (
-            (row["KernelName"] != kernelName and kernelName != "")
-            or index == df.shape[0] - 1
-        ):
-            if df.shape[0] - 1 == index:
-                calls += 1
+        if idx + 1 == df.shape[0]:
+            at_end = True
+        else:
+            next_kernelName = df["KernelName"][idx + 1]
+
+        kernelName = df["KernelName"][idx]
+        try:
+            total_flops += (
+                (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F16"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F16"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F16"][idx]
+                    )
+                )
+                + (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F32"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F32"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F32"][idx]
+                    )
+                )
+                + (
+                    64
+                    * (
+                        df["SQ_INSTS_VALU_ADD_F64"][idx]
+                        + df["SQ_INSTS_VALU_MUL_F64"][idx]
+                        + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+                        + df["SQ_INSTS_VALU_TRANS_F64"][idx]
+                    )
+                )
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512)
+                + (df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512)
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped total_flops at index {}".format(kernelName[:35], idx))
+            pass
+        try:
+            valu_flops += (
+                64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F16"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F16"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F16"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F16"][idx]
+                )
+                + 64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F32"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F32"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F32"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F32"][idx]
+                )
+                + 64
+                * (
+                    df["SQ_INSTS_VALU_ADD_F64"][idx]
+                    + df["SQ_INSTS_VALU_MUL_F64"][idx]
+                    + (2 * df["SQ_INSTS_VALU_FMA_F64"][idx])
+                    + df["SQ_INSTS_VALU_TRANS_F64"][idx]
+                )
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped valu_flops at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            mfma_flops_f16 += df["SQ_INSTS_VALU_MFMA_MOPS_F16"][idx] * 512
+            mfma_flops_bf16 += df["SQ_INSTS_VALU_MFMA_MOPS_BF16"][idx] * 512
+            mfma_flops_f32 += df["SQ_INSTS_VALU_MFMA_MOPS_F32"][idx] * 512
+            mfma_flops_f64 += df["SQ_INSTS_VALU_MFMA_MOPS_F64"][idx] * 512
+            mfma_iops_i8 += df["SQ_INSTS_VALU_MFMA_MOPS_I8"][idx] * 512
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped mfma ops at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            lds_data += (
+                (df["SQ_LDS_IDX_ACTIVE"][idx] - df["SQ_LDS_BANK_CONFLICT"][idx])
+                * 4
+                * L2_BANKS
+            )  # L2_BANKS = 32 (since assuming mi200)
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped lds_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            L1cache_data += df["TCP_TOTAL_CACHE_ACCESSES_sum"][idx] * 64
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped L1cache_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        try:
+            L2cache_data += (
+                df["TCP_TCC_WRITE_REQ_sum"][idx] * 64
+                + df["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"][idx] * 64
+                + df["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"][idx] * 64
+                + df["TCP_TCC_READ_REQ_sum"][idx] * 64
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped L2cache_data at index {}".format(kernelName[:35], idx))
+            pass
+        try:
+            hbm_data += (
+                (df["TCC_EA_RDREQ_32B_sum"][idx] * 32)
+                + ((df["TCC_EA_RDREQ_sum"][idx] - df["TCC_EA_RDREQ_32B_sum"][idx]) * 64)
+                + (df["TCC_EA_WRREQ_64B_sum"][idx] * 64)
+                + ((df["TCC_EA_WRREQ_sum"][idx] - df["TCC_EA_WRREQ_64B_sum"][idx]) * 32)
+            )
+        except KeyError:
+            if verbose >= 3:
+                print("{}: Skipped hbm_data at index {}".format(kernelName[:35], idx))
+            pass
+
+        totalDuration += df["EndNs"][idx] - df["BeginNs"][idx]
+
+        avgDuration += df["EndNs"][idx] - df["BeginNs"][idx]
+
+        calls += 1
+
+        if sortType == "kernels" and (at_end == True or (kernelName != next_kernelName)):
             myList.append(
                 AI_Data(
                     kernelName,
@@ -241,7 +372,7 @@ def plot_application(sortType, ret_df, verbose):
             if verbose >= 2:
                 print(
                     "Just added {} to AI_Data at index {}. # of calls: {}".format(
-                        kernelName, index, calls
+                        kernelName, idx, calls
                     )
                 )
             total_flops = (
@@ -262,129 +393,6 @@ def plot_application(sortType, ret_df, verbose):
                 L1cache_data
             ) = L2cache_data = hbm_data = calls = totalDuration = avgDuration = 0.0
 
-        kernelName = row["KernelName"]
-        try:
-            total_flops += (
-                (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F16"]
-                        + row["SQ_INSTS_VALU_MUL_F16"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                        + row["SQ_INSTS_VALU_TRANS_F16"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F32"]
-                        + row["SQ_INSTS_VALU_MUL_F32"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                        + row["SQ_INSTS_VALU_TRANS_F32"]
-                    )
-                )
-                + (
-                    64
-                    * (
-                        row["SQ_INSTS_VALU_ADD_F64"]
-                        + row["SQ_INSTS_VALU_MUL_F64"]
-                        + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                        + row["SQ_INSTS_VALU_TRANS_F64"]
-                    )
-                )
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512)
-                + (row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped total_flops at index {}".format(index))
-            pass
-        try:
-            valu_flops += (
-                64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F16"]
-                    + row["SQ_INSTS_VALU_MUL_F16"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F16"])
-                    + row["SQ_INSTS_VALU_TRANS_F16"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F32"]
-                    + row["SQ_INSTS_VALU_MUL_F32"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F32"])
-                    + row["SQ_INSTS_VALU_TRANS_F32"]
-                )
-                + 64
-                * (
-                    row["SQ_INSTS_VALU_ADD_F64"]
-                    + row["SQ_INSTS_VALU_MUL_F64"]
-                    + (2 * row["SQ_INSTS_VALU_FMA_F64"])
-                    + row["SQ_INSTS_VALU_TRANS_F64"]
-                )
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped valu_flops at index {}".format(index))
-            pass
-
-        try:
-            mfma_flops_f16 += row["SQ_INSTS_VALU_MFMA_MOPS_F16"] * 512
-            mfma_flops_bf16 += row["SQ_INSTS_VALU_MFMA_MOPS_BF16"] * 512
-            mfma_flops_f32 += row["SQ_INSTS_VALU_MFMA_MOPS_F32"] * 512
-            mfma_flops_f64 += row["SQ_INSTS_VALU_MFMA_MOPS_F64"] * 512
-            mfma_iops_i8 += row["SQ_INSTS_VALU_MFMA_MOPS_I8"] * 512
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped mfma ops at index {}".format(index))
-            pass
-
-        try:
-            lds_data += (
-                (row["SQ_LDS_IDX_ACTIVE"] - row["SQ_LDS_BANK_CONFLICT"]) * 4 * L2_BANKS
-            )  # L2_BANKS = 32 (since assuming mi200)
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped lds_data at index {}".format(index))
-            pass
-
-        try:
-            L1cache_data += row["TCP_TOTAL_CACHE_ACCESSES_sum"] * 64
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L1cache_data at index {}".format(index))
-            pass
-
-        try:
-            L2cache_data += (
-                row["TCP_TCC_WRITE_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITH_RET_REQ_sum"] * 64
-                + row["TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"] * 64
-                + row["TCP_TCC_READ_REQ_sum"] * 64
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped L2cache_data at index {}".format(index))
-            pass
-        try:
-            hbm_data += (
-                (row["TCC_EA_RDREQ_32B_sum"] * 32)
-                + ((row["TCC_EA_RDREQ_sum"] - row["TCC_EA_RDREQ_32B_sum"]) * 64)
-                + (row["TCC_EA_WRREQ_64B_sum"] * 64)
-                + ((row["TCC_EA_WRREQ_sum"] - row["TCC_EA_WRREQ_64B_sum"]) * 32)
-            )
-        except KeyError:
-            if verbose >= 2:
-                print("Skipped hbm_data at index {}".format(index))
-            pass
-
-        totalDuration += row["EndNs"] - row["BeginNs"]
-
-        avgDuration += row["EndNs"] - row["BeginNs"]
-
-        calls += 1
         if sortType == "dispatches":
             myList.append(
                 AI_Data(
@@ -428,9 +436,11 @@ def plot_application(sortType, ret_df, verbose):
     # print("Top 5 intensities ('{}')...".format(roof_details["sort"]))
     intensities = {"ai_l1": [], "ai_l2": [], "ai_hbm": []}
     curr_perf = []
+    kernelNames = []
     i = 0
     # Create list of top 5 intensities
-    while i <= 9 and i != len(myList):
+    while i < TOP_N and i != len(myList):
+        kernelNames.append(myList[i].KernelName)
         intensities["ai_l1"].append(
             myList[i].total_flops / myList[i].L1cache_data
         ) if myList[i].L1cache_data else intensities["ai_l1"].append(0)
@@ -470,6 +480,9 @@ def plot_application(sortType, ret_df, verbose):
         intensityPoints[i].append(x)
         intensityPoints[i].append(y)
 
+    # Add an entry for kernel names
+    intensityPoints["kernelNames"] = kernelNames
+
     return intensityPoints
 
 
diff --git a/src/parser.py b/src/parser.py
index b305b60187..061d85d438 100644
--- a/src/parser.py
+++ b/src/parser.py
@@ -234,6 +234,13 @@ def parse(my_parser):
         type=int,
         help="\t\t\tGPU device ID. (DEFAULT: ALL)",
     )
+    roofline_group.add_argument(
+        "--kernel-names",
+        required=False,
+        default=False,
+        action="store_true",
+        help="\t\t\tInclude kernel names in roofline plot.",
+    )
     # roofline_group.add_argument('-w', '--workgroups', required=False, default=-1, type=int, help="\t\t\tNumber of kernel workgroups (DEFAULT: 1024)")
     # roofline_group.add_argument('--wsize', required=False, default=-1, type=int, help="\t\t\tWorkgroup size (DEFAULT: 256)")
     # roofline_group.add_argument('--dataset', required=False, default = -1, type=int, help="\t\t\tDataset size (DEFAULT: 536M)")