From ae8f72fa797f6eb4d45e94157a432c56e047e36b Mon Sep 17 00:00:00 2001
From: vedithal-amd <Vignesh.Edithal@amd.com>
Date: Tue, 18 Nov 2025 23:34:38 -0500
Subject: [PATCH] [rocprofiler-compute] Use native tool for counter collection
 (#1212)

* Use native tool for counter collection

* Add native counter collection tool which uses rocprofiler-sdk C++
  library public API to get counter collection data
    * This is enabled by default, unless --no-native-tool option is
      provided or ROCPROF=rocprofv3 env. var. is provided
    * This tool is only supported for ROCm version >=7.x.x
    * This tool is not supported for attach/detach scenario
* Build native tool shared object during build time
* If using rocprof-compute without building then runtime compilation of
  native tool shared object is performed
* rocprofiler-sdk tools is still used for services other than counter
  collection and data collected by native tool is merged into the
  rocpd/csv output of rocprofiler-sdk tool

* Make `rocpd` choice the default choice for `--format-rocprof-output`
  option
    * If `rocpd` public API from rocprofiler-sdk library is not present,
      then fallback to `csv` choice
    * In this case only `pmc_perf.csv` is written in workload folder
      instead of multiple `csv` files for each profiling run
* Remove `json` choice from `--format-rocprof-output` option since it
  functions identical to `csv` option

* Rename option `--rocprofiler-sdk-library-path` to
  `--rocprofiler-sdk-tool-path` since we LD_PRELOAD the
  rocprofiler-sdk tool shared object and not the rocprofiler-sdk library
  shared object

* Fix the meaning of `--dispatch` option in `profile` mode to mention
  dispatch iteration filtering instead of dispatch id filtering
    * --dispatch option in analyze mode does dispatch id filtering

* Move standalone binary creation logic from cmake file to docker file

* fix native counter collection tool during attach/detach

* improve logging

* fix attach detach with native tool

* fix attach detach with native tool

* do not support attach/detach in native tool

* Update changelog

* add standalone binary creation functionality in cmake

* address review comments

* address review comments

* fix formatting

* address review comments

* Adding paths for cmake to search. Also updated min. cmake requirement to 3.21 as this was when hip was supported.

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* Update hip compiler ID check, sometimes comes up as Clang, sometimes ROCMClang- depends on setup.
Updated formatting.

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* RHEL8.10 unable to compile due to defaulting to old c++ version, need to force c++17

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* Updating changelog per docs team recommendations

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* Apply suggestions from code review to changelog

Co-authored-by: Pratik Basyal <pratik.basyal@amd.com>

* Do not require HIP compiler to build native counter collection tool

* fix cmake

* gersemi formatting on latest cmake change

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* ex ci updated dependencies to include rocprofiler-sdk, but cmake was still not capturing the path- there was a commit that added to the cmake_prefix_path entry that specified rocprof-sdk's cmake location but was too specific for the search paths in find_package's config mode.
removing the cmake_prefix_path var and adding hints to find_package call instead, and specifying config mode so it knows how to construct the search paths

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* gersemi run for formatting

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* Still need prefix path, should not have been removed in last commit but does need to be shortened to just the rocm path to allow for find_package config mode to do the job

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* include cstdint for uint32_t

* Run formatting on helper.cpp

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* Remove rocm 7.2 release stuff from version and changelog and handle it in separate pr

* fix version

* fix changelog

* fix changelog

* run ruff formatter

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>

* fix rocprofiler-sdk attach so path

---------

Signed-off-by: Carrie Fallows <Carrie.Fallows@amd.com>
Co-authored-by: Carrie Fallows <Carrie.Fallows@amd.com>
Co-authored-by: Pratik Basyal <pratik.basyal@amd.com>
---
 .../rocprofiler-compute-formatting.yml        |  27 +
 projects/rocprofiler-compute/CHANGELOG.md     |  24 +
 projects/rocprofiler-compute/CMakeLists.txt   |  33 +-
 .../docker/Dockerfile.standalone              |  29 +-
 .../docs/archive/docs-1.x/getting_started.md  |   2 +-
 .../docs/archive/docs-1.x/profiling.md        |   2 +-
 .../docs/archive/docs-2.x/getting_started.md  |   2 +-
 .../docs/archive/docs-2.x/profiling.md        |   2 +-
 .../docs/how-to/profile/mode.rst              |   2 +-
 .../rocprofiler-compute/docs/how-to/use.rst   |   2 +-
 projects/rocprofiler-compute/src/argparser.py |  42 +-
 .../src/lib/CMakeLists.txt                    |  17 +
 .../rocprofiler-compute/src/lib/helper.cpp    | 145 +++++
 .../rocprofiler-compute/src/lib/helper.hpp    |  31 +
 .../src/lib/rocprofiler_compute_tool.cpp      | 613 ++++++++++++++++++
 .../src/rocprof_compute_base.py               |  26 +-
 .../rocprof_compute_profile/profiler_base.py  |  74 ++-
 .../profiler_rocprof_v3.py                    |  10 +-
 .../profiler_rocprofiler_sdk.py               |  58 +-
 .../src/rocprof_compute_soc/soc_base.py       |  72 +-
 .../src/utils/rocpd_data.py                   |  58 ++
 .../rocprofiler-compute/src/utils/utils.py    | 396 ++++++-----
 .../rocprofiler-compute/tests/conftest.py     |  21 +-
 .../tests/test_profile_general.py             | 281 ++++----
 .../rocprofiler-compute/tests/test_utils.py   | 203 ++----
 25 files changed, 1599 insertions(+), 573 deletions(-)
 create mode 100644 projects/rocprofiler-compute/src/lib/CMakeLists.txt
 create mode 100644 projects/rocprofiler-compute/src/lib/helper.cpp
 create mode 100644 projects/rocprofiler-compute/src/lib/helper.hpp
 create mode 100644 projects/rocprofiler-compute/src/lib/rocprofiler_compute_tool.cpp

diff --git a/.github/workflows/rocprofiler-compute-formatting.yml b/.github/workflows/rocprofiler-compute-formatting.yml
index 6042a177c6..92288a7b3a 100644
--- a/.github/workflows/rocprofiler-compute-formatting.yml
+++ b/.github/workflows/rocprofiler-compute-formatting.yml
@@ -80,6 +80,33 @@ jobs:
           exit 1
         fi
 
+  cxx:
+    runs-on: ubuntu-22.04
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        sparse-checkout: projects/rocprofiler-compute
+    - name: Install dependencies
+      working-directory: projects/rocprofiler-compute
+      run: |
+        sudo apt-get update
+        sudo apt-get install -y python3-pip
+        python3 -m pip install clang-format
+    - name: clang-format
+      working-directory: projects/rocprofiler-compute
+      run: |
+        set +e
+        clang-format -i $(find src -type f | egrep '\.(h|hpp|hh|c|cc|cpp)(|\.in)$')
+        if [ $(git diff | wc -l) -gt 0 ]; then
+          echo -e "\nError! cxx code not formatted. Run clang-format...\n"
+          echo -e "\nFiles:\n"
+          git diff --name-only
+          echo -e "\nFull diff:\n"
+          git diff
+          exit 1
+        fi
+
   python-bytecode:
     runs-on: ubuntu-22.04
 
diff --git a/projects/rocprofiler-compute/CHANGELOG.md b/projects/rocprofiler-compute/CHANGELOG.md
index d9ba678114..ec46b6848d 100644
--- a/projects/rocprofiler-compute/CHANGELOG.md
+++ b/projects/rocprofiler-compute/CHANGELOG.md
@@ -4,6 +4,30 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
 
 ## Unreleased
 
+### Added
+
+* Native tool to perform counter collection using ROCprofiler-SDK public API. It is only supported for ROCm version 7.0.0 (and later).
+  * Native tool is now the default for counter collection.
+  * Native tool for counter collection will not be used under the following conditions:
+    * Specific profiler is provided through the ``ROCPROF`` environment variable.
+    * ``--no-native-tool`` option is provided, forcing usage of the default profiler.
+    * When performing a dynamic attach to a process for profiling.
+
+### Changed
+
+* Default output format for the underlying ROCprofiler-SDK tool has been changed from ``csv`` to ``rocpd``.
+  * If the ROCprofiler-SDK ``rocpd`` public library is not available, will fall back to ``csv`` format
+
+* Option ``--rocprofiler-sdk-library-path`` has been changed to ``--rocprofiler-sdk-tool-path`` to better reflect the fact that we provide flexibility in choosing the path to ROCprofiler-SDK tool and not the library.
+
+### Resolved issues
+
+* Fixed the meaning of --dispatch option in profile mode in argparser to convey the fact that it controls which iterations of the kernel to profile and not which dispatch ids to profile.
+
+* The meaning of the --dispatch option in analyze mode is unchanged: it still selects which dispatch ids to analyze.
+
+* Fix the functioning of --dispatch option to act as 1-based index and ensure that correct kernel iterations are being profiled
+
 ## ROCm Compute Profiler 3.4.0 for ROCm 7.2.0
 
 ### Added
diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt
index 30b10dd2cc..6b81527e8b 100644
--- a/projects/rocprofiler-compute/CMakeLists.txt
+++ b/projects/rocprofiler-compute/CMakeLists.txt
@@ -1,4 +1,8 @@
-cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+
+# Set cmake_prefix_path for searching, ROCM_PATH if avail otherwise default to general rocm install path
+set(CMAKE_PREFIX_PATH $ENV{ROCM_PATH} "/opt/rocm/")
+message(STATUS "ROCM_PATH: $ENV{ROCM_PATH}")
 
 if(
     CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR
@@ -36,7 +40,7 @@ string(
 project(
     rocprofiler-compute
     VERSION ${ROCPROFCOMPUTE_VERSION}
-    LANGUAGES C
+    LANGUAGES CXX
     DESCRIPTION
         "A kernel-level profiling tool for machine learning/HPC workloads running on AMD MI GPUs"
     HOMEPAGE_URL
@@ -471,6 +475,11 @@ if(${ENABLE_COVERAGE})
     )
 endif()
 
+# -------------------
+# Setup tool library
+# -------------------
+add_subdirectory(src/lib)
+
 # ---------
 # Install
 # ---------
@@ -563,6 +572,12 @@ install(
     COMPONENT main
 )
 
+#install librocprofiler-compute-tool.so
+install(
+    TARGETS rocprofiler-compute-tool
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/rocprofiler-compute COMPONENT main
+)
+
 # top-level symlink for bin/rocprof-compute
 install(
     CODE
@@ -590,21 +605,25 @@ add_custom_target(
 add_custom_target(
     standalonebinary
     # Change working directory to src
-    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/src
+    WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}
+    # Install nuitka
+    COMMAND ${Python3_EXECUTABLE} -m pip install nuitka
+    # Install patchelf
+    COMMAND ${Python3_EXECUTABLE} -m pip install patchelf
     # Check nuitka
     COMMAND ${Python3_EXECUTABLE} -m pip list | grep -i nuitka > /dev/null 2>&1
     # Check patchelf
     COMMAND ${Python3_EXECUTABLE} -m pip list | grep -i patchelf > /dev/null 2>&1
     # Create VERSION.sha file
-    COMMAND git -C ${PROJECT_SOURCE_DIR} rev-parse HEAD > VERSION.sha
+    COMMAND git rev-parse HEAD > VERSION.sha
     # Build standalone binary
     # NOTE: --no-deployment-flag=self-execution is used to avoid self-execution
-    # and fork
-    # bombs as explained in
+    # and fork bombs as explained in
     # https://nuitka.net/user-documentation/common-issue-solutions.html#fork-bombs-self-execution
     COMMAND
         ${Python3_EXECUTABLE} -m nuitka --mode=onefile --no-deployment-flag=self-execution
         --include-data-files=${PROJECT_SOURCE_DIR}/VERSION*=./ --enable-plugin=no-qt
+        --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp
         --include-package=dash_svg --include-package-data=dash_svg
         --include-package=dash_bootstrap_components
         --include-package-data=dash_bootstrap_components --include-package=plotly
@@ -615,7 +634,7 @@ add_custom_target(
         --include-package-data=rocprof_compute_profile
         --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui
         --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc
-        --include-package=utils --include-package-data=utils rocprof-compute
+        --include-package=utils --include-package-data=utils src/rocprof-compute
     # Remove library rpath from executable
     COMMAND patchelf --remove-rpath rocprof-compute.bin
     # Move to build directory
diff --git a/projects/rocprofiler-compute/docker/Dockerfile.standalone b/projects/rocprofiler-compute/docker/Dockerfile.standalone
index cb1e3b07f4..a2bf836ba5 100644
--- a/projects/rocprofiler-compute/docker/Dockerfile.standalone
+++ b/projects/rocprofiler-compute/docker/Dockerfile.standalone
@@ -1,8 +1,8 @@
 FROM redhat/ubi8:8.10
 
-WORKDIR /app
+WORKDIR /app/projects/rocprofiler-compute
 
-RUN yum install -y curl gcc cmake git
+RUN yum install -y curl git cmake gcc-c++
 
 # Allows running git commands in /app
 RUN git config --global --add safe.directory /app
@@ -14,10 +14,25 @@ RUN yum install -y python39 python39-devel && \
     python3 get-pip.py
 
 CMD ["/bin/bash", "-c", "\
-    cd /app/projects/rocprofiler-compute \
-    && python3 -m pip install -r requirements.txt \
+    python3 -m pip install -r requirements.txt \
     && python3 -m pip install nuitka patchelf \
-    && rm -rf build \
-    && cmake -B build -S . \
-    && make -C build standalonebinary \
+    && git rev-parse HEAD > VERSION.sha \
+    && python3 -m nuitka --mode=onefile --no-deployment-flag=self-execution \
+        --enable-plugin=no-qt \
+        --include-data-files=VERSION*=./ \
+        --include-data-files=src/lib/rocprofiler_compute_tool.cpp=lib/rocprofiler_compute_tool.cpp \
+        --include-package=dash_svg --include-package-data=dash_svg \
+        --include-package=dash_bootstrap_components \
+        --include-package-data=dash_bootstrap_components \
+        --include-package=plotly --include-package-data=plotly \
+        --include-package=kaleido --include-package-data=kaleido \
+        --include-package=rocprof_compute_analyze \
+        --include-package-data=rocprof_compute_analyze \
+        --include-package=rocprof_compute_profile \
+        --include-package-data=rocprof_compute_profile \
+        --include-package=rocprof_compute_tui --include-package-data=rocprof_compute_tui \
+        --include-package=rocprof_compute_soc --include-package-data=rocprof_compute_soc \
+        --include-package=utils --include-package-data=utils \
+        src/rocprof-compute \
+    && patchelf --remove-rpath rocprof-compute.bin \
 "]
diff --git a/projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md
index 1ee28a496a..67633415ad 100644
--- a/projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md
+++ b/projects/rocprofiler-compute/docs/archive/docs-1.x/getting_started.md
@@ -25,7 +25,7 @@
 
     Some common filters include:
 
-    - `-k`/`--kernel` enables filtering kernels by name. `-d`/`--dispatch` enables filtering based on dispatch ID
+    - `-k`/`--kernel` enables filtering kernels by name. `-d`/`--dispatch` enables filtering based on dispatch iteration
     - `-b`/`--ipblocks` enables collects metrics for only the specified (one or more) IP Blocks.
 
     To view available metrics by IP Block you can use the `--list-metrics` argument to view a list of all available metrics organized by IP Block.
diff --git a/projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md b/projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md
index 61827add37..c5835f88a8 100644
--- a/projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md
+++ b/projects/rocprofiler-compute/docs/archive/docs-1.x/profiling.md
@@ -90,7 +90,7 @@ Profile Options:
                                                            SPI
                                                            CPC
                                                            CPF
-  -d  [ ...], --dispatch  [ ...]                        Dispatch ID filtering.
+  -d  [ ...], --dispatch  [ ...]                        Dispatch iteration filtering.
   --no-roof                                             Profile without collecting roofline data.
   -- [ ...]                                             Provide command for profiling after double dash.
 
diff --git a/projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md
index 9c8740de19..87a6f7db50 100644
--- a/projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md
+++ b/projects/rocprofiler-compute/docs/archive/docs-2.x/getting_started.md
@@ -30,7 +30,7 @@
     Some common filters include:
 
     - `-k`/`--kernel` enables filtering kernels by name.
-    - `-d`/`--dispatch` enables filtering based on dispatch ID.
+    - `-d`/`--dispatch` enables filtering based on dispatch iteration.
     - `-b`/`--block` enables collects metrics for only the specified (one or more) hardware component blocks.
 
     To view available metrics by hardware Block you can use the `--list-metrics` argument:
diff --git a/projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md b/projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md
index f79a055b24..948ec85b85 100644
--- a/projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md
+++ b/projects/rocprofiler-compute/docs/archive/docs-2.x/profiling.md
@@ -191,7 +191,7 @@ Filtering Options:
 
 - The `-k` / `--kernel` \<kernel-substr> flag allows for kernel filtering. Usage is equivalent with the current rocProf utility ([see details below](#kernel-filtering)).
 
-- The `-d` / `--dispatch` \<dispatch-id> flag allows for dispatch ID filtering. Usage is equivalent with the current rocProf utility ([see details below](#dispatch-filtering)).
+- The `-d` / `--dispatch` \<dispatch-id> flag allows for dispatch iteration filtering. Usage is equivalent with the current rocProf utility ([see details below](#dispatch-filtering)).
 
 - The `-b` / `--block` \<block-name> flag allows system profiling on one or more selected hardware components to speed up the profiling process ([see details below](#hardware-component-filtering)).
 
diff --git a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
index 4d08311e82..47397da4c1 100644
--- a/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
+++ b/projects/rocprofiler-compute/docs/how-to/profile/mode.rst
@@ -271,7 +271,7 @@ Filtering options
    utility. See :ref:`profiling-kernel-filtering`.
 
 ``-d``, ``--dispatch <dispatch-id>``
-   Allows for dispatch ID filtering. Usage is equivalent with the current
+   Allows for dispatch iteration filtering. Usage is equivalent with the current
    ``rocprof`` utility. See :ref:`profiling-dispatch-filtering`.
 
 ``--set <metric-set>``
diff --git a/projects/rocprofiler-compute/docs/how-to/use.rst b/projects/rocprofiler-compute/docs/how-to/use.rst
index 94d269c6b8..4ab1384562 100644
--- a/projects/rocprofiler-compute/docs/how-to/use.rst
+++ b/projects/rocprofiler-compute/docs/how-to/use.rst
@@ -54,7 +54,7 @@ Common filters to customize data collection include:
    Enables filtering kernels by name.
 
 ``-d``, ``--dispatch``
-   Enables filtering based on dispatch ID.
+   Enables filtering based on dispatch iteration.
 
 ``-b``, ``--block``
    Enables collection metrics for only the specified analysis report blocks.
diff --git a/projects/rocprofiler-compute/src/argparser.py b/projects/rocprofiler-compute/src/argparser.py
index 826841e7ca..5036481093 100644
--- a/projects/rocprofiler-compute/src/argparser.py
+++ b/projects/rocprofiler-compute/src/argparser.py
@@ -178,7 +178,10 @@ Examples:
         metavar="",
         default=None,
         required=False,
-        help="\t\t\tProcess id to be attached for profiling.",
+        help=(
+            "\t\t\tProcess id to be attached for profiling.\n"
+            "\t\t\tImplies --no-native-tool"
+        ),
     )
     profile_group.add_argument(
         "--attach-duration-msec",
@@ -188,9 +191,9 @@ Examples:
         default=None,
         required=False,
         help=(
-            "\t\t\tWhen --attach-pid is used, it specifies the attach duration "
-            "in milliseconds. If not set, detachment occurs when "
-            '"Enter" key is pressed.'
+            "\t\t\tWhen --attach-pid is used, it specifies the attach duration\n"
+            "\t\t\tin milliseconds. If not set, detachment occurs when\n"
+            '\t\t\t"Enter" key is pressed.'
         ),
     )
     profile_group.add_argument(
@@ -255,7 +258,10 @@ Examples:
         nargs="+",
         dest="dispatch",
         required=False,
-        help="\t\t\tDispatch ID filtering.",
+        help=(
+            "\t\t\tWhich dispatch iterations of the kernel to filter \n"
+            "\t\t\t(e.g. 1 3:5 captures 1st, 3rd, 4th and 5th iterations)."
+        ),
     )
 
     profile_group.add_argument(
@@ -342,8 +348,8 @@ Examples:
         metavar="",
         dest="format_rocprof_output",
         choices=["csv", "rocpd"],
-        default="csv",
-        help="\t\t\tSet the format of output file of rocprof.",
+        default="rocpd",
+        help=("\t\t\tSet the format of output file of rocprof."),
     )
     profile_group.add_argument(
         "--pc-sampling-method",
@@ -370,14 +376,28 @@ Examples:
         ),
     )
     profile_group.add_argument(
-        "--rocprofiler-sdk-library-path",
+        "--rocprofiler-sdk-tool-path",
         type=str,
-        dest="rocprofiler_sdk_library_path",
+        dest="rocprofiler_sdk_tool_path",
         required=False,
         default=str(
-            Path(os.getenv("ROCM_PATH", "/opt/rocm")) / "lib/librocprofiler-sdk.so"
+            Path(os.getenv("ROCM_PATH", "/opt/rocm"))
+            / "lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"
+        ),
+        help="\t\t\tSet the path to rocprofiler-sdk tool.",
+    )
+    profile_group.add_argument(
+        "--no-native-tool",
+        required=False,
+        default=False,
+        action="store_true",
+        help=(
+            "\t\t\tDo not use the native counter collection tool.\n"
+            "\t\t\tNative tool is not used if ROCPROF env. var. is set "
+            "and not equal to rocprofiler-sdk.\n"
+            "\t\t\tNative tool is not used for ROCm version < 7.x.x.\n"
+            "\t\t\tNative tool is not used attach/detach scenario"
         ),
-        help="\t\t\tSet the path to rocprofiler SDK library.",
     )
     profile_group.add_argument(
         "--retain-rocpd-output",
diff --git a/projects/rocprofiler-compute/src/lib/CMakeLists.txt b/projects/rocprofiler-compute/src/lib/CMakeLists.txt
new file mode 100644
index 0000000000..5e4afa7a15
--- /dev/null
+++ b/projects/rocprofiler-compute/src/lib/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Minimum required c++ standard is 17 for compilation
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Set cmake_prefix_path for searching, ROCM_PATH if avail otherwise default to general rocm install path
+set(CMAKE_PREFIX_PATH $ENV{ROCM_PATH} "/opt/rocm/")
+
+find_package(
+    rocprofiler-sdk
+    HINTS $ENV{ROCM_PATH}/lib/cmake /opt/rocm/lib/cmake
+    CONFIG
+    REQUIRED
+)
+
+add_library(rocprofiler-compute-tool SHARED)
+target_sources(rocprofiler-compute-tool PRIVATE rocprofiler_compute_tool.cpp helper.cpp)
+target_link_libraries(rocprofiler-compute-tool PRIVATE rocprofiler-sdk::rocprofiler-sdk)
diff --git a/projects/rocprofiler-compute/src/lib/helper.cpp b/projects/rocprofiler-compute/src/lib/helper.cpp
new file mode 100644
index 0000000000..1f5d05f8b9
--- /dev/null
+++ b/projects/rocprofiler-compute/src/lib/helper.cpp
@@ -0,0 +1,145 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include "helper.hpp"
+
+#include <cstdint>
+#include <cxxabi.h>
+#include <iostream>
+#include <string>
+
+namespace helper_utils {
+
+// Extracts the kernel name from the input string. Scanning from the end, it
+// skips spaces and trailing (...), <...> and [...] groups (nesting is counted
+// per group), then takes the characters back to the preceding ' ' or ':'.
+// For example 'Foo<int, float>::foo(a[], int (int))' -> 'foo'
+std::string truncate_name(std::string_view name) {
+  auto rit = name.rbegin();
+  auto rend = name.rend();
+  uint32_t counter = 0; // nesting depth of the bracket group being skipped
+  char open_token = 0;
+  char close_token = 0;
+  while (rit != rend) {
+    if (counter == 0) {
+      switch (*rit) {
+      case ')':
+        counter = 1;
+        open_token = ')';
+        close_token = '(';
+        break;
+      case '>':
+        counter = 1;
+        open_token = '>';
+        close_token = '<';
+        break;
+      case ']':
+        counter = 1;
+        open_token = ']';
+        close_token = '[';
+        break;
+      case ' ':
+        ++rit;
+        continue;
+      }
+      if (counter == 0)
+        break;
+    } else {
+      if (*rit == open_token)
+        counter++;
+      if (*rit == close_token)
+        counter--;
+    }
+    ++rit;
+  }
+  auto rbeg = rit; // reverse position of the last character of the name
+  while ((rit != rend) && (*rit != ' ') && (*rit != ':'))
+    rit++;
+  return std::string{name.substr(rend - rit, rit - rbeg)};
+}
+
+std::string cxa_demangle(std::string_view _mangled_name, int *_status) {
+  // empty input: set status to -2 and return an empty string
+  if (_mangled_name.empty()) {
+    *_status = -2;
+    return std::string{};
+  }
+
+  auto _demangled_name = std::string{_mangled_name}; // kept on failure
+
+  // PARAMETERS to __cxa_demangle
+  //  mangled_name:
+  //      A NULL-terminated character string containing the name to be
+  //      demangled.
+  //  output_buffer:
+  //      A region of memory, allocated with malloc, of *length bytes, into
+  //      which the demangled name is stored. If output_buffer is not long
+  //      enough, it is expanded using realloc. output_buffer may instead be
+  //      NULL; in that case, the demangled name is placed in a region of memory
+  //      allocated with malloc.
+  //  length:
+  //      If length is non-NULL, the length of the buffer containing the
+  //      demangled name is placed in *length.
+  //  status:
+  //      *status is set to one of the following values
+  size_t _demang_len = 0;
+  char *_demang = abi::__cxa_demangle(_demangled_name.c_str(), nullptr,
+                                      &_demang_len, _status);
+  switch (*_status) {
+  //  0 : The demangling operation succeeded.
+  // -1 : A memory allocation failure occurred.
+  // -2 : mangled_name is not a valid name under the C++ ABI mangling rules.
+  // -3 : One of the arguments is invalid.
+  case 0: {
+    if (_demang)
+      _demangled_name = std::string{_demang};
+    break;
+  }
+  case -1: {
+    std::clog << "[rocprofiler-compute] memory allocation failure occurred "
+                 "demangling "
+              << _demangled_name << std::endl;
+    break;
+  }
+  case -2: {
+    break;
+  }
+  case -3: {
+    std::clog << "[rocprofiler-compute] Invalid argument in: (\""
+              << _demangled_name << "\", nullptr, nullptr, "
+              << static_cast<void *>(_status) << ")" << std::endl;
+    break;
+  }
+  default:
+    break;
+  };
+
+  // if it "demangled" but the length is zero, set the status to -2
+  if (_demang_len == 0 && *_status == 0)
+    *_status = -2;
+
+  // free the malloc'd buffer (free(nullptr) is a no-op on failure)
+  ::free(_demang);
+  return _demangled_name;
+}
+
+} // namespace helper_utils
diff --git a/projects/rocprofiler-compute/src/lib/helper.hpp b/projects/rocprofiler-compute/src/lib/helper.hpp
new file mode 100644
index 0000000000..26a3f17764
--- /dev/null
+++ b/projects/rocprofiler-compute/src/lib/helper.hpp
@@ -0,0 +1,31 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#pragma once
+#include <string>
+
+namespace helper_utils {
+
+std::string truncate_name(std::string_view name);
+std::string cxa_demangle(std::string_view _mangled_name, int *_status);
+
+} // namespace helper_utils
\ No newline at end of file
diff --git a/projects/rocprofiler-compute/src/lib/rocprofiler_compute_tool.cpp b/projects/rocprofiler-compute/src/lib/rocprofiler_compute_tool.cpp
new file mode 100644
index 0000000000..5b2f311846
--- /dev/null
+++ b/projects/rocprofiler-compute/src/lib/rocprofiler_compute_tool.cpp
@@ -0,0 +1,613 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+/*
+This is a native tool for rocprofiler-compute to collect counters data for GPU
+kernel dispatches using the rocprofiler-sdk public API. This C++ tool is
+compiled into a shared object with hipcc/amdclang++ and dynamically links to the
+rocprofiler-sdk library. The shared object is injected using the LD_PRELOAD
+environment variable so that rocprofiler-sdk services can be configured before
+the GPU workload starts executing.
+
+An experimental feature for attach/detach scenarios is also provided.
+
+Code Flow:
+
+1. Entry point - rocprofiler_configure():
+    - Parses ROCPROF environment variables to configure profiling.
+    - Sets up tool metadata and logging.
+    - Returns pointers to tool_init() and tool_fini() functions.
+
+2. Tool Initialization - tool_init():
+    - Creates a profiling context.
+    - Subscribes to dispatch tracing and counting services by providing function
+callbacks.
+    - Starts the profiling context.
+
+3. Kernel registration callback - tool_tracing_callback():
+    - Invoked when a kernel is registered.
+    - Stores the kernel name to kernel id mapping.
+    - Determines which kernel names/ids to target for profiling based on ROCPROF
+environment variables.
+
+4. Kernel dispatch callback - dispatch_callback():
+    - Invoked before a kernel dispatch is enqueued.
+    - Decides whether to profile this dispatch.
+    - If profiling is required, creates or fetches from cache a counter profile
+for the agent and returns a pointer to it.
+    - The counter profile dictates which counters to collect for this dispatch.
+
+5. Kernel dispatch record callback - record_callback():
+    - Invoked after a kernel dispatch is completed.
+    - Receives the collected counter records.
+    - Stores the counter records in tool data for later processing.
+
+6. Tool Finalization - tool_fini():
+    - Called when the application is terminating.
+    - Stops the profiling context.
+    - Processes and writes the collected counter records to the output file.
+    - Cleans up resources.
+*/
+
+#include "helper.hpp"
+
+#include <rocprofiler-sdk/registration.h>
+#include <rocprofiler-sdk/rocprofiler.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <random>
+#include <regex>
+#include <set>
+#include <shared_mutex>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+// Runs `result`; on a non-success status logs it to stderr and throws.
+#define ROCPROFILER_CALL(result, msg)                                          \
+  do {                                                                         \
+    rocprofiler_status_t CHECKSTATUS = (result);                               \
+    if (CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) {                           \
+      std::string status_msg = rocprofiler_get_status_string(CHECKSTATUS);     \
+      std::cerr << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] "     \
+                << msg << " failed with error code " << CHECKSTATUS << ": "    \
+                << status_msg << std::endl;                                    \
+      std::stringstream errmsg{};                                              \
+      errmsg << "[" #result "][" << __FILE__ << ":" << __LINE__ << "] "        \
+             << msg << " failure (" << status_msg << ")";                      \
+      throw std::runtime_error(errmsg.str());                                  \
+    }                                                                          \
+  } while (0)
+
+namespace {
+
+// One collected counter value for one kernel dispatch (source of a CSV row).
+struct counter_info_record_t {
+  uint64_t dispatch_id;     // dispatch_info.dispatch_id of the dispatch
+  uint64_t kernel_id;       // dispatch_info.kernel_id; used for filtering
+  uint64_t counter_id;      // rocprofiler_counter_id_t::handle
+  std::string counter_name; // name looked up in counter_id_name_map
+  double counter_value;     // value reported in the counter record
+};
+
+// State handed to every callback; filled from ROCPROF_* env. vars. at configure
+struct tool_data_t {
+  std::mutex mut{}; // taken around writes to counter_records/target_kernel_ids
+  std::unique_ptr<std::ostream> output_stream{nullptr}; // CSV output file
+  std::unordered_map<uint64_t, std::string> counter_id_name_map{}; // id -> name
+  std::string requested_counters{};          // raw ROCPROF_COUNTERS value
+  std::string kernel_filter_include_regex{}; // raw include-regex env. value
+  std::vector<std::pair<uint64_t, uint64_t>> kernel_filter_ranges{}; // [lo, hi]
+  std::vector<counter_info_record_t> counter_records; // appended per dispatch
+  std::set<uint64_t> target_kernel_ids{};    // kernel ids selected for output
+};
+
+using kernel_symbol_data_t =
+    rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
+// Accessor for the single rocprofiler context used by tool_init/tool_fini.
+rocprofiler_context_id_t &get_client_ctx() {
+  static auto client_ctx = rocprofiler_context_id_t{0};
+  return client_ctx;
+}
+
+/**
+ * Dispatch-record callback: stores every counter record delivered for one
+ * kernel dispatch in tool_data_t::counter_records.
+ *
+ * callback_data_args is the std::unique_ptr<tool_data_t>* registered in
+ * tool_init(). tool->mut is taken once per batch and covers both the counter
+ * name lookup and the append; find() replaces operator[] so that an unknown
+ * counter id no longer inserts an entry into counter_id_name_map.
+ */
+void record_callback(rocprofiler_dispatch_counting_service_data_t dispatch_data,
+                     rocprofiler_counter_record_t *record_data,
+                     size_t record_count,
+                     rocprofiler_user_data_t /* user_data */,
+                     void *callback_data_args) {
+  auto *tool =
+      static_cast<std::unique_ptr<tool_data_t> *>(callback_data_args)->get();
+  const auto &info = dispatch_data.dispatch_info;
+
+  std::lock_guard<std::mutex> lock(tool->mut);
+  for (size_t i = 0; i < record_count; ++i) {
+    rocprofiler_counter_id_t counter_id{};
+    ROCPROFILER_CALL(
+        rocprofiler_query_record_counter_id(record_data[i].id, &counter_id),
+        "query record counter id");
+
+    // Unknown ids get an empty name, which is what operator[] used to yield.
+    auto name = tool->counter_id_name_map.find(counter_id.handle);
+    tool->counter_records.push_back(
+        {info.dispatch_id, info.kernel_id, counter_id.handle,
+         name == tool->counter_id_name_map.end() ? std::string{} : name->second,
+         record_data[i].counter_value});
+  }
+}
+
+/**
+ * Code-object tracing callback, used to learn kernel ids as kernel symbols
+ * are registered. On a LOAD-phase kernel-symbol registration the kernel id is
+ * added to target_kernel_ids when no include regex is configured, or when the
+ * demangled and truncated kernel name matches that regex.
+ */
+void tool_tracing_callback(rocprofiler_callback_tracing_record_t record,
+                           rocprofiler_user_data_t * /*user_data*/,
+                           void *callback_data) {
+  // Guard clause: anything but a kernel symbol registration is ignored.
+  const bool is_kernel_registration =
+      record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD &&
+      record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT &&
+      record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER;
+  if (!is_kernel_registration)
+    return;
+
+  auto *symbol = static_cast<kernel_symbol_data_t *>(record.payload);
+  int status = 0;
+  const std::string name = helper_utils::truncate_name(
+      helper_utils::cxa_demangle(symbol->kernel_name, &status));
+
+  tool_data_t *tool =
+      static_cast<std::unique_ptr<tool_data_t> *>(callback_data)->get();
+  // target_kernel_ids is modified below, so hold the tool mutex from here on.
+  std::lock_guard<std::mutex> guard(tool->mut);
+  const std::string &pattern = tool->kernel_filter_include_regex;
+
+  // No include regex configured: every registered kernel becomes a target.
+  if (pattern.empty()) {
+    tool->target_kernel_ids.insert(symbol->kernel_id);
+    return;
+  }
+  try {
+    const std::regex re(pattern);
+    if (!name.empty() && std::regex_search(name, re))
+      tool->target_kernel_ids.insert(symbol->kernel_id);
+  } catch (const std::regex_error &e) {
+    std::cerr
+        << "[rocprofiler-compute] [" << __FUNCTION__
+        << "] ERROR: Invalid regex in ROCPROF_KERNEL_FILTER_INCLUDE_REGEX: "
+        << pattern << " : " << e.what() << std::endl;
+  }
+}
+
+/**
+ * Decides whether a kernel dispatch should have its counters collected.
+ *
+ * A dispatch is rejected when target_kernel_ids is non-empty and does not
+ * contain kernel_id. Otherwise, when kernel_filter_ranges is non-empty, the
+ * dispatch is accepted only if kernel_iteration lies inside at least one
+ * inclusive [first, second] range. With no ranges configured it is accepted.
+ *
+ * @param tool Tool data holding target_kernel_ids and kernel_filter_ranges.
+ * @param kernel_id The kernel ID of the dispatch.
+ * @param kernel_iteration 1-based count of dispatches seen for this kernel_id.
+ * @return true if the dispatch should be profiled, false otherwise.
+ */
+bool is_targetted_dispatch(const tool_data_t *tool, uint64_t kernel_id,
+                           uint64_t kernel_iteration) {
+  const auto &targets = tool->target_kernel_ids;
+  const auto &ranges = tool->kernel_filter_ranges;
+
+  // Kernel-id filter: an empty target set places no restriction.
+  const bool id_allowed =
+      targets.empty() || targets.find(kernel_id) != targets.end();
+  if (!id_allowed)
+    return false;
+
+  // Iteration filter: an empty range list places no restriction.
+  if (ranges.empty())
+    return true;
+
+  for (const auto &[first, last] : ranges) {
+    if (first <= kernel_iteration && kernel_iteration <= last)
+      return true;
+  }
+  return false;
+}
+
+/**
+ * @brief Creates a counter collection profile for performance monitoring on a
+ * specific GPU agent.
+ *
+ * This function parses the requested counters from the tool configuration,
+ * validates them against the counters supported by the target GPU agent, and
+ * creates a rocprofiler counter configuration for collecting the available
+ * requested counters during dispatch profiling.
+ *
+ * @param tool Pointer to tool data containing the requested counters string and
+ * counter mappings
+ * @param dispatch_data Dispatch counting service data containing agent
+ * information for the target GPU
+ *
+ * @return rocprofiler_counter_config_id_t The counter configuration profile
+ * created for the agent. A failing rocprofiler call does not return: the
+ * ROCPROFILER_CALL macro throws std::runtime_error instead.
+ *
+ * @details
+ * The function performs the following operations:
+ * 1. Parses the requested counters from tool->requested_counters string
+ * (format: "prefix:counter1 counter2 ..."); without a ':' nothing is requested
+ * 2. Queries all counters supported by the specified GPU agent
+ * 3. Filters the supported counters to match only those requested
+ * 4. Records each matched counter's id -> name in tool->counter_id_name_map,
+ * holding tool->mut for the write
+ * 5. Logs a warning listing any requested counters that are not supported by
+ * the agent
+ * 6. Creates and returns a rocprofiler counter configuration for the valid
+ * counters
+ *
+ * @note If no counters are requested or none of the requested counters are
+ * supported, the configuration is created from an empty counter list.
+ */
+rocprofiler_counter_config_id_t create_counter_collection_profile(
+    tool_data_t *tool,
+    rocprofiler_dispatch_counting_service_data_t dispatch_data) {
+  // get counters to collect
+  std::set<std::string> counters_to_collect;
+  const std::string &counters_str = tool->requested_counters;
+  if (!counters_str.empty()) {
+    auto pos = counters_str.find(':');
+    if (pos != std::string::npos) {
+      std::istringstream ss(counters_str.substr(pos + 1));
+      for (std::string token; ss >> token;)
+        counters_to_collect.insert(token);
+    }
+  }
+
+  // Get available counters for this agent
+  std::vector<rocprofiler_counter_id_t> gpu_counters;
+  ROCPROFILER_CALL(
+      rocprofiler_iterate_agent_supported_counters(
+          dispatch_data.dispatch_info.agent_id,
+          [](rocprofiler_agent_id_t, rocprofiler_counter_id_t *counters,
+             size_t num_counters, void *user_data) {
+            std::vector<rocprofiler_counter_id_t> *vec =
+                static_cast<std::vector<rocprofiler_counter_id_t> *>(user_data);
+            for (size_t i = 0; i < num_counters; i++) {
+              vec->push_back(counters[i]);
+            }
+            return ROCPROFILER_STATUS_SUCCESS;
+          },
+          static_cast<void *>(&gpu_counters)),
+      "fetch supported counters");
+
+  // Identify counters requested to collect which are available
+  std::vector<rocprofiler_counter_id_t> collect_counters;
+  std::set<std::string> collected_names;
+  for (auto &counter : gpu_counters) {
+    rocprofiler_counter_info_v0_t info;
+    ROCPROFILER_CALL(rocprofiler_query_counter_info(
+                         counter, ROCPROFILER_COUNTER_INFO_VERSION_0,
+                         static_cast<void *>(&info)),
+                     "query counter info");
+    const std::string name(info.name);
+    if (counters_to_collect.count(name) == 0)
+      continue;
+    collect_counters.push_back(counter);
+    collected_names.insert(name);
+    // the map is shared tool data, so write it under the tool mutex
+    std::lock_guard<std::mutex> lock(tool->mut);
+    tool->counter_id_name_map[counter.handle] = name;
+  }
+
+  // Log unsupported counters in a concise, comma-separated line
+  std::vector<std::string> unsupported_counters;
+  for (const auto &requested : counters_to_collect)
+    if (collected_names.count(requested) == 0)
+      unsupported_counters.push_back(requested);
+  if (!unsupported_counters.empty()) {
+    std::clog << "\033[33m[rocprofiler-compute] [" << __FUNCTION__
+              << "] WARNING: Requested counters not available: ";
+    for (size_t i = 0; i < unsupported_counters.size(); ++i) {
+      std::clog << unsupported_counters[i];
+      if (i + 1 < unsupported_counters.size())
+        std::clog << ", ";
+    }
+    std::clog << "\033[0m" << std::endl;
+  }
+
+  // Create and return collection profile for the counters
+  rocprofiler_counter_config_id_t profile = {.handle = 0};
+  ROCPROFILER_CALL(
+      rocprofiler_create_counter_config(dispatch_data.dispatch_info.agent_id,
+                                        collect_counters.data(),
+                                        collect_counters.size(), &profile),
+      "construct profile cfg");
+  return profile;
+}
+
+/**
+ * Callback from rocprofiler when a kernel dispatch is enqueued into the HSA
+ * queue. rocprofiler_counter_config_id_t* is a return to specify what counters
+ * to collect for this dispatch (dispatch_packet); it is left unset when the
+ * dispatch is filtered out.
+ * The per-kernel iteration counter is advanced for every dispatch, including
+ * dispatches that end up filtered out.
+ * We store profile in a cache to prevent constructing many identical
+ * profiles. We first check the cache to see if we have already constructed a
+ * profile for the agent. If we have, return it. Otherwise, construct a new
+ * profile.
+ */
+void dispatch_callback(
+    rocprofiler_dispatch_counting_service_data_t dispatch_data,
+    rocprofiler_counter_config_id_t *config,
+    rocprofiler_user_data_t * /*user_data*/, void *callback_data_args) {
+
+  auto kernel_id = dispatch_data.dispatch_info.kernel_id;
+
+  // static map of kernel_id to number of dispatches seen so far; the value
+  // handed to the filter is 1-based (first dispatch of a kernel is 1)
+  static std::unordered_map<uint64_t, uint64_t> kernel_id_iteration_map{};
+  static std::shared_mutex kernel_id_iteration_mutex;
+  uint64_t kernel_iteration = 0;
+  {
+    // Acquire unique lock for update and ensure map is updated correctly
+    std::unique_lock<std::shared_mutex> lock(kernel_id_iteration_mutex);
+    kernel_iteration = ++kernel_id_iteration_map[kernel_id];
+  }
+
+  // callback_data_args is the std::unique_ptr<tool_data_t>* from tool_init()
+  auto *tool =
+      static_cast<std::unique_ptr<tool_data_t> *>(callback_data_args)->get();
+
+  // kernel filtering: target_kernel_ids is modified by tool_tracing_callback()
+  // under tool->mut, so it has to be read under that same mutex
+  {
+    std::lock_guard<std::mutex> lock(tool->mut);
+    if (!is_targetted_dispatch(tool, kernel_id, kernel_iteration))
+      return;
+  }
+
+  // per-agent profile cache, guarded by a reader/writer mutex
+  static std::shared_mutex m_mutex = {};
+  static std::unordered_map<uint64_t, rocprofiler_counter_config_id_t>
+      profile_cache = {};
+
+  // check cache for existing profile for this agent
+  auto search_cache = [&]() {
+    if (auto pos =
+            profile_cache.find(dispatch_data.dispatch_info.agent_id.handle);
+        pos != profile_cache.end()) {
+      *config = pos->second;
+      return true;
+    }
+    return false;
+  };
+  {
+    auto rlock = std::shared_lock{m_mutex};
+    if (search_cache())
+      return;
+  }
+
+  // get write lock to update cache
+  auto wlock = std::unique_lock{m_mutex};
+  if (search_cache())
+    return;
+
+  // cache the profile for this agent
+  rocprofiler_counter_config_id_t profile =
+      create_counter_collection_profile(tool, dispatch_data);
+  profile_cache.emplace(dispatch_data.dispatch_info.agent_id.handle, profile);
+  // Return the profile to collect those counters for this dispatch
+  *config = profile;
+}
+
+int tool_init(rocprofiler_client_finalize_t, void *user_data) {
+  std::clog << "[rocprofiler-compute] In tool init\n";
+  ROCPROFILER_CALL(rocprofiler_create_context(&get_client_ctx()),
+                   "context creation");
+  // user_data is passed on as the callback argument of all three callbacks.
+  ROCPROFILER_CALL(rocprofiler_configure_callback_dispatch_counting_service(
+                       get_client_ctx(), dispatch_callback, user_data,
+                       record_callback, user_data),
+                   "setup counting service");
+  ROCPROFILER_CALL(rocprofiler_configure_callback_tracing_service(
+                       get_client_ctx(),
+                       ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, nullptr, 0,
+                       tool_tracing_callback, user_data),
+                   "setup code object tracing service");
+  ROCPROFILER_CALL(rocprofiler_start_context(get_client_ctx()),
+                   "start context");
+  // Only reached when every call succeeded: ROCPROFILER_CALL throws otherwise.
+  return 0;
+}
+
+/**
+ * Writes the collected counter records as CSV rows to the output stream.
+ * Records of non-target kernels are dropped first, because dispatches may
+ * have been profiled before the kernel to be filtered was registered. The drop
+ * also runs when an include regex is set but matched nothing (no rows then).
+ */
+void generate_output(tool_data_t *tool_data) {
+  const auto &targets = tool_data->target_kernel_ids;
+  auto &records = tool_data->counter_records;
+  if (!targets.empty() || !tool_data->kernel_filter_include_regex.empty()) {
+    records.erase(std::remove_if(records.begin(), records.end(),
+                                 [&targets](const counter_info_record_t &r) {
+                                   return targets.count(r.kernel_id) == 0;
+                                 }),
+                  records.end());
+  }
+
+  if (auto &os = tool_data->output_stream) {
+    for (const auto &r : records)
+      *os << r.dispatch_id << ',' << r.counter_id << ',' << r.counter_name
+          << ',' << r.counter_value << '\n';
+    os->flush();
+  }
+}
+
+void tool_fini(void *user_data) {
+  assert(user_data);
+  std::clog << "[rocprofiler-compute] In tool fini\n";
+  rocprofiler_stop_context(get_client_ctx());
+  // user_data is the heap-allocated std::unique_ptr<tool_data_t>; write the
+  // output through it, then release the holder together with the tool data.
+  auto *holder = static_cast<std::unique_ptr<tool_data_t> *>(user_data);
+  generate_output(holder->get());
+  delete holder;
+}
+
+} // namespace
+
+// Builds the tool state from the ROCPROF_* environment: opens the CSV output
+// under ROCPROF_OUTPUT_PATH (throws std::runtime_error if unset/unopenable)
+// and stores the counter list, kernel-name regex and dispatch-range filters.
+std::unique_ptr<tool_data_t> create_tool_data(rocprofiler_client_id_t *id) {
+  auto tool_data = std::make_unique<tool_data_t>();
+
+  // Generate a unique output filename using a random hex string (no libuuid
+  // dependency)
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<uint32_t> dis(0, 0xFFFFFFFF);
+  std::stringstream filename_ss;
+  filename_ss << std::hex << dis(gen);
+  std::string base_filename =
+      "counter_collection_" + filename_ss.str().substr(0, 8) + ".csv";
+
+  // Require ROCPROF_OUTPUT_PATH to be set, otherwise error out
+  const char *output_path = getenv("ROCPROF_OUTPUT_PATH");
+  if (!output_path || !*output_path) {
+    throw std::runtime_error(
+        "ROCPROF_OUTPUT_PATH environment variable must be set");
+  }
+  std::string filename = output_path;
+  if (filename.back() != '/')
+    filename += '/';
+  // Use the generated base filename along with ROCPROF_OUTPUT_PATH
+  filename += base_filename;
+
+  // Set output stream to file
+  auto ofs = std::make_unique<std::ofstream>(filename);
+  if (!ofs->is_open()) {
+    throw std::runtime_error("Failed to open output file: " + filename);
+  }
+  tool_data->output_stream = std::move(ofs);
+  // Write header at the beginning of the file
+  *tool_data->output_stream
+      << "dispatch_id,counter_id,counter_name,counter_value\n";
+  tool_data->output_stream->flush();
+
+  // Write to clog the path of the logging file
+  std::clog << id->name << " [" << __FUNCTION__
+            << "] Logging counter collection to: " << filename << std::endl;
+
+  // Store ROCPROF env. vars. in tool_data
+  // ROCPROF_COUNTERS env. var. is a string like "pmc: counter1 counter2 ..."
+  if (const char *v = getenv("ROCPROF_COUNTERS"))
+    tool_data->requested_counters = v;
+
+  // ROCPROF_KERNEL_FILTER_INCLUDE_REGEX env. var. is a regex string like
+  // kernel_name_1|kernel_name_2|... Used to collect counters only for kernels
+  // with names matching the regex
+  if (const char *v = getenv("ROCPROF_KERNEL_FILTER_INCLUDE_REGEX"))
+    tool_data->kernel_filter_include_regex = v;
+
+  // ROCPROF_KERNEL_FILTER_RANGE env. var. is a string like "[4,7-9,...]"
+  if (const char *v = getenv("ROCPROF_KERNEL_FILTER_RANGE")) {
+    // Remove square brackets at the ends if present
+    std::string v_str = v;
+    if (!v_str.empty() && v_str.front() == '[')
+      v_str.erase(0, 1);
+    if (!v_str.empty() && v_str.back() == ']')
+      v_str.pop_back();
+    // Parse the range string into vector of pairs
+    std::istringstream ss(v_str);
+    for (std::string token; std::getline(ss, token, ',');) {
+      size_t dash_pos = token.find('-');
+      try {
+        if (dash_pos == std::string::npos) {
+          // single number
+          uint64_t num = std::stoull(token);
+          tool_data->kernel_filter_ranges.emplace_back(num, num);
+        } else {
+          // range of numbers
+          uint64_t start = std::stoull(token.substr(0, dash_pos));
+          uint64_t end = std::stoull(token.substr(dash_pos + 1));
+          tool_data->kernel_filter_ranges.emplace_back(start, end);
+        }
+      } catch (const std::logic_error &) {
+        // stoull throws invalid_argument or out_of_range (both logic_error)
+        std::cerr << "[rocprofiler-compute] [" << __FUNCTION__
+                  << "] ERROR: Invalid entry in ROCPROF_KERNEL_FILTER_RANGE: "
+                  << token << std::endl;
+      }
+    }
+  }
+
+  return tool_data;
+}
+
+// Tool entry point: names the client, logs the rocprofiler-sdk version in use
+// and returns the tool_init/tool_fini pair together with the tool data.
+rocprofiler_tool_configure_result_t *
+rocprofiler_configure(uint32_t version, const char *runtime_version,
+                      uint32_t priority, rocprofiler_client_id_t *id) {
+  id->name = "[rocprofiler-compute]";
+
+  // version encodes major * 10000 + minor * 100 + patch
+  const uint32_t ver_major = version / 10000;
+  const uint32_t ver_minor = (version % 10000) / 100;
+  const uint32_t ver_patch = version % 100;
+
+  // banner naming the client, its priority and the SDK version
+  std::stringstream banner;
+  banner << id->name << " [" << __FUNCTION__ << "] (priority=" << priority
+         << ") is using rocprofiler-sdk v" << ver_major << "." << ver_minor
+         << "." << ver_patch << " (" << runtime_version << ")";
+  std::clog << banner.str() << std::endl;
+
+  // read the ROCPROF_* configuration and open the output file
+  std::unique_ptr<tool_data_t> state = create_tool_data(id);
+
+  // The owning pointer is moved to the heap so it can travel through the
+  // void* slot of the result; tool_fini() deletes it again.
+  static rocprofiler_tool_configure_result_t cfg{
+      sizeof(rocprofiler_tool_configure_result_t), &tool_init, &tool_fini,
+      static_cast<void *>(
+          new std::unique_ptr<tool_data_t>(std::move(state)))};
+
+  return &cfg;
+}
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_base.py b/projects/rocprofiler-compute/src/rocprof_compute_base.py
index 26da0157d0..1837ec5535 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_base.py
@@ -151,6 +151,22 @@ class RocProfCompute:
         ) and block:
             console_error("Cannot use --list-available-metrics with --blocks")
 
+        # fallback to csv output format, if rocpd public api not available
+        if (
+            self.__mode == "profile"
+            and self.__args.format_rocprof_output == "rocpd"
+            and not (
+                Path(self.__args.rocprofiler_sdk_tool_path).parents[1]
+                / "librocprofiler-sdk-rocpd.so"
+            ).exists()
+        ):
+            console_warning(
+                "rocpd output format is not supported with the "
+                "current rocprofiler-sdk version. "
+                "Falling back to csv output format."
+            )
+            self.__args.format_rocprof_output = "csv"
+
     @demarcate
     def load_soc_specs(self, sysinfo: Optional[dict] = None) -> None:
         """Load OmniSoC instance for RocProfCompute run"""
@@ -180,16 +196,6 @@ class RocProfCompute:
         )
         self.__args = parser.parse_args()
 
-        if (
-            hasattr(self.__args, "format_rocprof_output")
-            and self.__args.format_rocprof_output != "rocpd"
-        ):
-            console_warning(
-                f"The option --format-rocprof-output currently set to "
-                f"{self.__args.format_rocprof_output} will default to rocpd "
-                "in a future release."
-            )
-
         if self.__args.mode is None:
             if self.__args.specs:
                 print(generate_machine_specs(self.__args))
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
index 1aeabcd653..079a9b3045 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py
@@ -27,6 +27,8 @@ import argparse
 import csv
 import shlex
 import shutil
+import sys
+import tempfile
 import time
 from abc import abstractmethod
 from pathlib import Path
@@ -67,9 +69,7 @@ class RocProfCompute_Base:
     def get_args(self) -> argparse.Namespace:
         return self.__args
 
-    def get_profiler_options(
-        self, fname: str, soc: OmniSoC_Base
-    ) -> Union[list[str], dict[str, Any]]:
+    def get_profiler_options(self) -> Union[list[str], dict[str, Any]]:
         """Fetch any version specific arguments required by profiler"""
         # assume no SoC specific options and return empty list by default
         return []
@@ -415,6 +415,58 @@ class RocProfCompute_Base:
         total_runs = len(input_files)
         total_profiling_time = 0.0
 
+        native_tool_path = None
+        # Native counter collection tool is only compatible with
+        # rocprofiler-sdk public API for ROCm version >= 7.x.x
+        # Do not use native tool in attach
+        # mode until we figure out how multiple tools can attach
+        # TODO: Figure out how multiple tools can attach
+        if (
+            self.__profiler == "rocprofiler-sdk"
+            and not args.no_native_tool
+            and int(self._soc._mspec.rocm_version.split(".")[0]) >= 7
+            and not args.attach_pid
+        ):
+            # Use native counter collection tool
+            native_tool_path = str(
+                Path(sys.argv[0]).resolve().parents[2]
+                / "lib"
+                / "rocprofiler-compute"
+                / "librocprofiler-compute-tool.so"
+            )
+            if not Path(native_tool_path).is_file():
+                # Build native counter collection tool if not exists
+                native_tool_path = str(
+                    Path(
+                        tempfile.mkdtemp(prefix="rocprofiler-compute-tool-", dir="/tmp")
+                    )
+                    / "librocprofiler-compute-tool.so"
+                )
+                link_libraries = ("rocprofiler-sdk",)
+                build_command = (
+                    # Create shared object
+                    "hipcc -shared -fPIC "
+                    # Link with dependent libraries
+                    + " ".join(f"-l{lib}" for lib in link_libraries)
+                    + " "
+                    # Compiler flags
+                    "-std=c++17 -W -Wall -Wextra -Wshadow -O2 "
+                    # rocprofiler sdk library path
+                    f"-L {str(Path(args.rocprofiler_sdk_tool_path).parent.parent)} "
+                    # native tool source files (tool.cpp and helper.cpp)
+                    f"{str(Path(__file__).parent.parent)}/"
+                    "lib/rocprofiler_compute_tool.cpp "
+                    f"{str(Path(__file__).parent.parent)}/"
+                    "lib/helper.cpp "
+                    # temporary shared object for native tool
+                    f"-o {native_tool_path}"
+                )
+                console_debug(f"Building native tool using command: {build_command}")
+                success, output = capture_subprocess_output(shlex.split(build_command))
+                console_debug(f"Build output: {output}")
+                if not success:
+                    console_error("Failed to build native counter collection tool.")
+
         for i, fname in enumerate(input_files):
             run_number = i + 1
 
@@ -465,7 +517,10 @@ class RocProfCompute_Base:
                     console_debug(output)
 
             console_log("profiling", f"Current input file: {fname}")
-            options = self.get_profiler_options(str(fname), self._soc)
+            if self.__profiler == "rocprofiler-sdk":
+                options = self.get_profiler_options(native_tool_path=native_tool_path)
+            else:
+                options = self.get_profiler_options()
             start_time = time.time()
             if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
                 # Only 1-run case is permitted for attach/detach
@@ -502,6 +557,10 @@ class RocProfCompute_Base:
             else:
                 console_error("Profiler not supported")
 
+        # Delete temporary native tool if created
+        if native_tool_path and native_tool_path.startswith("/tmp"):
+            shutil.rmtree(Path(native_tool_path).parent, ignore_errors=True)
+
         # PC sampling data is only collected when block "21" is specified
         if not "21" in args.filter_blocks:
             console_warning(
@@ -514,14 +573,13 @@ class RocProfCompute_Base:
         console_log(f"[Run {total_runs + 1}/{total_runs + 1}][PC sampling profile run]")
 
         start_time = time.time()
+        # No native tool for pc sampling
+        options = self.get_profiler_options()
         pc_sampling_prof(
+            profiler_options=options,
             method=args.pc_sampling_method,
             interval=args.pc_sampling_interval,
             workload_dir=args.path,
-            appcmd=shlex.split(
-                args.remaining
-            ),  # FIXME: the right solution is applying it when argparsing once!
-            rocprofiler_sdk_library_path=args.rocprofiler_sdk_library_path,
         )
         end_time = time.time()
 
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprof_v3.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprof_v3.py
index b796bee1f4..661732ebb1 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprof_v3.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprof_v3.py
@@ -46,7 +46,7 @@ class rocprof_v3_profiler(RocProfCompute_Base):
             or not self.get_args().roof_only
         )
 
-    def get_profiler_options(self, fname: str, soc: OmniSoC_Base) -> list[str]:
+    def get_profiler_options(self) -> list[str]:
         args = self.get_args()
         app_cmd = shlex.split(args.remaining)
 
@@ -90,12 +90,12 @@ class rocprof_v3_profiler(RocProfCompute_Base):
         if args.dispatch:
             for dispatch_id in args.dispatch:
                 if ":" in dispatch_id:
-                    # 4:7 -> 5-7
+                    # 4:7 -> 4-7
                     start, end = dispatch_id.split(":")
-                    dispatch.append(f"{int(start) + 1}-{end}")
+                    dispatch.append(f"{start}-{end}")
                 else:
-                    # 4 -> 5
-                    dispatch.append(f"{int(dispatch_id) + 1}")
+                    # 4 -> 4
+                    dispatch.append(f"{dispatch_id}")
         if dispatch:
             profiling_options.extend([
                 "--kernel-iteration-range",
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py
index ab18e4be54..aa75baeda8 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_rocprofiler_sdk.py
@@ -26,7 +26,7 @@
 import argparse
 import shlex
 from pathlib import Path
-from typing import Union
+from typing import Optional, Union
 
 from rocprof_compute_profile.profiler_base import RocProfCompute_Base
 from rocprof_compute_soc.soc_base import OmniSoC_Base
@@ -48,35 +48,44 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
         )
 
     def get_profiler_options(
-        self, fname: str, soc: OmniSoC_Base
+        self, native_tool_path: Optional[str] = None
     ) -> dict[str, Union[str, list[str]]]:
         args = self.get_args()
         app_cmd = shlex.split(args.remaining)
 
-        rocm_libdir = Path(args.rocprofiler_sdk_library_path).parent
-        rocprofiler_sdk_tool_path = str(
-            rocm_libdir / "rocprofiler-sdk" / "librocprofiler-sdk-tool.so"
-        )
-        rocm_dir = Path(args.rocprofiler_sdk_library_path).parent.parent
-        rocprofiler_attach_tool_path = str(
-            rocm_dir / "lib" / "librocprofiler-sdk-rocattach.so"
-        )
-        ld_preload = [
-            rocprofiler_sdk_tool_path,
-            args.rocprofiler_sdk_library_path,
-            rocprofiler_attach_tool_path,
-        ]
-        options = {
-            "ROCPROFILER_LIBRARY_CTOR": "1",
+        ld_preload = [args.rocprofiler_sdk_tool_path]
+        if native_tool_path:
+            # Use native tool to collect counters
+            ld_preload.append(native_tool_path)
+            options = {"ROCPROF_COUNTER_COLLECTION": "0"}
+            console_log(
+                f"Using native counter collection tool: {str(native_tool_path)}"
+            )
+        else:
+            options = {"ROCPROF_COUNTER_COLLECTION": "1"}
+
+        options.update({
             "LD_PRELOAD": ":".join(ld_preload),
-            "ROCP_TOOL_LIBRARIES": rocprofiler_sdk_tool_path,
-            "LD_LIBRARY_PATH": str(rocm_libdir),
             "ROCPROF_KERNEL_TRACE": "1",
             "ROCPROF_OUTPUT_FORMAT": args.format_rocprof_output,
             "ROCPROF_OUTPUT_PATH": f"{args.path}/out/pmc_1",
-        }
+        })
+
+        # Create folder pointed by ROCPROF_OUTPUT_PATH
+        Path(options["ROCPROF_OUTPUT_PATH"]).mkdir(parents=True, exist_ok=True)
 
         if args.attach_pid:
+            # In attach mode, tools are provided using ROCP_TOOL_LIBRARIES
+            # instead of LD_PRELOAD.
+            options.update({
+                "ROCP_TOOL_LIBRARIES": ":".join(ld_preload),
+            })
+            options.pop("LD_PRELOAD", None)
+
+            rocprofiler_attach_tool_path = str(
+                Path(args.rocprofiler_sdk_tool_path).parent.parent
+                / "librocprofiler-sdk-rocattach.so"
+            )
             options.update({
                 "ROCPROF_ATTACH_TOOL_LIBRARY": rocprofiler_attach_tool_path,
                 "ROCPROF_ATTACH_PID": args.attach_pid,
@@ -108,13 +117,12 @@ class rocprofiler_sdk_profiler(RocProfCompute_Base):
         if args.dispatch:
             for dispatch_id in args.dispatch:
                 if ":" in dispatch_id:
-                    # 4:7 -> 5-7
+                    # 4:7 -> 4-7
                     start, end = dispatch_id.split(":")
-                    dispatch.append(f"{int(start) + 1}-{end}")
+                    dispatch.append(f"{start}-{end}")
                 else:
-                    # 4 -> 5
-                    dispatch.append(f"{int(dispatch_id) + 1}")
-
+                    # 4 -> 4
+                    dispatch.append(f"{dispatch_id}")
         if dispatch:
             options["ROCPROF_KERNEL_FILTER_RANGE"] = f"[{','.join(dispatch)}]"
         if not args.attach_pid:
diff --git a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
index aaa27bb009..ad96b19462 100644
--- a/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
+++ b/projects/rocprofiler-compute/src/rocprof_compute_soc/soc_base.py
@@ -51,9 +51,7 @@ from utils.utils import (
     METRIC_ID_RE,
     add_counter_extra_config_input_yaml,
     convert_metric_id_to_panel_info,
-    detect_rocprof,
     get_panel_alias,
-    get_submodules,
     is_tcc_channel_counter,
     mibench,
     parse_sets_yaml,
@@ -409,55 +407,49 @@ class OmniSoC_Base:
 
     def get_rocprof_supported_counters(self) -> set[str]:
+        """Return the counter names reported by ``rocprofv3.avail``.
+
+        ``ROCPROFILER_METRICS_PATH`` is pointed at the bundled
+        ``rocprof_compute_soc/profile_configs`` directory for the duration of
+        the query and put back to its previous state afterwards, even when
+        the query raises.
+        """
         args = self.get_args()
-        rocprof_cmd = detect_rocprof(args)
-
-        if rocprof_cmd != "rocprofiler-sdk":
-            console_warning(
-                "rocprofv3 interface is deprecated and will be removed "
-                "in a future release."
-            )
-
         rocprof_counters: set[str] = set()
 
-        if not (
-            str(rocprof_cmd).endswith("rocprofv3")
-            or str(rocprof_cmd) == "rocprofiler-sdk"
-        ):
-            console_error(
-                f"Incompatible profiler: {rocprof_cmd}. "
-                "Supported profilers include: "
-                f"{get_submodules('rocprof_compute_profile')}"
-            )
-
-            # Point to counter definition
-            old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH")
-            os.environ["ROCPROFILER_METRICS_PATH"] = str(
-                config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
-            )
-            sys.path.append(
-                str(
-                    Path(self.get_args().rocprofiler_sdk_library_path).parent
-                    / "python3/site-packages"
-                )
-            )
-            from rocprofv3 import avail
-
-            avail.loadLibrary.libname = str(
-                Path(args.rocprofiler_sdk_library_path).parent
-                / "rocprofiler-sdk"
-                / "librocprofv3-list-avail.so"
-            )
-            counters = avail.get_counters()
-            rocprof_counters = {
-                counter.name
-                for counter in counters[list(counters.keys())[0]]
-                if hasattr(counter, "block") or hasattr(counter, "expression")
-            }
-            # Reset env. var.
-            if old_rocprofiler_metrics_path is None:
-                del os.environ["ROCPROFILER_METRICS_PATH"]
-            else:
-                os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path
+        # Point to counter definition
+        old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH")
+        os.environ["ROCPROFILER_METRICS_PATH"] = str(
+            config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
+        )
+        try:
+            # rocprofv3 is imported from <tool dir>/../python3/site-packages
+            site_packages = str(
+                Path(args.rocprofiler_sdk_tool_path).parents[1]
+                / "python3/site-packages"
+            )
+            # Do not grow sys.path on repeated calls
+            if site_packages not in sys.path:
+                sys.path.append(site_packages)
+            from rocprofv3 import avail
+
+            avail.loadLibrary.libname = str(
+                Path(args.rocprofiler_sdk_tool_path).parent
+                / "librocprofv3-list-avail.so"
+            )
+            counters = avail.get_counters()
+            # NOTE(review): only the first entry of `counters` is inspected -
+            # presumably one entry per agent; confirm
+            rocprof_counters = {
+                counter.name
+                for counter in counters[list(counters.keys())[0]]
+                if hasattr(counter, "block") or hasattr(counter, "expression")
+            }
+        finally:
+            # Reset env. var.
+            if old_rocprofiler_metrics_path is None:
+                os.environ.pop("ROCPROFILER_METRICS_PATH", None)
+            else:
+                os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path
 
         return rocprof_counters
 
diff --git a/projects/rocprofiler-compute/src/utils/rocpd_data.py b/projects/rocprofiler-compute/src/utils/rocpd_data.py
index 8eec4a06a3..564073786e 100644
--- a/projects/rocprofiler-compute/src/utils/rocpd_data.py
+++ b/projects/rocprofiler-compute/src/utils/rocpd_data.py
@@ -53,6 +53,12 @@ SELECT
     value as Counter_Value
 FROM counters_collection
 """
+ROCPD_PMC_EVENT_TABLE_NAME_PREFIX = "rocpd_pmc_event_"
+TABLE_NAME_PREFIX_QUERY = (
+    "SELECT name FROM sqlite_master WHERE type='table' "
+    "AND name LIKE '{table_name_prefix}%'"
+)
+INSERT_QUERY = "INSERT INTO {table_name} ({columns}) VALUES ({placeholders})"
 
 
 def convert_db_to_csv(
@@ -120,3 +126,69 @@ def process_rocpd_csv(df: pd.DataFrame) -> pd.DataFrame:
     # Reset dispatch IDs
     df["Dispatch_ID"] = range(len(df))
     return df
+
+
+def update_rocpd_pmc_events(counter_info: pd.DataFrame, rocpd_db_path: str) -> None:
+    """Append counter rows to the pmc_event table of a rocpd database.
+
+    The first table whose name starts with ``rocpd_pmc_event_`` is used; the
+    rest of its name, with underscores turned into dashes, is stored as the
+    ``guid`` of every inserted row.
+
+    Args:
+        counter_info: Frame with ``dispatch_id``, ``counter_id`` and
+            ``counter_value`` columns; one row is inserted per frame row.
+        rocpd_db_path: Path of the SQLite database to update.
+
+    Every failure is reported through ``console_error``.
+    """
+    try:
+        with closing(sqlite3.connect(rocpd_db_path)) as conn:
+            # Get pmc_event table name
+            with closing(
+                conn.execute(
+                    TABLE_NAME_PREFIX_QUERY.format(
+                        table_name_prefix=ROCPD_PMC_EVENT_TABLE_NAME_PREFIX
+                    )
+                )
+            ) as cursor:
+                table_name = cursor.fetchone()
+            if table_name is None:
+                console_error("No pmc_event table found in the rocpd database")
+            table_name = table_name[0]
+
+            # Derive the guid from the table name suffix
+            guid = table_name[len(ROCPD_PMC_EVENT_TABLE_NAME_PREFIX) :].replace(
+                "_", "-"
+            )
+            columns = ("guid", "event_id", "pmc_id", "value")
+            values = list(
+                zip(
+                    # guid
+                    [guid] * len(counter_info),
+                    # event_id
+                    counter_info["dispatch_id"],
+                    # pmc_id
+                    counter_info["counter_id"],
+                    # value
+                    counter_info["counter_value"],
+                )
+            )
+
+            # insert into pmc_event table; `with conn` commits on success
+            # and rolls back if the insert raises
+            with conn:
+                placeholders = ", ".join(["?"] * len(columns))
+                conn.executemany(
+                    INSERT_QUERY.format(
+                        table_name=table_name,
+                        columns=", ".join(columns),
+                        placeholders=placeholders,
+                    ),
+                    values,
+                )
+    # sqlite3 raises sqlite3.Error subclasses, which do not derive from OSError
+    except (sqlite3.Error, OSError) as e:
+        console_error(f"Database error while updating pmc_event table: {e}")
+    except Exception as e:
+        console_error(f"Unexpected error updating pmc_event table: {e}")
diff --git a/projects/rocprofiler-compute/src/utils/utils.py b/projects/rocprofiler-compute/src/utils/utils.py
index dd819a4973..0950c47366 100644
--- a/projects/rocprofiler-compute/src/utils/utils.py
+++ b/projects/rocprofiler-compute/src/utils/utils.py
@@ -41,6 +41,7 @@ import sys
 import tempfile
 import threading
 import time
+import traceback
 import uuid
 from collections.abc import Generator
 from contextlib import contextmanager
@@ -210,14 +211,14 @@ def detect_rocprof(args: argparse.Namespace) -> str:
 
     # Default is rocprofiler-sdk
     if os.environ.get("ROCPROF", "rocprofiler-sdk") == "rocprofiler-sdk":
-        if not Path(args.rocprofiler_sdk_library_path).exists():
+        if not Path(args.rocprofiler_sdk_tool_path).exists():
             console_error(
-                "Could not find rocprofiler-sdk library at "
-                f"{args.rocprofiler_sdk_library_path}"
+                "Could not find rocprofiler-sdk tool at "
+                f"{args.rocprofiler_sdk_tool_path}"
             )
         rocprof_cmd = "rocprofiler-sdk"
         console_debug(f"rocprof_cmd is {rocprof_cmd}")
-        console_debug(f"rocprofiler_sdk_path is {args.rocprofiler_sdk_library_path}")
+        console_debug(f"rocprofiler_sdk_tool_path is {args.rocprofiler_sdk_tool_path}")
     else:
         # If ROCPROF is not set to rocprofiler-sdk
         rocprof_cmd = os.environ["ROCPROF"]
@@ -705,16 +706,12 @@ def run_prof(
 
     # standard rocprof options
     if rocprof_cmd == "rocprofiler-sdk":
-        options = cast(dict[str, Union[str, list[str]]], profiler_options)
-        options["ROCPROF_COUNTER_COLLECTION"] = "1"
+        options = cast(dict[str, Union[str, list[str]]], profiler_options).copy()
         options["ROCPROF_COUNTERS"] = f"pmc: {' '.join(parse_text(fname))}"
+        options["ROCPROF_AGENT_INDEX"] = "absolute"
     else:
         default_options = ["-i", fname]
         options = default_options + cast(list[str], profiler_options)
-
-    if rocprof_cmd == "rocprofiler-sdk":
-        options["ROCPROF_AGENT_INDEX"] = "absolute"
-    else:
         options = ["-A", "absolute"] + options
 
     new_env = os.environ.copy()
@@ -758,7 +755,6 @@ def run_prof(
     ):
         new_env["ROCPROFILER_INDIVIDUAL_XCC_MODE"] = "1"
 
-    is_timestamps = Path(fname).name == "timestamps.txt"
     time_1 = time.time()
 
     if rocprof_cmd == "rocprofiler-sdk":
@@ -849,6 +845,16 @@ def run_prof(
     results_files: list[str] = []
 
     if format_rocprof_output == "rocpd":
+        # If using native tool for counter collection
+        if (
+            rocprof_cmd == "rocprofiler-sdk"
+            and options["ROCPROF_COUNTER_COLLECTION"] == "0"
+        ):
+            # Update rocpd database with counter csv created by native tool
+            rocpd_data.update_rocpd_pmc_events(
+                pd.read_csv(glob.glob(workload_dir + "/out/pmc_1/*.csv")[0]),
+                glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
+            )
         # Write results_fbase.csv
         rocpd_data.convert_db_to_csv(
             glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
@@ -865,88 +871,95 @@ def run_prof(
         # Remove temp directory
         shutil.rmtree(workload_dir + "/" + "out")
         return
-
-    # rocprofv3 requires additional processing for each process
-    results_files = process_rocprofv3_output(
-        format_rocprof_output, workload_dir, is_timestamps
-    )
-
-    if rocprof_cmd == "rocprofiler-sdk":
-        # TODO: as rocprofv3 --kokkos-trace feature improves,
-        # rocprof-compute should make updates accordingly
-        if "ROCPROF_HIP_RUNTIME_API_TRACE" in options:
-            process_hip_trace_output(workload_dir, fbase)
-    else:
-        if "--kokkos-trace" in options:
+    elif format_rocprof_output == "csv":
+        if rocprof_cmd == "rocprofiler-sdk":
+            # rocprofv3 requires additional processing for each process
+            results_files = process_rocprofv3_output(
+                workload_dir,
+                # counter data collected using native tool
+                using_native_tool=options["ROCPROF_COUNTER_COLLECTION"] == "0",
+            )
             # TODO: as rocprofv3 --kokkos-trace feature improves,
             # rocprof-compute should make updates accordingly
-            process_kokkos_trace_output(workload_dir, fbase)
-        elif "--hip-trace" in options:
-            process_hip_trace_output(workload_dir, fbase)
+            if "ROCPROF_HIP_RUNTIME_API_TRACE" in options:
+                process_hip_trace_output(workload_dir, fbase)
+        else:
+            # rocprofv3 requires additional processing for each process
+            # rocprofv3 cannot use native tool
+            results_files = process_rocprofv3_output(
+                workload_dir, using_native_tool=False
+            )
+            if "--kokkos-trace" in options:
+                # TODO: as rocprofv3 --kokkos-trace feature improves,
+                # rocprof-compute should make updates accordingly
+                process_kokkos_trace_output(workload_dir, fbase)
+            elif "--hip-trace" in options:
+                process_hip_trace_output(workload_dir, fbase)
 
-    # Combine results into single CSV file
-    if results_files:
-        combined_results = pd.concat(
-            [pd.read_csv(f) for f in results_files], ignore_index=True
+        # Combine results into single CSV file
+        if results_files:
+            combined_results = pd.concat(
+                [pd.read_csv(f) for f in results_files], ignore_index=True
+            )
+        else:
+            console_warning(
+                f"Cannot write results for {fbase}.csv due to no counter "
+                "csv files generated."
+            )
+            return
+
+        # Overwrite column to ensure unique IDs.
+        combined_results["Dispatch_ID"] = range(0, len(combined_results))
+
+        combined_results.to_csv(
+            workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False
         )
+
+        if Path(f"{workload_dir}/out").exists():
+            # copy and remove out directory if needed
+            shutil.copyfile(
+                f"{workload_dir}/out/pmc_1/results_{fbase}.csv",
+                f"{workload_dir}/{fbase}.csv",
+            )
+            # Remove temp directory
+            shutil.rmtree(f"{workload_dir}/out")
+
+        # Standardize rocprof headers via overwrite
+        # {<key to remove>: <key to replace>}
+        output_headers = {
+            # ROCm-6.1.0 specific csv headers
+            "KernelName": "Kernel_Name",
+            "Index": "Dispatch_ID",
+            "grd": "Grid_Size",
+            "gpu-id": "GPU_ID",
+            "wgr": "Workgroup_Size",
+            "lds": "LDS_Per_Workgroup",
+            "scr": "Scratch_Per_Workitem",
+            "sgpr": "SGPR",
+            "arch_vgpr": "Arch_VGPR",
+            "accum_vgpr": "Accum_VGPR",
+            "BeginNs": "Start_Timestamp",
+            "EndNs": "End_Timestamp",
+            # ROCm-6.0.0 specific csv headers
+            "GRD": "Grid_Size",
+            "WGR": "Workgroup_Size",
+            "LDS": "LDS_Per_Workgroup",
+            "SCR": "Scratch_Per_Workitem",
+            "ACCUM_VGPR": "Accum_VGPR",
+        }
+        csv_path = Path(workload_dir) / f"{fbase}.csv"
+        df = pd.read_csv(csv_path)
+        df.rename(columns=output_headers, inplace=True)
+        df.to_csv(csv_path, index=False)
     else:
-        console_warning(
-            f"Cannot write results for {fbase}.csv due to no counter "
-            "csv files generated."
-        )
-        return
-
-    # Overwrite column to ensure unique IDs.
-    combined_results["Dispatch_ID"] = range(0, len(combined_results))
-
-    combined_results.to_csv(
-        workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False
-    )
-
-    if Path(f"{workload_dir}/out").exists():
-        # copy and remove out directory if needed
-        shutil.copyfile(
-            f"{workload_dir}/out/pmc_1/results_{fbase}.csv",
-            f"{workload_dir}/{fbase}.csv",
-        )
-        # Remove temp directory
-        shutil.rmtree(f"{workload_dir}/out")
-
-    # Standardize rocprof headers via overwrite
-    # {<key to remove>: <key to replace>}
-    output_headers = {
-        # ROCm-6.1.0 specific csv headers
-        "KernelName": "Kernel_Name",
-        "Index": "Dispatch_ID",
-        "grd": "Grid_Size",
-        "gpu-id": "GPU_ID",
-        "wgr": "Workgroup_Size",
-        "lds": "LDS_Per_Workgroup",
-        "scr": "Scratch_Per_Workitem",
-        "sgpr": "SGPR",
-        "arch_vgpr": "Arch_VGPR",
-        "accum_vgpr": "Accum_VGPR",
-        "BeginNs": "Start_Timestamp",
-        "EndNs": "End_Timestamp",
-        # ROCm-6.0.0 specific csv headers
-        "GRD": "Grid_Size",
-        "WGR": "Workgroup_Size",
-        "LDS": "LDS_Per_Workgroup",
-        "SCR": "Scratch_Per_Workitem",
-        "ACCUM_VGPR": "Accum_VGPR",
-    }
-    csv_path = Path(workload_dir) / f"{fbase}.csv"
-    df = pd.read_csv(csv_path)
-    df.rename(columns=output_headers, inplace=True)
-    df.to_csv(csv_path, index=False)
+        console_error(f"Unknown format_rocprof_output: {format_rocprof_output}")
 
 
 def pc_sampling_prof(
+    profiler_options: Union[list[str], dict[str, Union[str, list[str]]]],
     method: str,
     interval: int,
     workload_dir: str,
-    appcmd: list[str],
-    rocprofiler_sdk_library_path: str,
 ) -> None:
     """
     Run rocprof with pc sampling. Current support v3 only.
@@ -957,19 +970,12 @@ def pc_sampling_prof(
     unit = "time" if method == "host_trap" else "cycles"
 
     if rocprof_cmd == "rocprofiler-sdk":
-        rocm_libdir = str(Path(rocprofiler_sdk_library_path).parent)
-        rocprofiler_sdk_tool_path = str(
-            Path(rocm_libdir) / "rocprofiler-sdk/librocprofiler-sdk-tool.so"
-        )
-        ld_preload = [
-            rocprofiler_sdk_tool_path,
-            rocprofiler_sdk_library_path,
-        ]
-        options = {
-            "ROCPROFILER_LIBRARY_CTOR": "1",
-            "LD_PRELOAD": ":".join(ld_preload),
-            "ROCP_TOOL_LIBRARIES": rocprofiler_sdk_tool_path,
-            "LD_LIBRARY_PATH": rocm_libdir,
+        options = cast(dict[str, Union[str, list[str]]], profiler_options).copy()
+        options.update({
+            # no counter collection for pc sampling
+            "ROCPROF_COUNTER_COLLECTION": "0",
+            # no kernel tracing for pc sampling
+            "ROCPROF_KERNEL_TRACE": "0",
             "ROCPROF_OUTPUT_FORMAT": "csv,json",
             "ROCPROF_OUTPUT_PATH": workload_dir,
             "ROCPROF_OUTPUT_FILE_NAME": "ps_file",
@@ -977,15 +983,15 @@ def pc_sampling_prof(
             "ROCPROF_PC_SAMPLING_UNIT": unit,
             "ROCPROF_PC_SAMPLING_INTERVAL": str(interval),
             "ROCPROF_PC_SAMPLING_METHOD": method,
-            "ROCPROF_KERNEL_TRACE": "1",
-        }
+        })
+        app_cmd = options.pop("APP_CMD") if "APP_CMD" in options else None
         new_env = os.environ.copy()
         for key, value in options.items():
             new_env[key] = value
         console_debug(f"pc sampling rocprof sdk env vars: {new_env}")
-        console_debug(f"pc sampling rocprof sdk user provided command: {appcmd}")
+        console_debug(f"pc sampling rocprof sdk user provided command: {app_cmd}")
         success, output = capture_subprocess_output(
-            appcmd, new_env=new_env, profileMode=True
+            app_cmd, new_env=new_env, profileMode=True
         )
     else:
         options = [
@@ -1005,9 +1011,11 @@ def pc_sampling_prof(
             "-o",
             "ps_file",  # TODO: sync up with the name from source in 2100_.yaml
             "--",
+            cast(str, profiler_options[-1]),  # app command
         ]
-        options.extend(appcmd)
 
+        console_debug(f"rocprof command: {shlex.join([rocprof_cmd] + options)}")
+        # profile the app
         success, output = capture_subprocess_output(
             [rocprof_cmd] + options, new_env=os.environ.copy(), profileMode=True
         )
@@ -1016,72 +1024,121 @@ def pc_sampling_prof(
         console_error("PC sampling failed.")
 
 
-def process_rocprofv3_output(
-    rocprof_output: str, workload_dir: str, is_timestamps: bool
-) -> list[str]:
+def convert_native_counter_collection_csv(workload_dir: str) -> None:
     """
-    rocprofv3 specific output processing.
-    takes care of json or csv formats, for csv format,
-    additional processing is performed.
+    Use native counter collection csv and rocprofiler-sdk kernel
+    trace to write counter collection csv in rocprofiler-sdk format
+    for further processing to pmc_perf.csv file
+
+    Reads the first ``*.csv`` directly under ``<workload_dir>/out/pmc_1``
+    and the first ``*_kernel_trace.csv`` one directory below it. The
+    output is written next to the kernel trace, with ``kernel_trace``
+    replaced by ``counter_collection`` in the file name.
+    """
+    counter_data = pd.read_csv(
+        glob.glob(f"{workload_dir}/out/pmc_1/*.csv")[0], index_col=False
+    )
+    # Sum counter_value over rows sharing (dispatch_id, counter_name)
+    counter_data = counter_data.groupby(
+        ["dispatch_id", "counter_name"], as_index=False
+    ).agg({"counter_value": "sum"})
+    kernel_data_filename = glob.glob(f"{workload_dir}/out/pmc_1/*/*_kernel_trace.csv")[
+        0
+    ]
+    kernel_data = pd.read_csv(kernel_data_filename)
+    # Select the kernel trace row of every counter row once, by position.
+    # NOTE(review): assumes trace row N-1 belongs to dispatch N - confirm
+    dispatch_rows = kernel_data.iloc[counter_data["dispatch_id"] - 1]
+    rocprofv3_counter_data = pd.DataFrame({
+        "Correlation_Id": counter_data["dispatch_id"],
+        "Dispatch_Id": counter_data["dispatch_id"],
+        "Agent_Id": dispatch_rows["Agent_Id"].values,
+        "Queue_Id": dispatch_rows["Queue_Id"].values,
+        # NOTE(review): Process_Id is filled from Thread_Id - confirm
+        "Process_Id": dispatch_rows["Thread_Id"].values,
+        "Thread_Id": dispatch_rows["Thread_Id"].values,
+        # Total sizes are the product of the X, Y and Z extents
+        "Grid_Size": (
+            dispatch_rows[["Grid_Size_X", "Grid_Size_Y", "Grid_Size_Z"]]
+            .prod(axis=1)
+            .values
+        ),
+        "Kernel_Id": dispatch_rows["Kernel_Id"].values,
+        "Kernel_Name": dispatch_rows["Kernel_Name"].values,
+        "Workgroup_Size": (
+            dispatch_rows[
+                ["Workgroup_Size_X", "Workgroup_Size_Y", "Workgroup_Size_Z"]
+            ]
+            .prod(axis=1)
+            .values
+        ),
+        "LDS_Block_Size": dispatch_rows["LDS_Block_Size"].values,
+        "Scratch_Size": dispatch_rows["Scratch_Size"].values,
+        "VGPR_Count": dispatch_rows["VGPR_Count"].values,
+        "Accum_VGPR_Count": dispatch_rows["Accum_VGPR_Count"].values,
+        "SGPR_Count": dispatch_rows["SGPR_Count"].values,
+        "Counter_Name": counter_data["counter_name"],
+        "Counter_Value": counter_data["counter_value"],
+        "Start_Timestamp": dispatch_rows["Start_Timestamp"].values,
+        "End_Timestamp": dispatch_rows["End_Timestamp"].values,
+    })
+    rocprofv3_counter_data.to_csv(
+        kernel_data_filename.replace("kernel_trace", "counter_collection"),
+        index=False,
+    )
+
+
+def process_rocprofv3_output(workload_dir: str, using_native_tool: bool) -> list[str]:
+    """
+    rocprofv3 specific output processing for csv format.
     """
     results_files_csv: list[str] = []
 
-    if rocprof_output == "json":
-        results_files_json = glob.glob(f"{workload_dir}/out/pmc_1/*/*.json")
-
-        for json_file in results_files_json:
-            csv_file = str(Path(json_file).with_suffix(".csv"))
-            v3_json_to_csv(json_file, csv_file)
-        results_files_csv = glob.glob(f"{workload_dir}/out/pmc_1/*/*.csv")
-
-    elif rocprof_output == "csv":
-        counter_info_csvs = glob.glob(
-            f"{workload_dir}/out/pmc_1/*/*_counter_collection.csv"
-        )
-        existing_counter_files_csv = [f for f in counter_info_csvs if Path(f).is_file()]
-
-        if existing_counter_files_csv:
-            for counter_file in existing_counter_files_csv:
-                counter_path = Path(counter_file)
-                current_dir = counter_path.parent
-
-                agent_info_filepath = current_dir / counter_path.name.replace(
-                    "_counter_collection", "_agent_info"
-                )
-
-                if not agent_info_filepath.is_file():
-                    raise ValueError(
-                        f'{counter_file} has no corresponding "agent info" file'
-                    )
-
-                converted_csv_file = current_dir / counter_path.name.replace(
-                    "_counter_collection", "_converted"
-                )
-
-                try:
-                    v3_counter_csv_to_v2_csv(
-                        counter_file, str(agent_info_filepath), str(converted_csv_file)
-                    )
-                except Exception as e:
-                    console_warning(
-                        f"Error converting {counter_file} from v3 to v2 csv: {e}"
-                    )
-                    return []
-
-            results_files_csv = glob.glob(f"{workload_dir}/out/pmc_1/*/*_converted.csv")
-        elif is_timestamps:
-            # when the input is timestamps, we know counter csv file
-            # is not generated and will instead parse kernel trace file
-            results_files_csv = glob.glob(
-                f"{workload_dir}/out/pmc_1/*/*_kernel_trace.csv"
+    if using_native_tool:
+        try:
+            convert_native_counter_collection_csv(workload_dir)
+        except Exception:
+            console_error(
+                "Error converting native counter collection csv.\n"
+                f"Stacktrace:\n{traceback.format_exc()}"
             )
-        else:
-            # when the input is not for timestamps, and counter csv file
-            # is not generated, we assume failed rocprof run and will completely
-            # bypass the file generation and merging for current pmc
-            results_files_csv = []
+
+    counter_info_csvs = glob.glob(
+        f"{workload_dir}/out/pmc_1/*/*_counter_collection.csv"
+    )
+    existing_counter_files_csv = [f for f in counter_info_csvs if Path(f).is_file()]
+
+    if existing_counter_files_csv:
+        for counter_file in existing_counter_files_csv:
+            counter_path = Path(counter_file)
+            current_dir = counter_path.parent
+
+            agent_info_filepath = current_dir / counter_path.name.replace(
+                "_counter_collection", "_agent_info"
+            )
+
+            if not agent_info_filepath.is_file():
+                raise ValueError(
+                    f'{counter_file} has no corresponding "agent info" file'
+                )
+
+            converted_csv_file = current_dir / counter_path.name.replace(
+                "_counter_collection", "_converted"
+            )
+
+            try:
+                v3_counter_csv_to_v2_csv(
+                    counter_file, str(agent_info_filepath), str(converted_csv_file)
+                )
+            except Exception as e:
+                console_warning(
+                    f"Error converting {counter_file} from v3 to v2 csv: {e}"
+                )
+                return []
+
+        results_files_csv = glob.glob(f"{workload_dir}/out/pmc_1/*/*_converted.csv")
     else:
-        console_error("The output file of rocprofv3 can only support json or csv!!!")
+        return []
 
     return results_files_csv
 
diff --git a/projects/rocprofiler-compute/tests/conftest.py b/projects/rocprofiler-compute/tests/conftest.py
index e9f729c658..0c231ce221 100644
--- a/projects/rocprofiler-compute/tests/conftest.py
+++ b/projects/rocprofiler-compute/tests/conftest.py
@@ -51,12 +51,13 @@ def pytest_addoption(parser):
     )
 
     parser.addoption(
-        "--rocprofiler-sdk-library-path",
+        "--rocprofiler-sdk-tool-path",
         type=str,
         default=str(
-            Path(os.getenv("ROCM_PATH", "/opt/rocm")) / "lib/librocprofiler-sdk.so"
+            Path(os.getenv("ROCM_PATH", "/opt/rocm"))
+            / "lib/rocprofiler-sdk/librocprofiler-sdk-tool.so"
         ),
-        help="Path to the rocprofiler-sdk library",
+        help="Path to the rocprofiler-sdk tool",
     )
 
 
@@ -71,11 +72,11 @@ def binary_handler_profile_rocprof_compute(request):
         app_name="app_1",
         attach_detach_para=None,
     ):
-        if request.config.getoption("--rocprofiler-sdk-library-path"):
+        if request.config.getoption("--rocprofiler-sdk-tool-path"):
             options.extend(
                 [
-                    "--rocprofiler-sdk-library-path",
-                    request.config.getoption("--rocprofiler-sdk-library-path"),
+                    "--rocprofiler-sdk-tool-path",
+                    request.config.getoption("--rocprofiler-sdk-tool-path"),
                 ],
             )
         if request.config.getoption("--call-binary"):
@@ -114,7 +115,13 @@ def binary_handler_profile_rocprof_compute(request):
                 assert process.returncode == 0
             return process.returncode
         else:
-            baseline_opts = ["rocprof-compute", "profile", "-n", app_name, "-VVV"]
+            baseline_opts = [
+                "install/bin/rocprof-compute",
+                "profile",
+                "-n",
+                app_name,
+                "-VVV",
+            ]
             if not roof:
                 baseline_opts.append("--no-roof")
 
diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py
index 758b2e9e5e..70fb8ef904 100644
--- a/projects/rocprofiler-compute/tests/test_profile_general.py
+++ b/projects/rocprofiler-compute/tests/test_profile_general.py
@@ -74,99 +74,23 @@ num_devices = 1
 
 attach_detach_interval_msec_no_delay = 10000
 attach_detach_interval_msec_with_delay = 60000
-
 DEFAULT_ABS_DIFF = 15
 DEFAULT_REL_DIFF = 50
 MAX_REOCCURING_COUNT = 28
 
-ALL_CSVS_MI100 = sorted([
-    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
-    "SQC_ICACHE_INFLIGHT_LEVEL.csv",
-    "SQ_IFETCH_LEVEL.csv",
-    "SQ_INST_LEVEL_LDS.csv",
-    "SQ_LEVEL_WAVES.csv",
+CSVS = sorted([
     "pmc_perf.csv",
-    "pmc_perf_0.csv",
-    "pmc_perf_1.csv",
-    "pmc_perf_2.csv",
-    "pmc_perf_3.csv",
-    "pmc_perf_4.csv",
-    "pmc_perf_5.csv",
-    "pmc_perf_6.csv",
-    "sysinfo.csv",
-])
-
-ALL_CSVS_MI200 = sorted([
-    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
-    "SQC_ICACHE_INFLIGHT_LEVEL.csv",
-    "SQ_IFETCH_LEVEL.csv",
-    "SQ_INST_LEVEL_LDS.csv",
-    "SQ_INST_LEVEL_SMEM.csv",
-    "SQ_INST_LEVEL_VMEM.csv",
-    "SQ_LEVEL_WAVES.csv",
-    "pmc_perf.csv",
-    "pmc_perf_0.csv",
-    "pmc_perf_1.csv",
-    "pmc_perf_2.csv",
-    "pmc_perf_3.csv",
-    "pmc_perf_4.csv",
-    "pmc_perf_5.csv",
-    "sysinfo.csv",
-])
-ALL_CSVS_MI300 = sorted([
-    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
-    "SQC_ICACHE_INFLIGHT_LEVEL.csv",
-    "SQ_IFETCH_LEVEL.csv",
-    "SQ_INST_LEVEL_LDS.csv",
-    "SQ_INST_LEVEL_SMEM.csv",
-    "SQ_INST_LEVEL_VMEM.csv",
-    "SQ_LEVEL_WAVES.csv",
-    "pmc_perf.csv",
-    "pmc_perf_0.csv",
-    "pmc_perf_1.csv",
-    "pmc_perf_2.csv",
-    "pmc_perf_3.csv",
-    "pmc_perf_4.csv",
-    "pmc_perf_5.csv",
-    "sysinfo.csv",
-])
-ALL_CSVS_MI350 = sorted([
-    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
-    "SQC_ICACHE_INFLIGHT_LEVEL.csv",
-    "SQ_IFETCH_LEVEL.csv",
-    "SQ_INST_LEVEL_LDS.csv",
-    "SQ_INST_LEVEL_SMEM.csv",
-    "SQ_INST_LEVEL_VMEM.csv",
-    "SQ_LEVEL_WAVES.csv",
-    "pmc_perf.csv",
-    "pmc_perf_0.csv",
-    "pmc_perf_1.csv",
-    "pmc_perf_2.csv",
-    "pmc_perf_3.csv",
-    "pmc_perf_4.csv",
-    "pmc_perf_5.csv",
-    "pmc_perf_6.csv",
-    "pmc_perf_7.csv",
-    "pmc_perf_8.csv",
-    "pmc_perf_9.csv",
-    "pmc_perf_10.csv",
-    "pmc_perf_11.csv",
-    "pmc_perf_12.csv",
     "sysinfo.csv",
 ])
 
 ROOF_ONLY_FILES = sorted([
     "empirRoof_gpu-0_FP32.pdf",
     "pmc_perf.csv",
-    "pmc_perf_0.csv",
-    "pmc_perf_1.csv",
-    "pmc_perf_2.csv",
     "roofline.csv",
     "sysinfo.csv",
 ])
 
 PC_SAMPLING_HOST_TRAP_FILES = sorted([
-    "pmc_perf_0.csv",
     "pmc_perf.csv",
     "ps_file_agent_info.csv",
     "ps_file_kernel_trace.csv",
@@ -176,7 +100,6 @@ PC_SAMPLING_HOST_TRAP_FILES = sorted([
 ])
 
 PC_SAMPLING_STOCHASTIC_FILES = sorted([
-    "pmc_perf_0.csv",
     "pmc_perf.csv",
     "ps_file_agent_info.csv",
     "ps_file_kernel_trace.csv",
@@ -550,13 +473,38 @@ def test_path(binary_handler_profile_rocprof_compute):
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
 
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
+    else:
+        print(f"This test is not supported for {soc}")
+        assert 0
+
+    validate(inspect.stack()[0][3], workload_dir, file_dict)
+
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
+@pytest.mark.path
+def test_path_no_native(binary_handler_profile_rocprof_compute):
+    workload_dir = test_utils.get_output_dir()
+    options = ["--no-native-tool"]
+    binary_handler_profile_rocprof_compute(config, workload_dir, options)
+
+    file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
+
+    if soc == "MI100":
+        assert sorted(list(file_dict.keys())) == CSVS
+    elif soc == "MI200":
+        assert sorted(list(file_dict.keys())) == CSVS
+    elif "MI300" in soc:
+        assert sorted(list(file_dict.keys())) == CSVS
+    elif "MI350" in soc:
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"This test is not supported for {soc}")
         assert 0
@@ -586,6 +534,107 @@ def test_path_rocpd(
     test_utils.clean_output_dir(config["cleanup"], workload_dir)
 
 
+@pytest.mark.path
+def test_path_csv(
+    binary_handler_profile_rocprof_compute, binary_handler_analyze_rocprof_compute
+):
+    """
+    Profile with `--format-rocprof-output csv` and check the generated CSV files.
+
+    Unlike the default-format tests that compare against `CSVS`, this run is
+    expected to leave the per-run `pmc_perf_<N>.csv` files and the `*_LEVEL*.csv`
+    counter files in the workload directory, so the expected set varies per SoC.
+
+    Args:
+        binary_handler_profile_rocprof_compute: Fixture that runs profile mode.
+        binary_handler_analyze_rocprof_compute: Fixture requested by this test
+            but not called directly in its body.
+
+    Returns:
+        None: Asserts the CSV file names match the list expected for the SoC.
+    """
+    workload_dir = test_utils.get_output_dir()
+    options = ["--format-rocprof-output", "csv"]
+    binary_handler_profile_rocprof_compute(config, workload_dir, options)
+
+    file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
+    all_csvs_mi100 = sorted([
+        "SQC_DCACHE_INFLIGHT_LEVEL.csv",
+        "SQC_ICACHE_INFLIGHT_LEVEL.csv",
+        "SQ_IFETCH_LEVEL.csv",
+        "SQ_INST_LEVEL_LDS.csv",
+        "SQ_LEVEL_WAVES.csv",
+        "pmc_perf.csv",
+        "pmc_perf_0.csv",
+        "pmc_perf_1.csv",
+        "pmc_perf_2.csv",
+        "pmc_perf_3.csv",
+        "pmc_perf_4.csv",
+        "pmc_perf_5.csv",
+        "pmc_perf_6.csv",
+        "sysinfo.csv",
+    ])
+    # MI200 and MI300 are expected to produce the same set of files.
+    all_csvs_mi200_mi300 = sorted([
+        "SQC_DCACHE_INFLIGHT_LEVEL.csv",
+        "SQC_ICACHE_INFLIGHT_LEVEL.csv",
+        "SQ_IFETCH_LEVEL.csv",
+        "SQ_INST_LEVEL_LDS.csv",
+        "SQ_INST_LEVEL_SMEM.csv",
+        "SQ_INST_LEVEL_VMEM.csv",
+        "SQ_LEVEL_WAVES.csv",
+        "pmc_perf.csv",
+        "pmc_perf_0.csv",
+        "pmc_perf_1.csv",
+        "pmc_perf_2.csv",
+        "pmc_perf_3.csv",
+        "pmc_perf_4.csv",
+        "pmc_perf_5.csv",
+        "sysinfo.csv",
+    ])
+    all_csvs_mi350 = sorted([
+        "SQC_DCACHE_INFLIGHT_LEVEL.csv",
+        "SQC_ICACHE_INFLIGHT_LEVEL.csv",
+        "SQ_IFETCH_LEVEL.csv",
+        "SQ_INST_LEVEL_LDS.csv",
+        "SQ_INST_LEVEL_SMEM.csv",
+        "SQ_INST_LEVEL_VMEM.csv",
+        "SQ_LEVEL_WAVES.csv",
+        "pmc_perf.csv",
+        "pmc_perf_0.csv",
+        "pmc_perf_1.csv",
+        "pmc_perf_2.csv",
+        "pmc_perf_3.csv",
+        "pmc_perf_4.csv",
+        "pmc_perf_5.csv",
+        "pmc_perf_6.csv",
+        "pmc_perf_7.csv",
+        "pmc_perf_8.csv",
+        "pmc_perf_9.csv",
+        "pmc_perf_10.csv",
+        "pmc_perf_11.csv",
+        "pmc_perf_12.csv",
+        "sysinfo.csv",
+    ])
+
+    # Pick the expected list for the detected SoC; unknown SoCs fail the test.
+    if soc == "MI100":
+        expected_csvs = all_csvs_mi100
+    elif soc == "MI200" or "MI300" in soc:
+        expected_csvs = all_csvs_mi200_mi300
+    elif "MI350" in soc:
+        expected_csvs = all_csvs_mi350
+    else:
+        print(f"This test is not supported for {soc}")
+        assert 0
+
+    assert sorted(file_dict.keys()) == expected_csvs
+
+    validate(inspect.stack()[0][3], workload_dir, file_dict)
+
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
 @pytest.mark.roofline
 def test_roof_basic_validation(binary_handler_profile_rocprof_compute):
     """
@@ -1422,13 +1471,13 @@ def test_device_filter(binary_handler_profile_rocprof_compute):
 
     file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1452,13 +1501,13 @@ def test_kernel(binary_handler_profile_rocprof_compute):
 
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1474,19 +1523,19 @@ def test_kernel(binary_handler_profile_rocprof_compute):
 
 @pytest.mark.dispatch
 def test_dispatch_0(binary_handler_profile_rocprof_compute):
-    options = ["--dispatch", "0"]
+    options = ["--dispatch", "1"]
     workload_dir = test_utils.get_output_dir()
     binary_handler_profile_rocprof_compute(config, workload_dir, options)
 
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, 1)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1497,7 +1546,7 @@ def test_dispatch_0(binary_handler_profile_rocprof_compute):
         file_dict,
         [
             "--dispatch",
-            "0",
+            "1",
         ],
     )
 
@@ -1506,19 +1555,19 @@ def test_dispatch_0(binary_handler_profile_rocprof_compute):
 
 @pytest.mark.dispatch
 def test_dispatch_0_1(binary_handler_profile_rocprof_compute):
-    options = ["--dispatch", "0:2"]
+    options = ["--dispatch", "1:2"]
     workload_dir = test_utils.get_output_dir()
     binary_handler_profile_rocprof_compute(config, workload_dir, options)
 
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, 2)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1527,7 +1576,7 @@ def test_dispatch_0_1(binary_handler_profile_rocprof_compute):
         inspect.stack()[0][3],
         workload_dir,
         file_dict,
-        ["--dispatch", "0", "1"],
+        ["--dispatch", "1", "2"],
     )
 
     test_utils.clean_output_dir(config["cleanup"], workload_dir)
@@ -1535,19 +1584,19 @@ def test_dispatch_0_1(binary_handler_profile_rocprof_compute):
 
 @pytest.mark.dispatch
 def test_dispatch_2(binary_handler_profile_rocprof_compute):
-    options = ["--dispatch", "0"]
+    options = ["--dispatch", "1"]
     workload_dir = test_utils.get_output_dir()
     binary_handler_profile_rocprof_compute(config, workload_dir, options)
 
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, 1)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1558,7 +1607,7 @@ def test_dispatch_2(binary_handler_profile_rocprof_compute):
         file_dict,
         [
             "--dispatch",
-            "0",
+            "1",
         ],
     )
 
@@ -1573,13 +1622,13 @@ def test_join_type_grid(binary_handler_profile_rocprof_compute):
 
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
@@ -1602,13 +1651,13 @@ def test_join_type_kernel(binary_handler_profile_rocprof_compute):
     file_dict = test_utils.check_csv_files(workload_dir, num_devices, num_kernels)
 
     if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
+        assert sorted(list(file_dict.keys())) == CSVS
     elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
+        assert sorted(list(file_dict.keys())) == CSVS
     elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
+        assert sorted(list(file_dict.keys())) == CSVS
     else:
         print(f"Testing isn't supported yet for {soc}")
         assert 0
diff --git a/projects/rocprofiler-compute/tests/test_utils.py b/projects/rocprofiler-compute/tests/test_utils.py
index 48bd0e02ce..4e00b2c544 100644
--- a/projects/rocprofiler-compute/tests/test_utils.py
+++ b/projects/rocprofiler-compute/tests/test_utils.py
@@ -381,7 +381,7 @@ def test_detect_rocprof_env_rocprof_not_found(monkeypatch):
     """
 
     class DummyArgs:
-        rocprofiler_sdk_library_path = "/fake/path"
+        rocprofiler_sdk_tool_path = "/fake/path"
 
     # Set ROCPROF to 'rocprof'
     monkeypatch.setenv("ROCPROF", "rocprofv3")
@@ -416,7 +416,7 @@ def test_detect_rocprof_env_rocprof_found(monkeypatch):
     """
 
     class DummyArgs:
-        rocprofiler_sdk_library_path = "/fake/path"
+        rocprofiler_sdk_tool_path = "/fake/path"
 
     monkeypatch.setenv("ROCPROF", "rocprof")
     # shutil.which returns a fake path for 'rocprof'
@@ -448,7 +448,7 @@ def test_detect_rocprof_env_not_set(monkeypatch):
     """
 
     class DummyArgs:
-        rocprofiler_sdk_library_path = "/fake/path"
+        rocprofiler_sdk_tool_path = "/fake/path"
 
     monkeypatch.delenv("ROCPROF", raising=False)
     monkeypatch.setattr("pathlib.Path.exists", lambda _: True)
@@ -475,7 +475,7 @@ def test_detect_rocprof_sdk(monkeypatch):
     """
 
     class DummyArgs:
-        rocprofiler_sdk_library_path = "/some/sdk/path"
+        rocprofiler_sdk_tool_path = "/some/sdk/path"
 
     monkeypatch.setenv("ROCPROF", "rocprofiler-sdk")
     monkeypatch.setattr("pathlib.Path.exists", lambda self: True)
@@ -2500,6 +2500,7 @@ def test_run_prof_success_rocprofiler_sdk(tmp_path, monkeypatch):
     profiler_options = {
         "APP_CMD": ["./test_app"],
         "ROCPROF_OUTPUT_PATH": workload_dir,
+        "ROCPROF_COUNTER_COLLECTION": "1",
         "ROCP_TOOL_LIBRARIES": "/opt/rocm/lib/rocprofiler-sdk/"
         "librocprofiler-sdk-tool.so",
     }
@@ -3061,13 +3062,14 @@ def test_run_prof_v3_sdk_and_cli_calls_trace_processing(tmp_path, monkeypatch):
 
     mspec = MockMSpec()
     loglevel = logging.INFO
-    format_rocprof_output = True
+    format_rocprof_output = "csv"
 
     monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofiler-sdk")
 
     profiler_options_sdk_hip = {
         "APP_CMD": "my_app",
         "ROCPROF_HIP_RUNTIME_API_TRACE": "1",
+        "ROCPROF_COUNTER_COLLECTION": "1",
         "ROCP_TOOL_LIBRARIES": "/opt/rocm/lib/rocprofiler-sdk/"
         "librocprofiler-sdk-tool.so",
     }
@@ -3123,44 +3125,6 @@ def test_run_prof_v3_sdk_and_cli_calls_trace_processing(tmp_path, monkeypatch):
 # =============================================================================
 
 
-def test_process_rocprofv3_output_json_format(tmp_path, monkeypatch):
-    """
-    Test process_rocprofv3_output with json format converts JSON files to CSV.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-        monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching.
-
-    Returns:
-        None: Asserts CSV files are created from JSON files.
-    """
-    workload_dir = str(tmp_path)
-    output_dir = tmp_path / "out" / "pmc_1" / "subdir"
-    output_dir.mkdir(parents=True)
-
-    json_file1 = output_dir / "test1.json"
-    json_file2 = output_dir / "test2.json"
-    json_file1.write_text('{"test": "data1"}')
-    json_file2.write_text('{"test": "data2"}')
-
-    monkeypatch.setattr("glob.glob", lambda pattern: [str(json_file1), str(json_file2)])
-
-    def mock_v3_json_to_csv(json_path, csv_path):
-        Path(csv_path).write_text("csv,data\ntest,value")
-
-    monkeypatch.setattr("utils.utils.v3_json_to_csv", mock_v3_json_to_csv)
-
-    import utils.utils as utils_mod
-
-    result = utils_mod.process_rocprofv3_output("json", workload_dir, False)
-
-    assert len(result) == 2
-    csv_file1 = output_dir / "test1.csv"
-    csv_file2 = output_dir / "test2.csv"
-    assert csv_file1.exists()
-    assert csv_file2.exists()
-
-
 def test_process_rocprofv3_output_csv_format_with_counter_files(tmp_path, monkeypatch):
     """
     Test process_rocprofv3_output with csv format processes counter collection files.
@@ -3201,7 +3165,7 @@ def test_process_rocprofv3_output_csv_format_with_counter_files(tmp_path, monkey
 
     import utils.utils as utils_mod
 
-    result = utils_mod.process_rocprofv3_output("csv", workload_dir, False)
+    result = utils_mod.process_rocprofv3_output(workload_dir, False)
 
     assert len(result) == 1
     assert str(converted_file) in result
@@ -3247,7 +3211,7 @@ def test_process_rocprofv3_output_csv_format_conversion_error(tmp_path, monkeypa
 
     import utils.utils as utils_mod
 
-    result = utils_mod.process_rocprofv3_output("csv", workload_dir, False)
+    result = utils_mod.process_rocprofv3_output(workload_dir, False)
 
     assert result == []
     assert len(warnings) == 1
@@ -3282,42 +3246,7 @@ def test_process_rocprofv3_output_csv_format_missing_agent_file(tmp_path, monkey
     import utils.utils as utils_mod
 
     with pytest.raises(ValueError, match='has no corresponding "agent info" file'):
-        utils_mod.process_rocprofv3_output("csv", workload_dir, False)
-
-
-def test_process_rocprofv3_output_csv_format_timestamps_fallback(tmp_path, monkeypatch):
-    """
-    Test process_rocprofv3_output falls back to kernel trace files for timestamps.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-        monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching.
-
-    Returns:
-        None: Asserts kernel trace files are used when is_timestamps is True.
-    """
-    workload_dir = str(tmp_path)
-    output_dir = tmp_path / "out" / "pmc_1" / "subdir"
-    output_dir.mkdir(parents=True)
-
-    trace_file = output_dir / "test_kernel_trace.csv"
-    trace_file.write_text("kernel,trace\ntest,data")
-
-    def mock_glob(pattern):
-        if "_counter_collection.csv" in pattern:
-            return []
-        elif "_kernel_trace.csv" in pattern:
-            return [str(trace_file)]
-        return []
-
-    monkeypatch.setattr("glob.glob", mock_glob)
-
-    import utils.utils as utils_mod
-
-    result = utils_mod.process_rocprofv3_output("csv", workload_dir, True)
-
-    assert len(result) == 1
-    assert str(trace_file) in result
+        utils_mod.process_rocprofv3_output(workload_dir, False)
 
 
 def test_process_rocprofv3_output_csv_format_no_files_non_timestamps(
@@ -3340,53 +3269,7 @@ def test_process_rocprofv3_output_csv_format_no_files_non_timestamps(
 
     import utils.utils as utils_mod
 
-    result = utils_mod.process_rocprofv3_output("csv", workload_dir, False)
-
-    assert result == []
-
-
-def test_process_rocprofv3_output_invalid_format(monkeypatch):
-    """
-    Test process_rocprofv3_output raises error for invalid output format.
-
-    Args:
-        monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching.
-
-    Returns:
-        None: Asserts console_error is called for invalid format.
-    """
-
-    def mock_console_error(msg):
-        raise RuntimeError(f"console_error: {msg}")
-
-    monkeypatch.setattr("utils.utils.console_error", mock_console_error)
-
-    import utils.utils as utils_mod
-
-    with pytest.raises(
-        RuntimeError, match="The output file of rocprofv3 can only support json or csv"
-    ):
-        utils_mod.process_rocprofv3_output("invalid", "/tmp", False)
-
-
-def test_process_rocprofv3_output_json_format_no_files(tmp_path, monkeypatch):
-    """
-    Test process_rocprofv3_output with json format when no JSON files exist.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-        monkeypatch (pytest.MonkeyPatch): Pytest fixture for patching.
-
-    Returns:
-        None: Asserts empty list returned when no JSON files found.
-    """
-    workload_dir = str(tmp_path)
-
-    monkeypatch.setattr("glob.glob", lambda pattern: [])
-
-    import utils.utils as utils_mod
-
-    result = utils_mod.process_rocprofv3_output("json", workload_dir, False)
+    result = utils_mod.process_rocprofv3_output(workload_dir, False)
 
     assert result == []
 
@@ -3439,7 +3322,7 @@ def test_process_rocprofv3_output_csv_format_multiple_counter_files(
 
     import utils.utils as utils_mod
 
-    result = utils_mod.process_rocprofv3_output("csv", workload_dir, False)
+    result = utils_mod.process_rocprofv3_output(workload_dir, False)
 
     assert len(result) == 2
     assert str(converted_file1) in result
@@ -8180,8 +8063,8 @@ def test_add_counter_overwrite_existing():
 # additional test detect_rocprof console error
 # =============================================================================
 class MockArgs:
-    def __init__(self, rocprofiler_sdk_library_path):
-        self.rocprofiler_sdk_library_path = rocprofiler_sdk_library_path
+    def __init__(self, rocprofiler_sdk_tool_path):
+        self.rocprofiler_sdk_tool_path = rocprofiler_sdk_tool_path
 
 
 @mock.patch.dict(os.environ, {"ROCPROF": "rocprofiler-sdk"}, clear=True)
@@ -8192,7 +8075,7 @@ def test_detect_rocprof_calls_console_error_if_sdk_path_invalid(
 ):
     """
     Tests that detect_rocprof calls console_error when ROCPROF is 'rocprofiler-sdk'
-    and the rocprofiler_sdk_library_path does not exist.
+    and the rocprofiler_sdk_tool_path does not exist.
     Focuses on the console_error call.
     """
     mock_path_instance = mock.Mock()
@@ -8200,13 +8083,13 @@ def test_detect_rocprof_calls_console_error_if_sdk_path_invalid(
     mock_path_constructor.return_value = mock_path_instance
 
     fake_library_path = "/some/invalid/path/to/librocprofiler_sdk.so"
-    args = MockArgs(rocprofiler_sdk_library_path=fake_library_path)
+    args = MockArgs(rocprofiler_sdk_tool_path=fake_library_path)
 
     with mock.patch("utils.utils.console_debug") as mock_console_debug:  # noqa
         utils.detect_rocprof(args)
 
     expected_error_message = (
-        "Could not find rocprofiler-sdk library at " + fake_library_path
+        "Could not find rocprofiler-sdk tool at " + fake_library_path
     )
     mock_console_error_func.assert_called_once_with(expected_error_message)
 
@@ -8442,7 +8325,7 @@ def test_pc_sampling_prof_sdk_path_nonexistent_librocprofiler_sdk_tool(
     mock_console_debug, mock_console_error, mock_capture_subprocess, tmp_path
 ):
     """
-    Edge Case: rocprofiler_sdk_library_path is valid, but librocprofiler-sdk-tool.so
+    Edge Case: rocprofiler_sdk_tool_path is valid, but librocprofiler-sdk-tool.so
     is NOT found next to it (or in rocprofiler-sdk subdir).
     This test primarily checks if the paths are constructed. The actual check for
     file existence before `capture_subprocess_output` is not in the provided snippet,
@@ -8452,31 +8335,29 @@ def test_pc_sampling_prof_sdk_path_nonexistent_librocprofiler_sdk_tool(
         method = "host_trap"
         interval = 1000
         workload_dir = str(tmp_path)
-        appcmd = "my_app --arg"
+        options = {"APP_CMD": "my_app --arg"}
 
         sdk_lib_dir = tmp_path / "rocm_sdk" / "lib"
         sdk_lib_dir.mkdir(parents=True, exist_ok=True)
-        rocprofiler_sdk_library_path = str(sdk_lib_dir / "librocprofiler_sdk.so")
-        Path(rocprofiler_sdk_library_path).touch()
+        rocprofiler_sdk_tool_path = str(sdk_lib_dir / "librocprofiler_sdk.so")
+        Path(rocprofiler_sdk_tool_path).touch()
 
         expected_tool_path = str(
             sdk_lib_dir / "rocprofiler-sdk" / "librocprofiler-sdk-tool.so"
         )
 
+        options["LD_PRELOAD"] = expected_tool_path
+
         mock_capture_subprocess.return_value = (True, "Success output")
 
-        utils.pc_sampling_prof(
-            method, interval, workload_dir, appcmd, rocprofiler_sdk_library_path
-        )
+        utils.pc_sampling_prof(options, method, interval, workload_dir)
 
         assert mock_capture_subprocess.called
         call_args = mock_capture_subprocess.call_args
         called_env = call_args.kwargs.get("new_env", {})
 
         assert "LD_PRELOAD" in called_env
-        ld_preload_paths = called_env["LD_PRELOAD"].split(":")
-        assert expected_tool_path in ld_preload_paths
-        assert rocprofiler_sdk_library_path in ld_preload_paths
+        assert called_env["LD_PRELOAD"] == expected_tool_path
 
         mock_console_error.assert_not_called()
 
@@ -8495,14 +8376,12 @@ def test_pc_sampling_prof_subprocess_fails(
         method = "stochastic"
         interval = 5000
         workload_dir = str(tmp_path)
-        appcmd = "another_app"
-        rocprofiler_sdk_library_path = "/some/path/librocprofiler_sdk.so"
+        options = ["another_app"]
+        rocprofiler_sdk_tool_path = "/some/path/librocprofiler_sdk.so"  # noqa: F841
 
         mock_capture_subprocess.return_value = (False, "Error output from subprocess")
 
-        utils.pc_sampling_prof(
-            method, interval, workload_dir, appcmd, rocprofiler_sdk_library_path
-        )
+        utils.pc_sampling_prof(options, method, interval, workload_dir)
 
         mock_capture_subprocess.assert_called_once()
         mock_console_error.assert_called_once_with("PC sampling failed.")
@@ -8510,10 +8389,11 @@ def test_pc_sampling_prof_subprocess_fails(
     mock_capture_subprocess.reset_mock()
     mock_console_error.reset_mock()
     with mock.patch("utils.utils.rocprof_cmd", "rocprofiler-sdk"):
+        options = {"APP_CMD": "another_app"}
         sdk_lib_dir = tmp_path / "rocm_sdk_fail" / "lib"
         sdk_lib_dir.mkdir(parents=True, exist_ok=True)
-        rocprofiler_sdk_library_path_sdk = str(sdk_lib_dir / "librocprofiler_sdk.so")
-        Path(rocprofiler_sdk_library_path_sdk).touch()
+        rocprofiler_sdk_tool_path_sdk = str(sdk_lib_dir / "librocprofiler_sdk.so")
+        Path(rocprofiler_sdk_tool_path_sdk).touch()
 
         tool_dir = sdk_lib_dir / "rocprofiler-sdk"
         tool_dir.mkdir(parents=True, exist_ok=True)
@@ -8524,9 +8404,7 @@ def test_pc_sampling_prof_subprocess_fails(
             "Error output from SDK subprocess",
         )
 
-        utils.pc_sampling_prof(
-            method, interval, workload_dir, appcmd, rocprofiler_sdk_library_path_sdk
-        )
+        utils.pc_sampling_prof(options, method, interval, workload_dir)
 
         mock_capture_subprocess.assert_called_once()
         mock_console_error.assert_called_once_with("PC sampling failed.")
@@ -8547,14 +8425,12 @@ def test_pc_sampling_prof_empty_appcmd(
         method = "host_trap"
         interval = 100
         workload_dir = str(tmp_path)
-        appcmd = ""
-        rocprofiler_sdk_library_path = "/some/path/librocprofiler_sdk.so"
+        options = ["--"]
+        rocprofiler_sdk_tool_path = "/some/path/librocprofiler_sdk.so"  # noqa: F841
 
         mock_capture_subprocess.return_value = (True, "Output with empty appcmd")
 
-        utils.pc_sampling_prof(
-            method, interval, workload_dir, appcmd, rocprofiler_sdk_library_path
-        )
+        utils.pc_sampling_prof(options, method, interval, workload_dir)
 
         assert mock_capture_subprocess.called
         options_list = mock_capture_subprocess.call_args[0][0]
@@ -8566,17 +8442,16 @@ def test_pc_sampling_prof_empty_appcmd(
     with mock.patch("utils.utils.rocprof_cmd", "rocprofiler-sdk"):
         sdk_lib_dir = tmp_path / "rocm_sdk_empty" / "lib"
         sdk_lib_dir.mkdir(parents=True, exist_ok=True)
-        rocprofiler_sdk_library_path_sdk = str(sdk_lib_dir / "librocprofiler_sdk.so")
-        Path(rocprofiler_sdk_library_path_sdk).touch()
+        rocprofiler_sdk_tool_path_sdk = str(sdk_lib_dir / "librocprofiler_sdk.so")
+        Path(rocprofiler_sdk_tool_path_sdk).touch()
         tool_dir = sdk_lib_dir / "rocprofiler-sdk"
         tool_dir.mkdir(parents=True, exist_ok=True)
         (tool_dir / "librocprofiler-sdk-tool.so").touch()
 
         mock_capture_subprocess.return_value = (True, "Output with empty appcmd SDK")
+        options = {"APP_CMD": ""}
 
-        utils.pc_sampling_prof(
-            method, interval, workload_dir, appcmd, rocprofiler_sdk_library_path_sdk
-        )
+        utils.pc_sampling_prof(options, method, interval, workload_dir)
 
         assert mock_capture_subprocess.called
         assert mock_capture_subprocess.call_args[0][0] == ""