Remove rocprofv1/v2 in favour of rocprofiler-sdk (#673)

* Set default rocprof interface as rocprofiler-sdk * Remove rocprofv1 and rocprofv2 interfaces * Remove deprecation notice for rocprof v1/v2/v3 interfaces * Make rocprofiler-sdk the default interface and make rocprofv3 interface opt-in using ROCPROF=rocprofv3 * Add deprecation notice for rocprofv3
2025-09-24 10:37:01 -04:00
parent 7df02745eb
commit bd7a1de879
16 changed files with 235 additions and 2365 deletions
@@ -127,6 +127,9 @@ Full documentation for ROCm Compute Profiler is available at [https://rocm.docs.
  * `--list-available-metrics` analyze mode option to display the metrics available for analysis.
  * `--block` option cannot be used with `--list-metrics` and `--list-available-metrics`options.

+* Default rocprof interface changed from rocprofv3 to rocprofiler-sdk.
+  * Set `ROCPROF=rocprofv3` to opt in to the rocprofv3 interface.
+
 ### Removed

 * Usage of `rocm-smi` in favor of `amd-smi`.
@@ -113,11 +113,7 @@ class RocProfCompute:

    def detect_profiler(self) -> None:
        profiler_mode = detect_rocprof(self.__args)
-        if str(profiler_mode).endswith("rocprof"):
-            self.__profiler_mode = "rocprofv1"
-        elif str(profiler_mode).endswith("rocprofv2"):
-            self.__profiler_mode = "rocprofv2"
-        elif str(profiler_mode).endswith("rocprofv3"):
+        if str(profiler_mode).endswith("rocprofv3"):
            self.__profiler_mode = "rocprofv3"
        elif str(profiler_mode) == "rocprofiler-sdk":
            self.__profiler_mode = "rocprofiler-sdk"
@@ -303,16 +299,32 @@ class RocProfCompute:

        sys.exit(0)

+        profiler_classes = {
+            "rocprofv3": (
+                "rocprof_compute_profile.profiler_rocprof_v3",
+                "rocprof_v3_profiler",
+            ),
+            "rocprofiler-sdk": (
+                "rocprof_compute_profile.profiler_rocprofiler_sdk",
+                "rocprofiler_sdk_profiler",
+            ),
+        }
+
+        if self.__profiler_mode not in profiler_classes:
+            console_error("Unsupported profiler")
+
+        module_name, class_name = profiler_classes[self.__profiler_mode]
+        module = importlib.import_module(module_name)
+        profiler_class = getattr(module, class_name)
+
+        return profiler_class(
+            self.__args,
+            self.__profiler_mode,
+            self.__soc[self.__mspec.gpu_arch],
+        )
+
    def create_profiler(self) -> object:
        profiler_classes = {
-            "rocprofv1": (
-                "rocprof_compute_profile.profiler_rocprof_v1",
-                "rocprof_v1_profiler",
-            ),
-            "rocprofv2": (
-                "rocprof_compute_profile.profiler_rocprof_v2",
-                "rocprof_v2_profiler",
-            ),
            "rocprofv3": (
                "rocprof_compute_profile.profiler_rocprof_v3",
                "rocprof_v3_profiler",
@@ -466,16 +466,9 @@ class RocProfCompute_Base:
                    console_debug(output)

            console_log("profiling", f"Current input file: {fname}")
-
-            if self.__profiler in (
-                "rocprofv1",
-                "rocprofv2",
-                "rocprofv3",
-                "rocprofiler-sdk",
-            ):
-                options = self.get_profiler_options(str(fname), self._soc)
-                start_time = time.time()
-
+            options = self.get_profiler_options(fname, self._soc)
+            start_time = time.time()
+            if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
                # Only 1-run case is permitted for attach/detach
                if (isinstance(options, list) and "--pid" in options) or (
                    isinstance(options, dict)
@@ -490,7 +483,6 @@ class RocProfCompute_Base:
                            f'passes. Please use "--block" or "--set" '
                            f"to adjust or reduce the requested performance metrics!"
                        )
-
                run_prof(
                    fname=str(fname),
                    profiler_options=options,
@@ -1,109 +0,0 @@
-##############################################################################
-# MIT License
-#
-# Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-##############################################################################
-
-from pathlib import Path
-
-import config
-from rocprof_compute_profile.profiler_base import RocProfCompute_Base
-from utils.logger import console_log, demarcate
-from utils.utils import replace_timestamps, store_app_cmd
-
-
-class rocprof_v1_profiler(RocProfCompute_Base):
-    def __init__(self, profiling_args, profiler_mode, soc):
-        super().__init__(profiling_args, profiler_mode, soc)
-        self.ready_to_profile = (
-            self.get_args().roof_only
-            and not Path(self.get_args().path).joinpath("pmc_perf.csv").is_file()
-            or not self.get_args().roof_only
-        )
-
-    def get_profiler_options(self, fname, soc):
-        fbase = Path(fname).stem
-        app_cmd = self.get_args().remaining
-
-        args = []
-        # rocprof v1 does not support some counters on gfx 908 architecture
-        if soc.get_arch() == "gfx908":
-            metrics_path = str(
-                Path(str(config.rocprof_compute_home)).joinpath(
-                    "rocprof_compute_soc", "profile_configs", "metrics.xml"
-                )
-            )
-            args += ["-m", metrics_path]
-
-        args += [
-            # v1 requires request for timestamps
-            "--timestamp",
-            "on",
-            # v1 requires csv extension
-            "-o",
-            self.get_args().path + "/" + fbase + ".csv",
-            # v1 does require quotes on app cmd
-            '"' + app_cmd + '"',
-        ]
-        # store original args for debug message
-        store_app_cmd([
-            "--timestamp",
-            "on",
-            "-o",
-            self.get_args().path + "/" + fbase + ".csv",
-            app_cmd,
-        ])
-        return args
-
-    # -----------------------
-    # Required child methods
-    # -----------------------
-    @demarcate
-    def pre_processing(self):
-        """Perform any pre-processing steps prior to profiling."""
-        super().pre_processing()
-
-    @demarcate
-    def run_profiling(self, version: str, prog: str):
-        """Run profiling."""
-        if self.ready_to_profile:
-            if self.get_args().roof_only:
-                console_log(
-                    "roofline", "Generating pmc_perf.csv (roofline counters only)."
-                )
-            # Log profiling options and setup filtering
-            super().run_profiling(version, prog)
-        else:
-            console_log("roofline", "Detected existing pmc_perf.csv")
-
-    @demarcate
-    def post_processing(self):
-        """Perform any post-processing steps prior to profiling."""
-        if self.ready_to_profile:
-            # Manually join each pmc_perf*.csv output
-            self.join_prof()
-            # Run roofline microbenchmark
-            super().post_processing()
-            # Replace timestamp data to solve a known rocprof bug
-            replace_timestamps(self.get_args().path)
-        else:
-            console_log("roofline", "Detected existing pmc_perf.csv")
@@ -1,99 +0,0 @@
-##############################################################################
-# MIT License
-#
-# Copyright (c) 2021 - 2025 Advanced Micro Devices, Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-##############################################################################
-
-import shlex
-from pathlib import Path
-
-import config
-from rocprof_compute_profile.profiler_base import RocProfCompute_Base
-from utils.logger import console_log, demarcate
-from utils.utils import replace_timestamps, store_app_cmd
-
-
-class rocprof_v2_profiler(RocProfCompute_Base):
-    def __init__(self, profiling_args, profiler_mode, soc):
-        super().__init__(profiling_args, profiler_mode, soc)
-        self.ready_to_profile = (
-            self.get_args().roof_only
-            and not Path(self.get_args().path).joinpath("pmc_perf.csv").is_file()
-            or not self.get_args().roof_only
-        )
-
-    def get_profiler_options(self, fname, soc):
-        app_cmd = shlex.split(self.get_args().remaining)
-
-        args = []
-        # rocprof v2 does not support some counters on gfx 908 architecture
-        if soc.get_arch() == "gfx908":
-            metrics_path = str(
-                Path(str(config.rocprof_compute_home)).joinpath(
-                    "rocprof_compute_soc", "profile_configs", "metrics.xml"
-                )
-            )
-            args += ["-m", metrics_path]
-
-        args += [
-            # v2 requires output directory argument
-            "-d",
-            self.get_args().path + "/" + "out",
-        ]
-        args.extend(app_cmd)
-        # store args for debug message
-        store_app_cmd(args)
-        return args
-
-    # -----------------------
-    # Required child methods
-    # -----------------------
-    @demarcate
-    def pre_processing(self):
-        """Perform any pre-processing steps prior to profiling."""
-        super().pre_processing()
-
-    @demarcate
-    def run_profiling(self, version, prog):
-        """Run profiling."""
-        if self.ready_to_profile:
-            if self.get_args().roof_only:
-                console_log(
-                    "roofline", "Generating pmc_perf.csv (roofline counters only)."
-                )
-            # Log profiling options and setup filtering
-            super().run_profiling(version, prog)
-        else:
-            console_log("roofline", "Detected existing pmc_perf.csv")
-
-    @demarcate
-    def post_processing(self):
-        """Perform any post-processing steps prior to profiling."""
-        if self.ready_to_profile:
-            # Manually join each pmc_perf*.csv output
-            self.join_prof()
-            # Run roofline microbenchmark
-            super().post_processing()
-            # Replace timestamp data to solve a known rocprof bug
-            replace_timestamps(self.get_args().path)
-        else:
-            console_log("roofline", "Detected existing pmc_perf.csv")
@@ -1,737 +0,0 @@
-<gfx908>
-  # CPC counters
-  <metric
-    name="CPC_ME1_BUSY_FOR_PACKET_DECODE" block=CPC event=13 descr="Me1 busy for packet decode."
-  ></metric>
-  <metric
-    name="CPC_UTCL1_STALL_ON_TRANSLATION" block=CPC event=24 descr="One of the UTCL1s is stalled waiting on translation, XNACK or PENDING response."
-  ></metric>
-  <metric
-    name="CPC_CPC_STAT_BUSY" block=CPC event=25 descr="CPC Busy."
-  ></metric>
-  <metric
-    name="CPC_CPC_STAT_IDLE" block=CPC event=26 descr="CPC Idle."
-  ></metric>
-  <metric
-    name="CPC_CPC_STAT_STALL" block=CPC event=27 descr="CPC Stalled."
-  ></metric>
-  <metric
-    name="CPC_CPC_TCIU_BUSY" block=CPC event=28 descr="CPC TCIU interface Busy."
-  ></metric>
-  <metric
-    name="CPC_CPC_TCIU_IDLE" block=CPC event=29 descr="CPC TCIU interface Idle."
-  ></metric>
-  <metric
-    name="CPC_CPC_UTCL2IU_BUSY" block=CPC event=30 descr="CPC UTCL2 interface Busy."
-  ></metric>
-  <metric
-    name="CPC_CPC_UTCL2IU_IDLE" block=CPC event=31 descr="CPC UTCL2 interface Idle."
-  ></metric>
-  <metric
-    name="CPC_CPC_UTCL2IU_STALL" block=CPC event=32 descr="CPC UTCL2 interface Stalled waiting on Free, Tags or Translation."
-  ></metric>
-  <metric
-    name="CPC_ME1_DC0_SPI_BUSY" block=CPC event=33 descr="CPC Me1 Processor Busy."
-  ></metric>
-  <metric
-    name="CPF_CMP_UTCL1_STALL_ON_TRANSLATION" block=CPF event=20 descr="One of the Compute UTCL1s is stalled waiting on translation, XNACK or PENDING response."
-  ></metric>
-  <metric
-    name="CPF_CPF_STAT_BUSY" block=CPF event=23 descr="CPF Busy."
-  ></metric>
-  <metric
-    name="CPF_CPF_STAT_IDLE" block=CPF event=24 descr="CPF Idle."
-  ></metric>
-  <metric
-    name="CPF_CPF_STAT_STALL" block=CPF event=25 descr="CPF Stalled."
-  ></metric>
-  <metric
-    name="CPF_CPF_TCIU_BUSY" block=CPF event=26 descr="CPF TCIU interface Busy."
-  ></metric>
-  <metric
-    name="CPF_CPF_TCIU_IDLE" block=CPF event=27 descr="CPF TCIU interface Idle."
-  ></metric>
-  <metric
-    name="CPF_CPF_TCIU_STALL" block=CPF event=28 descr="CPF TCIU interface Stalled waiting on Free, Tags."
-  ></metric>
-  # GRBM counters
-  <metric
-    name="GRBM_COUNT" block=GRBM event=0 descr="Tie High - Count Number of Clocks"
-  ></metric>
-  <metric
-    name="GRBM_GUI_ACTIVE" block=GRBM event=2 descr="The GUI is Active"
-  ></metric>
-  <metric
-    name="GRBM_CP_BUSY" block=GRBM event=3 descr="Any of the Command Processor (CPG/CPC/CPF) blocks are busy."
-  ></metric>
-  <metric
-    name="GRBM_SPI_BUSY" block=GRBM event=11 descr="Any of the Shader Pipe Interpolators (SPI) are busy in the shader engine(s)."
-  ></metric>
-  <metric
-    name="GRBM_TA_BUSY" block=GRBM event=13 descr="Any of the Texture Pipes (TA) are busy in the shader engine(s)."
-  ></metric>
-  <metric
-    name="GRBM_TC_BUSY" block=GRBM event=28 descr="Any of the Texture Cache Blocks (TCP/TCI/TCA/TCC) are busy."
-  ></metric>
-  <metric
-    name="GRBM_CPC_BUSY" block=GRBM event=30 descr="The Command Processor Compute (CPC) is busy."
-  ></metric>
-  <metric
-    name="GRBM_CPF_BUSY" block=GRBM event=31 descr="The Command Processor Fetchers (CPF) is busy."
-  ></metric>
-  <metric
-    name="GRBM_UTCL2_BUSY" block=GRBM event=34 descr="The Unified Translation Cache Level-2 (UTCL2) block is busy."
-  ></metric>
-  <metric
-    name="GRBM_EA_BUSY" block=GRBM event=35 descr="The Efficiency Arbiter (EA) block is busy."
-  ></metric>
-  # SPI counters
-  <metric
-    name="SPI_CSN_WINDOW_VALID" block=SPI event=47 descr="Clock count enabled by perfcounter_start event. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  <metric
-    name="SPI_CSN_BUSY" block=SPI event=48 descr="Number of clocks with outstanding waves (SPI or SH). Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  <metric
-    name="SPI_CSN_NUM_THREADGROUPS" block=SPI event=49 descr="Number of threadgroups launched. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  <metric
-    name="SPI_CSN_WAVE" block=SPI event=52 descr="Number of waves. Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  <metric
-    name="SPI_RA_REQ_NO_ALLOC" block=SPI event=79 descr="Arb cycles with requests but no allocation. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_REQ_NO_ALLOC_CSN" block=SPI event=85 descr="Arb cycles with CSn req and no CSn alloc. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_RES_STALL_CSN" block=SPI event=91 descr="Arb cycles with CSn req and no CSn fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_TMP_STALL_CSN" block=SPI event=97 descr="Cycles where csn wants to req but does not fit in temp space."
-  ></metric>
-  <metric
-    name="SPI_RA_WAVE_SIMD_FULL_CSN" block=SPI event=103 descr="Sum of SIMD where WAVE can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_VGPR_SIMD_FULL_CSN" block=SPI event=109 descr="Sum of SIMD where VGPR can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_SGPR_SIMD_FULL_CSN" block=SPI event=115 descr="Sum of SIMD where SGPR can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_LDS_CU_FULL_CSN" block=SPI event=120 descr="Sum of CU where LDS can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_BAR_CU_FULL_CSN" block=SPI event=123 descr="Sum of CU where BARRIER can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_BULKY_CU_FULL_CSN" block=SPI event=125 descr="Sum of CU where BULKY can't take csn wave when !fits. Source is RA0"
-  ></metric>
-  <metric
-    name="SPI_RA_TGLIM_CU_FULL_CSN" block=SPI event=127 descr="Cycles where csn wants to req but all CU are at tg_limit"
-  ></metric>
-  <metric
-    name="SPI_RA_WVLIM_STALL_CSN" block=SPI event=133 descr="Number of clocks csn is stalled due to WAVE LIMIT."
-  ></metric>
-  <metric
-    name="SPI_SWC_CSC_WR" block=SPI event=189 descr="Number of clocks to write CSC waves to SGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  <metric
-    name="SPI_VWC_CSC_WR" block=SPI event=195 descr="Number of clocks to write CSC waves to VGPRs (need to multiply this value by 4) Requires SPI_DEBUG_CNTL.DEBUG_PIPE_SEL to select source, DEBUG_PIPE_SEL = 1, source is CS1; DEBUG_PIPE_SEL = 2, source is CS2; DEBUG_PIPE_SEL = 3, source is CS3; default, source is CS0;"
-  ></metric>
-  # SQ counters
-  <metric
-    name="SQ_ACCUM_PREV" block=SQ event=1 descr="For counter N, increment by the value of counter N-1. Only accumulates once every 4 cycles."
-  ></metric>
-  <metric
-    name="SQ_CYCLES" block=SQ event=2 descr="Clock cycles. (nondeterministic, per-simd, global)"
-  ></metric>
-  <metric
-    name="SQ_BUSY_CYCLES" block=SQ event=3 descr="Clock cycles while SQ is reporting that it is busy. (nondeterministic, per-simd, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES" block=SQ event=4 descr="Count number of waves sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_LEVEL_WAVES" block=SQ event=5 descr="Track the number of waves. Set ACCUM_PREV for the next counter to use this. (level, per-simd, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_EQ_64" block=SQ event=6 descr="Count number of waves with exactly 64 active threads sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_LT_64" block=SQ event=7 descr="Count number of waves with <64 active threads sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_LT_48" block=SQ event=8 descr="Count number of waves with <48 active threads sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_LT_32" block=SQ event=9 descr="Count number of waves sent <32 active threads sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_LT_16" block=SQ event=10 descr="Count number of waves sent <16 active threads sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_BUSY_CU_CYCLES" block=SQ event=13 descr="Count quad-cycles each CU is busy. (nondeterministic, per-simd)"
-  ></metric>
-  <metric
-    name="SQ_ITEMS" block=SQ event=14 descr="Number of valid items per wave. (per-simd, global)"
-  ></metric>
-  <metric
-    name="SQ_INSTS" block=SQ event=25 descr="Number of instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_VALU" block=SQ event=26 descr="Number of VALU instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_MFMA" block=SQ event=27 descr="Number of MFMA instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_VMEM_WR" block=SQ event=28 descr="Number of VMEM write instructions issued (including FLAT). (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_VMEM_RD" block=SQ event=29 descr="Number of VMEM read instructions issued (including FLAT). (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_VMEM" block=SQ event=30 descr="Number of VMEM instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_SALU" block=SQ event=31 descr="Number of SALU instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_SMEM" block=SQ event=32 descr="Number of SMEM instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_FLAT" block=SQ event=33 descr="Number of FLAT instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_FLAT_LDS_ONLY" block=SQ event=34 descr="Number of FLAT instructions issued that read/wrote only from/to LDS (only works if EARLY_TA_DONE is enabled). (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_LDS" block=SQ event=35 descr="Number of LDS instructions issued (including FLAT). (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_GDS" block=SQ event=36 descr="Number of GDS instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_EXP_GDS" block=SQ event=38 descr="Number of EXP and GDS instructions issued, excluding skipped export instructions. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_BRANCH" block=SQ event=39 descr="Number of Branch instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_SENDMSG" block=SQ event=40 descr="Number of Sendmsg instructions issued. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_VSKIPPED" block=SQ event=41 descr="Number of vector instructions skipped. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INST_LEVEL_VMEM" block=SQ event=42 descr="Number of in-flight VMEM instructions. Set next counter to ACCUM_PREV and divide by INSTS_VMEM for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_INST_LEVEL_SMEM" block=SQ event=43 descr="Number of in-flight SMEM instructions (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). Set next counter to ACCUM_PREV and divide by INSTS_SMEM for average latency per smem request. Falls slightly short of total request latency because some fetches are divided into two requests that may finish at different times and this counter collects the average latency of the two. (per-simd, level, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_INST_LEVEL_LDS" block=SQ event=44 descr="Number of in-flight LDS instructions. Set next counter to ACCUM_PREV and divide by INSTS_LDS for average latency. Includes FLAT instructions. (per-simd, level, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_WAVE_CYCLES" block=SQ event=47 descr="Number of wave-cycles spent by waves in the CUs (per-simd, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_WAIT_ANY" block=SQ event=58 descr="Number of wave-cycles spent waiting for anything (per-simd, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_WAIT_INST_ANY" block=SQ event=61 descr="Number of wave-cycles spent waiting for any instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_WAIT_INST_LDS" block=SQ event=64 descr="Number of wave-cycles spent waiting for LDS instruction issue. In units of 4 cycles. (per-simd, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_ANY" block=SQ event=69 descr="Number of cycles each wave is working on an instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_VMEM" block=SQ event=70 descr="Number of cycles the SQ instruction arbiter is working on a VMEM instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_LDS" block=SQ event=71 descr="Number of cycles the SQ instruction arbiter is working on a LDS instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_VALU" block=SQ event=72 descr="Number of cycles the SQ instruction arbiter is working on a VALU instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_SCA" block=SQ event=73 descr="Number of cycles the SQ instruction arbiter is working on a SALU or SMEM instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_EXP_GDS" block=SQ event=74 descr="Number of cycles the SQ instruction arbiter is working on an EXPORT or GDS instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_MISC" block=SQ event=75 descr="Number of cycles the SQ instruction aribter is working on a BRANCH or SENDMSG instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACTIVE_INST_FLAT" block=SQ event=76 descr="Number of cycles the SQ instruction arbiter is working on a FLAT instruction. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INST_CYCLES_VMEM_WR" block=SQ event=77 descr="Number of cycles needed to send addr and cmd data for VMEM write instructions. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INST_CYCLES_VMEM_RD" block=SQ event=78 descr="Number of cycles needed to send addr and cmd data for VMEM read instructions. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INST_CYCLES_SMEM" block=SQ event=84 descr="Number of cycles needed to execute scalar memory reads. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_INST_CYCLES_SALU" block=SQ event=85 descr="Number of cycles needed to execute non-memory read scalar operations. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_THREAD_CYCLES_VALU" block=SQ event=86 descr="Number of thread-cycles used to execute VALU operations (similar to INST_CYCLES_VALU but multiplied by # of active threads). (per-simd)"
-  ></metric>
-  <metric
-    name="SQ_IFETCH" block=SQ event=88 descr="Number of instruction fetch requests from cache. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_IFETCH_LEVEL" block=SQ event=89 descr="Number of instruction fetch requests from cache. (per-simd, level)"
-  ></metric>
-  <metric
-    name="SQ_LDS_BANK_CONFLICT" block=SQ event=94 descr="Number of cycles LDS is stalled by bank conflicts. (emulated)"
-  ></metric>
-  <metric
-    name="SQ_LDS_ADDR_CONFLICT" block=SQ event=95 descr="Number of cycles LDS is stalled by address conflicts. (emulated,nondeterministic)"
-  ></metric>
-  <metric
-    name="SQ_LDS_UNALIGNED_STALL" block=SQ event=96 descr="Number of cycles LDS is stalled processing flat unaligned load/store ops. (emulated)"
-  ></metric>
-  <metric
-    name="SQ_LDS_MEM_VIOLATIONS" block=SQ event=97 descr="Number of threads that have a memory violation in the LDS.(emulated)"
-  ></metric>
-  <metric
-    name="SQ_LDS_ATOMIC_RETURN" block=SQ event=98 descr="Number of atomic return cycles in LDS. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_LDS_IDX_ACTIVE" block=SQ event=99 descr="Number of cycles LDS is used for indexed (non-direct,non-interpolation) operations. (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQ_ACCUM_PREV_HIRES" block=SQ event=158 descr="For counter N, increment by the value of counter N-1."
-  ></metric>
-  <metric
-    name="SQ_WAVES_RESTORED" block=SQ event=159 descr="Count number of context-restored waves sent to SQs. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_WAVES_SAVED" block=SQ event=160 descr="Count number of context-saved waves. (per-simd, emulated, global)"
-  ></metric>
-  <metric
-    name="SQ_INSTS_SMEM_NORM" block=SQ event=161 descr="Number of SMEM instructions issued normalized to match smem_level (*2 load/store; *2 atomic; *2 memtime; *4 wb/inv). (per-simd, emulated)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_INPUT_VALID_READYB" block=SQ event=260 descr="Input stalled by SQC (per-SQ, nondeterministic, unwindowed)"
-  ></metric>
-  <metric
-    name="SQC_TC_REQ" block=SQ event=262 descr="Total number of TC requests that were issued by instruction and constant caches. (No-Masking, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_TC_INST_REQ" block=SQ event=263 descr="Number of insruction requests to the TC (No-Masking, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_TC_DATA_READ_REQ" block=SQ event=264 descr="Number of data read requests to the TC (No-Masking, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_TC_DATA_WRITE_REQ" block=SQ event=265 descr="Number of data write requests to the TC (No-Masking, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_TC_DATA_ATOMIC_REQ" block=SQ event=266 descr="Number of data atomic requests to the TC (No-Masking, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_TC_STALL" block=SQ event=267 descr="Valid request stalled TC request interface (no-credits). (No-Masking, nondeterministic, unwindowed)"
-  ></metric>
-  <metric
-    name="SQC_ICACHE_REQ" block=SQ event=270 descr="Number of requests. (per-SQ, per-Bank)"
-  ></metric>
-  <metric
-    name="SQC_ICACHE_HITS" block=SQ event=271 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_ICACHE_MISSES" block=SQ event=272 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_ICACHE_MISSES_DUPLICATE" block=SQ event=273 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ" block=SQ event=290 descr="Number of requests (post-bank-serialization). (per-SQ, per-Bank)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_HITS" block=SQ event=291 descr="Number of cache hits. (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_MISSES" block=SQ event=292 descr="Number of cache misses, includes uncached requests. (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_MISSES_DUPLICATE" block=SQ event=293 descr="Number of misses that were duplicates (access to a non-resident, miss pending CL). (per-SQ, per-Bank, nondeterministic)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_ATOMIC" block=SQ event=298 descr="Number of atomic requests. (per-SQ, per-Bank)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ_READ_1" block=SQ event=323 descr="Number of constant cache 1 dw read requests. (per-SQ)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ_READ_2" block=SQ event=324 descr="Number of constant cache 2 dw read requests. (per-SQ)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ_READ_4" block=SQ event=325 descr="Number of constant cache 4 dw read requests. (per-SQ)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ_READ_8" block=SQ event=326 descr="Number of constant cache 8 dw read requests. (per-SQ)"
-  ></metric>
-  <metric
-    name="SQC_DCACHE_REQ_READ_16" block=SQ event=327 descr="Number of constant cache 16 dw read requests. (per-SQ)"
-  ></metric>
-  # TA counters
-  <metric
-    name="TA_TA_BUSY" block=TA event=15 descr="TA block is busy. Perf_Windowing not supported for this counter."
-  ></metric>
-  <metric
-    name="TA_TOTAL_WAVEFRONTS" block=TA event=32 descr="Total number of wavefronts processed by TA."
-  ></metric>
-  <metric
-    name="TA_BUFFER_WAVEFRONTS" block=TA event=44 descr="Number of buffer wavefronts processed by TA."
-  ></metric>
-  <metric
-    name="TA_BUFFER_READ_WAVEFRONTS" block=TA event=45 descr="Number of buffer read wavefronts processed by TA."
-  ></metric>
-  <metric
-    name="TA_BUFFER_WRITE_WAVEFRONTS" block=TA event=46 descr="Number of buffer write wavefronts processed by TA."
-  ></metric>
-  <metric
-    name="TA_BUFFER_ATOMIC_WAVEFRONTS" block=TA event=47 descr="Number of buffer atomic wavefronts processed by TA."
-  ></metric>
-  <metric
-    name="TA_BUFFER_TOTAL_CYCLES" block=TA event=49 descr="Number of buffer cycles issued to TC."
-  ></metric>
-  <metric
-    name="TA_BUFFER_COALESCED_READ_CYCLES" block=TA event=52 descr="Number of buffer coalesced read cycles issued to TC."
-  ></metric>
-  <metric
-    name="TA_BUFFER_COALESCED_WRITE_CYCLES" block=TA event=53 descr="Number of buffer coalesced write cycles issued to TC."
-  ></metric>
-  <metric
-    name="TA_ADDR_STALLED_BY_TC_CYCLES" block=TA event=54 descr="Number of cycles addr path stalled by TC. Perf_Windowing not supported for this counter."
-  ></metric>
-  <metric
-    name="TA_ADDR_STALLED_BY_TD_CYCLES" block=TA event=55 descr="Number of cycles addr path stalled by TD. Perf_Windowing not supported for this counter."
-  ></metric>
-  <metric
-    name="TA_DATA_STALLED_BY_TC_CYCLES" block=TA event=56 descr="Number of cycles data path stalled by TC. Perf_Windowing not supported for this counter."
-  ></metric>
-  <metric
-    name="TA_FLAT_WAVEFRONTS" block=TA event=100 descr="Number of flat opcode wavefronts processed by the TA."
-  ></metric>
-  <metric
-    name="TA_FLAT_READ_WAVEFRONTS" block=TA event=101 descr="Number of flat opcode reads processed by the TA."
-  ></metric>
-  <metric
-    name="TA_FLAT_WRITE_WAVEFRONTS" block=TA event=102 descr="Number of flat opcode writes processed by the TA."
-  ></metric>
-  <metric
-    name="TA_FLAT_ATOMIC_WAVEFRONTS" block=TA event=103 descr="Number of flat opcode atomics processed by the TA."
-  ></metric>
-  # TCA counters
-  <metric
-    name="TCA_CYCLE" block=TCA event=1 descr="Number of cycles. Not windowable."
-  ></metric>
-  <metric
-    name="TCA_BUSY" block=TCA event=2 descr="Number of cycles we have a request pending. Not windowable."
-  ></metric>
-  # TCC counters
-  <metric
-    name="TCC_CYCLE" block=TCC event=1 descr="Number of cycles. Not windowable."
-  ></metric>
-  <metric
-    name="TCC_BUSY" block=TCC event=2 descr="Number of cycles we have a request pending. Not windowable."
-  ></metric>
-  <metric
-    name="TCC_REQ" block=TCC event=3 descr="Number of requests of all types. This is measured at the tag block. This may be more than the number of requests arriving at the TCC, but it is a good indication of the total amount of work that needs to be performed."
-  ></metric>
-  <metric
-    name="TCC_STREAMING_REQ" block=TCC event=4 descr="Number of streaming requests. This is measured at the tag block."
-  ></metric>
-  <metric
-    name="TCC_NC_REQ" block=TCC event=5 descr="The number of noncoherently cached requests. This is measured at the tag block."
-  ></metric>
-  <metric
-    name="TCC_UC_REQ" block=TCC event=6 descr="The number of uncached requests. This is measured at the tag block."
-  ></metric>
-  <metric
-    name="TCC_CC_REQ" block=TCC event=7 descr="The number of coherently cached requests. This is measured at the tag block."
-  ></metric>
-  <metric
-    name="TCC_RW_REQ" block=TCC event=8 descr="The number of RW requests. This is measured at the tag block."
-  ></metric>
-  <metric
-    name="TCC_PROBE" block=TCC event=9 descr="Number of probe requests. Not windowable."
-  ></metric>
-  <metric
-    name="TCC_PROBE_ALL" block=TCC event=10 descr="Number of external probe requests with EA_TCC_preq_all== 1. Not windowable."
-  ></metric>
-  <metric
-    name="TCC_READ" block=TCC event=12 descr="Number of read requests. Compressed reads are included in this, but metadata reads are not included."
-  ></metric>
-  <metric
-    name="TCC_WRITE" block=TCC event=13 descr="Number of write requests."
-  ></metric>
-  <metric
-    name="TCC_ATOMIC" block=TCC event=14 descr="Number of atomic requests of all types."
-  ></metric>
-  <metric
-    name="TCC_HIT" block=TCC event=17 descr="Number of cache hits."
-  ></metric>
-  <metric
-    name="TCC_MISS" block=TCC event=19 descr="Number of cache misses. UC reads count as misses."
-  ></metric>
-  <metric
-    name="TCC_WRITEBACK" block=TCC event=22 descr="Number of lines written back to main memory. This includes writebacks of dirty lines and uncached write/atomic requests."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ" block=TCC event=26 descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Atomics may travel over the same interface and are generally classified as write requests. This does not include probe commands."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_64B" block=TCC event=27 descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface."
-  ></metric>
-  <metric
-    name="TCC_EA_WR_UNCACHED_32B" block=TCC event=29 descr="Number of 32-byte write/atomic going over the TC_EA_wrreq interface due to uncached traffic. Note that CC mtypes can produce uncached requests, and those are included in this. A 64-byte request will be counted as 2"
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_STALL" block=TCC event=30 descr="Number of cycles a write request was stalled."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_IO_CREDIT_STALL" block=TCC event=31 descr="Number of cycles a EA write request was stalled because the interface was out of IO credits."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_GMI_CREDIT_STALL" block=TCC event=32 descr="Number of cycles a EA write request was stalled because the interface was out of GMI credits."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_DRAM_CREDIT_STALL" block=TCC event=33 descr="Number of cycles a EA write request was stalled because the interface was out of DRAM credits."
-  ></metric>
-  <metric
-    name="TCC_TOO_MANY_EA_WRREQS_STALL" block=TCC event=34 descr="Number of cycles the TCC could not send a EA write request because it already reached its maximum number of pending EA write requests."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_LEVEL" block=TCC event=35 descr="The sum of the number of EA write requests in flight. This is primarily meant for measuring average EA write latency. Average write latency = TCC_PERF_SEL_EA_WRREQ_LEVEL/TCC_PERF_SEL_EA_WRREQ."
-  ></metric>
-  <metric
-    name="TCC_EA_ATOMIC" block=TCC event=36 descr="Number of transactions going over the TC_EA_wrreq interface that are actually atomic requests."
-  ></metric>
-  <metric
-    name="TCC_EA_ATOMIC_LEVEL" block=TCC event=37 descr="The sum of the number of EA atomics in flight. This is primarily meant for measuring average EA atomic latency. Average atomic latency = TCC_PERF_SEL_EA_WRREQ_ATOMIC_LEVEL/TCC_PERF_SEL_EA_WRREQ_ATOMIC."
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ" block=TCC event=38 descr="Number of TCC/EA read requests (either 32-byte or 64-byte)"
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_32B" block=TCC event=39 descr="Number of 32-byte TCC/EA read requests"
-  ></metric>
-  <metric
-    name="TCC_EA_RD_UNCACHED_32B" block=TCC event=40 descr="Number of 32-byte TCC/EA read due to uncached traffic. A 64-byte request will be counted as 2"
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_IO_CREDIT_STALL" block=TCC event=41 descr="Number of cycles there was a stall because the read request interface was out of IO credits. Stalls occur regardless of whether a read needed to be performed or not."
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_GMI_CREDIT_STALL" block=TCC event=42 descr="Number of cycles there was a stall because the read request interface was out of GMI credits. Stalls occur regardless of whether a read needed to be performed or not."
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_DRAM_CREDIT_STALL" block=TCC event=43 descr="Number of cycles there was a stall because the read request interface was out of DRAM credits. Stalls occur regardless of whether a read needed to be performed or not."
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_LEVEL" block=TCC event=44 descr="The sum of the number of TCC/EA read requests in flight. This is primarily meant for measuring average EA read latency. Average read latency = TCC_PERF_SEL_EA_RDREQ_LEVEL/TCC_PERF_SEL_EA_RDREQ."
-  ></metric>
-  <metric
-    name="TCC_TAG_STALL" block=TCC event=45 descr="Number of cycles the normal request pipeline in the tag was stalled for any reason. Normally, stalls of this nature are measured exactly from one point in the pipeline, but that is not the case for this counter. Probes can stall the pipeline at a variety of places, and there is no single point that can reasonably measure the total stalls accurately."
-  ></metric>
-  <metric
-    name="TCC_NORMAL_WRITEBACK" block=TCC event=68 descr="Number of writebacks due to requests that are not writeback requests."
-  ></metric>
-  <metric
-    name="TCC_ALL_TC_OP_WB_WRITEBACK" block=TCC event=73 descr="Number of writebacks due to all TC_OP writeback requests."
-  ></metric>
-  <metric
-    name="TCC_NORMAL_EVICT" block=TCC event=74 descr="Number of evictions due to requests that are not invalidate or probe requests."
-  ></metric>
-  <metric
-    name="TCC_ALL_TC_OP_INV_EVICT" block=TCC event=80 descr="Number of evictions due to all TC_OP invalidate requests."
-  ></metric>
-  <metric
-    name="TCC_EA_RDREQ_DRAM" block=TCC event=102 descr="Number of TCC/EA read requests (either 32-byte or 64-byte) destined for DRAM (MC)."
-  ></metric>
-  <metric
-    name="TCC_EA_WRREQ_DRAM" block=TCC event=103 descr="Number of TCC/EA write requests (either 32-byte or 64-byte) destined for DRAM (MC)."
-  ></metric>
-  <metric
-    name="TCC_CLIENT184_REQ" block=TCC event=312 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT185_REQ" block=TCC event=313 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT186_REQ" block=TCC event=314 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT187_REQ" block=TCC event=315 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT188_REQ" block=TCC event=316 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT189_REQ" block=TCC event=317 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT190_REQ" block=TCC event=318 descr=""
-  ></metric>
-  <metric
-    name="TCC_CLIENT191_REQ" block=TCC event=319 descr=""
-  ></metric>
-  # TCP counters
-  <metric
-    name="TCP_GATE_EN1" block=TCP event=0 descr="TCP interface clocks are turned on. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_GATE_EN2" block=TCP event=1 descr="TCP core clocks are turned on. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_TCP_TA_DATA_STALL_CYCLES" block=TCP event=6 descr="TCP stalls TA data interface. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_TD_TCP_STALL_CYCLES" block=TCP event=7 descr="TD stalls TCP"
-  ></metric>
-  <metric
-    name="TCP_TCR_TCP_STALL_CYCLES" block=TCP event=8 descr="TCR stalls TCP_TCR_req interface"
-  ></metric>
-  <metric
-    name="TCP_READ_TAGCONFLICT_STALL_CYCLES" block=TCP event=11 descr="Tagram conflict stall on a read"
-  ></metric>
-  <metric
-    name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES" block=TCP event=12 descr="Tagram conflict stall on a write"
-  ></metric>
-  <metric
-    name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES" block=TCP event=13 descr="Tagram conflict stall on an atomic"
-  ></metric>
-  <metric
-    name="TCP_PENDING_STALL_CYCLES" block=TCP event=22 descr="Stall due to data pending from L2"
-  ></metric>
-  <metric
-    name="TCP_TA_TCP_STATE_READ" block=TCP event=27 descr="Number of state reads"
-  ></metric>
-  <metric
-    name="TCP_VOLATILE" block=TCP event=28 descr="Total number of L1 volatile pixels/buffers from TA"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_ACCESSES" block=TCP event=29 descr="Total number of pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_READ+TCP_PERF_SEL_TOTAL_NONREAD"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_READ" block=TCP event=30 descr="Total number of read pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_HIT_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_LRU_READ + TCP_PERF_SEL_TOTAL_MISS_EVICT_READ"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_WRITE" block=TCP event=32 descr="Total number of local write pixels/buffers from TA. Equals TCP_PERF_SEL_TOTAL_MISS_LRU_WRITE+ TCP_PERF_SEL_TOTAL_MISS_EVICT_WRITE"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_ATOMIC_WITH_RET" block=TCP event=38 descr="Total number of atomic with return pixels/buffers from TA"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_ATOMIC_WITHOUT_RET" block=TCP event=39 descr="Total number of atomic without return pixels/buffers from TA"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_WRITEBACK_INVALIDATES" block=TCP event=45 descr="Total number of cache invalidates. Equals TCP_PERF_SEL_TOTAL_WBINVL1+ TCP_PERF_SEL_TOTAL_WBINVL1_VOL+ TCP_PERF_SEL_CP_TCP_INVALIDATE+ TCP_PERF_SEL_SQ_TCP_INVALIDATE_VOL. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_UTCL1_REQUEST" block=TCP event=47 descr="Total CLIENT_UTCL1 NORMAL requests"
-  ></metric>
-  <metric
-    name="TCP_UTCL1_TRANSLATION_MISS" block=TCP event=48 descr="Total utcl1 translation misses"
-  ></metric>
-  <metric
-    name="TCP_UTCL1_TRANSLATION_HIT" block=TCP event=49 descr="Total utcl1 translation hits"
-  ></metric>
-  <metric
-    name="TCP_UTCL1_PERMISSION_MISS" block=TCP event=50 descr="Total utcl1 permission misses"
-  ></metric>
-  <metric
-    name="TCP_TOTAL_CACHE_ACCESSES" block=TCP event=60 descr="Count of total cache line (tag) accesses (includes hits and misses)."
-  ></metric>
-  <metric
-    name="TCP_TCP_LATENCY" block=TCP event=65 descr="Total TCP wave latency (from first clock of wave entering to first clock of wave leaving), divide by TA_TCP_STATE_READ to get the average wave latency"
-  ></metric>
-  <metric
-    name="TCP_TCC_READ_REQ_LATENCY" block=TCP event=66 descr="Total TCP->TCC request latency for reads and atomics with return. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_TCC_WRITE_REQ_LATENCY" block=TCP event=67 descr="Total TCP->TCC request latency for writes and atomics without return. Not Windowed."
-  ></metric>
-  <metric
-    name="TCP_TCC_READ_REQ" block=TCP event=69 descr="Total read requests from TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_WRITE_REQ" block=TCP event=70 descr="Total write requests from TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_ATOMIC_WITH_RET_REQ" block=TCP event=71 descr="Total atomic with return requests from TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ" block=TCP event=72 descr="Total atomic without return requests from TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_NC_READ_REQ" block=TCP event=75 descr="Total read requests with NC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_NC_WRITE_REQ" block=TCP event=76 descr="Total write requests with NC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_NC_ATOMIC_REQ" block=TCP event=77 descr="Total atomic requests with NC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_UC_READ_REQ" block=TCP event=78 descr="Total read requests with UC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_UC_WRITE_REQ" block=TCP event=79 descr="Total write requests with UC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_UC_ATOMIC_REQ" block=TCP event=80 descr="Total atomic requests with UC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_CC_READ_REQ" block=TCP event=81 descr="Total read requests with CC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_CC_WRITE_REQ" block=TCP event=82 descr="Total write requests with CC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_CC_ATOMIC_REQ" block=TCP event=83 descr="Total atomic requests with CC mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_RW_READ_REQ" block=TCP event=85 descr="Total read requests with RW mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_RW_WRITE_REQ" block=TCP event=86 descr="Total write requests with RW mtype from this TCP to all TCCs"
-  ></metric>
-  <metric
-    name="TCP_TCC_RW_ATOMIC_REQ" block=TCP event=87 descr="Total atomic requests with RW mtype from this TCP to all TCCs"
-  ></metric>
-  # TD counters
-  <metric
-    name="TD_TD_BUSY" block=TD event=1 descr="TD is processing or waiting for data. Perf_Windowing not supported for this counter."
-  ></metric>
-  <metric
-    name="TD_TC_STALL" block=TD event=15 descr="TD is stalled waiting for TC data."
-  ></metric>
-  <metric
-    name="TD_RESERVED_18" block=TD event=18 descr="RESERVED_18"
-  ></metric>
-  <metric
-    name="TD_LOAD_WAVEFRONT" block=TD event=25 descr="Count the wavefronts with opcode = load, include atomics and store."
-  ></metric>
-  <metric
-    name="TD_ATOMIC_WAVEFRONT" block=TD event=26 descr="Count the wavefronts with opcode = atomic."
-  ></metric>
-  <metric
-    name="TD_STORE_WAVEFRONT" block=TD event=27 descr="Count the wavefronts with opcode = store."
-  ></metric>
-  <metric
-    name="TD_COALESCABLE_WAVEFRONT" block=TD event=32 descr="Count wavefronts that TA finds coalescable."
-  ></metric>
-</gfx908>
@@ -1,163 +0,0 @@
-#include "gfx908_metrics.xml"
-
-<gfx9_expr>
-  <metric name="TA_BUSY_avr" expr=avr(TA_TA_BUSY,16) descr="TA block is busy. Average over TA instances."></metric>
-  <metric name="TA_BUSY_max" expr=max(TA_TA_BUSY,16) descr="TA block is busy. Max over TA instances."></metric>
-  <metric name="TA_BUSY_min" expr=min(TA_TA_BUSY,16) descr="TA block is busy. Min over TA instances."></metric>
-  <metric name="TA_FLAT_READ_WAVEFRONTS_sum" expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="Number of flat opcode reads processed by the TA. Sum over TA instances."></metric>
-  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum" expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="Number of flat opcode writes processed by the TA. Sum over TA instances."></metric>
-
-  <metric name="TCC_BUSY_avr" expr=avr(TCC_BUSY,16) descr="TCC_BUSY avr over all memory channels."></metric>
-  <metric name="TCC_REQ_sum" expr=sum(TCC_REQ,16) descr="TCC_REQ sum over all memory channels."></metric>
-  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,16) descr="Number of cache hits. Sum over TCC instances."></metric>
-  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,16) descr="Number of cache misses. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,16) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,16) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
-  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,16) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,16) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
-  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,16) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
-
-  <metric name="FETCH_SIZE" expr=(TCC_EA_RDREQ_32B_sum*32+(TCC_EA_RDREQ_sum-TCC_EA_RDREQ_32B_sum)*64)/1024 descr="The total kilobytes fetched from the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
-  <metric name="WRITE_SIZE" expr=((TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum)*32+TCC_EA_WRREQ_64B_sum*64)/1024 descr="The total kilobytes written to the video memory. This is measured with all extra fetches and any cache or memory effects taken into account."></metric>
-  <metric name="WRITE_REQ_32B" expr=TCC_EA_WRREQ_64B_sum*2+(TCC_EA_WRREQ_sum-TCC_EA_WRREQ_64B_sum) descr="The total number of 32-byte effective memory writes."></metric>
-
-
-
- #xlu - TA
-  <metric name="TA_TA_BUSY_sum"                                        expr=sum(TA_TA_BUSY,16) descr="."></metric>
-
-  <metric name="TA_TOTAL_WAVEFRONTS_sum"                               expr=sum(TA_TOTAL_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_ADDR_STALLED_BY_TC_CYCLES_sum"                      expr=sum(TA_ADDR_STALLED_BY_TC_CYCLES,16) descr="."></metric>
-  <metric name="TA_ADDR_STALLED_BY_TD_CYCLES_sum"                      expr=sum(TA_ADDR_STALLED_BY_TD_CYCLES,16) descr="."></metric>
-  <metric name="TA_DATA_STALLED_BY_TC_CYCLES_sum"                      expr=sum(TA_DATA_STALLED_BY_TC_CYCLES,16) descr="."></metric>
-
-
-
-  <metric name="TA_FLAT_WAVEFRONTS_sum"                                expr=sum(TA_FLAT_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_FLAT_READ_WAVEFRONTS_sum"                           expr=sum(TA_FLAT_READ_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_FLAT_WRITE_WAVEFRONTS_sum"                          expr=sum(TA_FLAT_WRITE_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_FLAT_ATOMIC_WAVEFRONTS_sum"                         expr=sum(TA_FLAT_ATOMIC_WAVEFRONTS,16) descr="."></metric>
-
-  <metric name="TA_BUFFER_WAVEFRONTS_sum"                              expr=sum(TA_BUFFER_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_BUFFER_READ_WAVEFRONTS_sum"                         expr=sum(TA_BUFFER_READ_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_BUFFER_WRITE_WAVEFRONTS_sum"                        expr=sum(TA_BUFFER_WRITE_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_BUFFER_ATOMIC_WAVEFRONTS_sum"                       expr=sum(TA_BUFFER_ATOMIC_WAVEFRONTS,16) descr="."></metric>
-  <metric name="TA_BUFFER_TOTAL_CYCLES_sum"                            expr=sum(TA_BUFFER_TOTAL_CYCLES,16) descr="."></metric>
-
-  <metric name="TA_BUFFER_COALESCED_READ_CYCLES_sum"                   expr=sum(TA_BUFFER_COALESCED_READ_CYCLES,16) descr="."></metric>
-  <metric name="TA_BUFFER_COALESCED_WRITE_CYCLES_sum"                  expr=sum(TA_BUFFER_COALESCED_WRITE_CYCLES,16) descr="."></metric>
-
-  #xlu -TD
-  <metric name="TD_TD_BUSY_sum"                                         expr=sum(TD_TD_BUSY,16) descr="."></metric>
-  <metric name="TD_TC_STALL_sum"                                        expr=sum(TD_TC_STALL,16) descr="."></metric>
-  <metric name="TD_LOAD_WAVEFRONT_sum"                                  expr=sum(TD_LOAD_WAVEFRONT,16) descr="."></metric>
-  <metric name="TD_ATOMIC_WAVEFRONT_sum"                                expr=sum(TD_ATOMIC_WAVEFRONT,16) descr="."></metric>
-  <metric name="TD_STORE_WAVEFRONT_sum"                                 expr=sum(TD_STORE_WAVEFRONT,16) descr="."></metric>
-
-  <metric name="TD_COALESCABLE_WAVEFRONT_sum"                             expr=sum(TD_COALESCABLE_WAVEFRONT,16) descr="."></metric>
-
-    #xlu -TCP
-  <metric name="TCP_GATE_EN1_sum"                                       expr=sum(TCP_GATE_EN1,16) descr="."></metric>
-  <metric name="TCP_GATE_EN2_sum"                                       expr=sum(TCP_GATE_EN2,16) descr="."></metric>
-  <metric name="TCP_TCP_TA_DATA_STALL_CYCLES_sum"                       expr=sum(TCP_TCP_TA_DATA_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_TD_TCP_STALL_CYCLES_sum"                            expr=sum(TCP_TD_TCP_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_TCR_TCP_STALL_CYCLES_sum"                           expr=sum(TCP_TCR_TCP_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_READ_TAGCONFLICT_STALL_CYCLES_sum"                  expr=sum(TCP_READ_TAGCONFLICT_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_WRITE_TAGCONFLICT_STALL_CYCLES_sum"                 expr=sum(TCP_WRITE_TAGCONFLICT_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES_sum"                expr=sum(TCP_ATOMIC_TAGCONFLICT_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_PENDING_STALL_CYCLES_sum"                           expr=sum(TCP_PENDING_STALL_CYCLES,16) descr="."></metric>
-  <metric name="TCP_VOLATILE_sum"                                       expr=sum(TCP_VOLATILE,16) descr="."></metric>
-  <metric name="TCP_TOTAL_ACCESSES_sum"                                 expr=sum(TCP_TOTAL_ACCESSES,16) descr="."></metric>
-  <metric name="TCP_TOTAL_READ_sum"                                     expr=sum(TCP_TOTAL_READ,16) descr="."></metric>
-  <metric name="TCP_TOTAL_WRITE_sum"                                    expr=sum(TCP_TOTAL_WRITE,16) descr="."></metric>
-  <metric name="TCP_TOTAL_ATOMIC_WITH_RET_sum"                          expr=sum(TCP_TOTAL_ATOMIC_WITH_RET,16) descr="."></metric>
-  <metric name="TCP_TOTAL_ATOMIC_WITHOUT_RET_sum"                       expr=sum(TCP_TOTAL_ATOMIC_WITHOUT_RET,16) descr="."></metric>
-  <metric name="TCP_TOTAL_WRITEBACK_INVALIDATES_sum"                    expr=sum(TCP_TOTAL_WRITEBACK_INVALIDATES,16) descr="."></metric>
-  <metric name="TCP_UTCL1_REQUEST_sum"                                  expr=sum(TCP_UTCL1_REQUEST,16) descr="."></metric>
-  <metric name="TCP_UTCL1_TRANSLATION_MISS_sum"                         expr=sum(TCP_UTCL1_TRANSLATION_MISS,16) descr="."></metric>
-  <metric name="TCP_UTCL1_TRANSLATION_HIT_sum"                          expr=sum(TCP_UTCL1_TRANSLATION_HIT,16) descr="."></metric>
-  <metric name="TCP_UTCL1_PERMISSION_MISS_sum"                          expr=sum(TCP_UTCL1_PERMISSION_MISS,16) descr="."></metric>
-  <metric name="TCP_TOTAL_CACHE_ACCESSES_sum"                           expr=sum(TCP_TOTAL_CACHE_ACCESSES,16) descr="."></metric>
-  <metric name="TCP_TCP_LATENCY_sum"                                    expr=sum(TCP_TCP_LATENCY,16) descr="."></metric>
-  <metric name="TCP_TA_TCP_STATE_READ_sum"                              expr=sum(TCP_TA_TCP_STATE_READ,16) descr="."></metric>
-  <metric name="TCP_TCC_READ_REQ_LATENCY_sum"                           expr=sum(TCP_TCC_READ_REQ_LATENCY,16) descr="."></metric>
-  <metric name="TCP_TCC_WRITE_REQ_LATENCY_sum"                          expr=sum(TCP_TCC_WRITE_REQ_LATENCY,16) descr="."></metric>
-  <metric name="TCP_TCC_READ_REQ_sum"                                   expr=sum(TCP_TCC_READ_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_WRITE_REQ_sum"                                  expr=sum(TCP_TCC_WRITE_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_ATOMIC_WITH_RET_REQ_sum"                        expr=sum(TCP_TCC_ATOMIC_WITH_RET_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_ATOMIC_WITHOUT_RET_REQ_sum"                     expr=sum(TCP_TCC_ATOMIC_WITHOUT_RET_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_NC_READ_REQ_sum"                                expr=sum(TCP_TCC_NC_READ_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_NC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_NC_WRITE_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_NC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_NC_ATOMIC_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_UC_READ_REQ_sum"                                expr=sum(TCP_TCC_UC_READ_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_UC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_UC_WRITE_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_UC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_UC_ATOMIC_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_CC_READ_REQ_sum"                                expr=sum(TCP_TCC_CC_READ_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_CC_WRITE_REQ_sum"                               expr=sum(TCP_TCC_CC_WRITE_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_CC_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_CC_ATOMIC_REQ,16) descr="."></metric>
-
-</gfx9_expr>
-
-<gfx908_expr base="gfx9_expr">
-  <metric name="TCC_BUSY_avr" expr=avr(TCC_BUSY,32) descr="TCC_BUSY avr over all memory channels."></metric>
-  <metric name="TCC_REQ_sum" expr=sum(TCC_REQ,32) descr="TCC_REQ sum over all memory channels."></metric>
-  <metric name="TCC_HIT_sum" expr=sum(TCC_HIT,32) descr="Number of cache hits. Sum over TCC instances."></metric>
-  <metric name="TCC_MISS_sum" expr=sum(TCC_MISS,32) descr="Number of cache misses. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_RDREQ_32B_sum" expr=sum(TCC_EA_RDREQ_32B,32) descr="Number of 32-byte TCC/EA read requests. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_RDREQ_sum" expr=sum(TCC_EA_RDREQ,32) descr="Number of TCC/EA read requests (either 32-byte or 64-byte). Sum over TCC instances."></metric>
-  <metric name="TCC_EA_WRREQ_sum" expr=sum(TCC_EA_WRREQ,32) descr="Number of transactions (either 32-byte or 64-byte) going over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
-  <metric name="TCC_EA_WRREQ_64B_sum" expr=sum(TCC_EA_WRREQ_64B,32) descr="Number of 64-byte transactions going (64-byte write or CMPSWAP) over the TC_EA_wrreq interface. Sum over TCC instances."></metric>
-  <metric name="TCC_WRREQ_STALL_max" expr=max(TCC_EA_WRREQ_STALL,32) descr="Number of cycles a write request was stalled. Max over TCC instances."></metric>
-
-  #xlu - TCP
-  <metric name="TCP_TCC_RW_READ_REQ_sum"                                expr=sum(TCP_TCC_RW_READ_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_RW_WRITE_REQ_sum"                               expr=sum(TCP_TCC_RW_WRITE_REQ,16) descr="."></metric>
-  <metric name="TCP_TCC_RW_ATOMIC_REQ_sum"                              expr=sum(TCP_TCC_RW_ATOMIC_REQ,16) descr="."></metric>
-
-    #xlu - TCC
-  <metric name="TCC_CYCLE_sum"                                          expr=sum(TCC_CYCLE,32) descr="."></metric>
-  <metric name="TCC_BUSY_sum"                                           expr=sum(TCC_BUSY,32) descr="."></metric>
-  <metric name="TCC_REQ_sum"                                            expr=sum(TCC_REQ,32) descr="."></metric>
-  <metric name="TCC_STREAMING_REQ_sum"                                  expr=sum(TCC_STREAMING_REQ,32) descr="."></metric>
-  <metric name="TCC_NC_REQ_sum"                                         expr=sum(TCC_NC_REQ,32) descr="."></metric>
-  <metric name="TCC_UC_REQ_sum"                                         expr=sum(TCC_UC_REQ,32) descr="."></metric>
-  <metric name="TCC_CC_REQ_sum"                                         expr=sum(TCC_CC_REQ,32) descr="."></metric>
-  <metric name="TCC_RW_REQ_sum"                                         expr=sum(TCC_RW_REQ,32) descr="."></metric>
-  <metric name="TCC_PROBE_sum"                                          expr=sum(TCC_PROBE,32) descr="."></metric>
-  <metric name="TCC_PROBE_ALL_sum"                                      expr=sum(TCC_PROBE_ALL,32) descr="."></metric>
-  <metric name="TCC_READ_sum"                                           expr=sum(TCC_READ,32) descr="."></metric>
-  <metric name="TCC_WRITE_sum"                                          expr=sum(TCC_WRITE,32) descr="."></metric>
-  <metric name="TCC_ATOMIC_sum"                                         expr=sum(TCC_ATOMIC,32) descr="."></metric>
-  <metric name="TCC_HIT_sum"                                            expr=sum(TCC_HIT,32) descr="."></metric>
-  <metric name="TCC_MISS_sum"                                           expr=sum(TCC_MISS,32) descr="."></metric>
-  <metric name="TCC_TAG_STALL_sum"                                      expr=sum(TCC_TAG_STALL,32) descr="."></metric>
-  <metric name="TCC_WRITEBACK_sum"                                      expr=sum(TCC_WRITEBACK,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_sum"                                       expr=sum(TCC_EA_WRREQ,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_64B_sum"                                   expr=sum(TCC_EA_WRREQ_64B,32) descr="."></metric>
-  <metric name="TCC_EA_WR_UNCACHED_32B_sum"                             expr=sum(TCC_EA_WR_UNCACHED_32B,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_STALL_sum"                                 expr=sum(TCC_EA_WRREQ_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA_WRREQ_IO_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA_WRREQ_GMI_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA_WRREQ_DRAM_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_TOO_MANY_EA_WRREQS_STALL_sum"                       expr=sum(TCC_TOO_MANY_EA_WRREQS_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_LEVEL_sum"                                 expr=sum(TCC_EA_WRREQ_LEVEL,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_LEVEL_sum"                                 expr=sum(TCC_EA_RDREQ_LEVEL,32) descr="."></metric>
-  <metric name="TCC_EA_ATOMIC_sum"                                      expr=sum(TCC_EA_ATOMIC,32) descr="."></metric>
-  <metric name="TCC_EA_ATOMIC_LEVEL_sum"                                expr=sum(TCC_EA_ATOMIC_LEVEL,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_sum"                                       expr=sum(TCC_EA_RDREQ,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_32B_sum"                                   expr=sum(TCC_EA_RDREQ_32B,32) descr="."></metric>
-  <metric name="TCC_EA_RD_UNCACHED_32B_sum"                             expr=sum(TCC_EA_RD_UNCACHED_32B,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_IO_CREDIT_STALL_sum"                       expr=sum(TCC_EA_RDREQ_IO_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_GMI_CREDIT_STALL_sum"                      expr=sum(TCC_EA_RDREQ_GMI_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_DRAM_CREDIT_STALL_sum"                     expr=sum(TCC_EA_RDREQ_DRAM_CREDIT_STALL,32) descr="."></metric>
-  <metric name="TCC_NORMAL_WRITEBACK_sum"                               expr=sum(TCC_NORMAL_WRITEBACK,32) descr="."></metric>
-  <metric name="TCC_ALL_TC_OP_WB_WRITEBACK_sum"                         expr=sum(TCC_ALL_TC_OP_WB_WRITEBACK,32) descr="."></metric>
-  <metric name="TCC_NORMAL_EVICT_sum"                                   expr=sum(TCC_NORMAL_EVICT,32) descr="."></metric>
-  <metric name="TCC_ALL_TC_OP_INV_EVICT_sum"                            expr=sum(TCC_ALL_TC_OP_INV_EVICT,32) descr="."></metric>
-  <metric name="TCC_EA_RDREQ_DRAM_sum"                                  expr=sum(TCC_EA_RDREQ_DRAM,32) descr="."></metric>
-  <metric name="TCC_EA_WRREQ_DRAM_sum"                                  expr=sum(TCC_EA_WRREQ_DRAM,32) descr="."></metric>
-
-</gfx908_expr>
-
-<gfx9 base="gfx9_expr"></gfx9>
-<gfx908 base="gfx908_expr"> </gfx908>
@@ -49,14 +49,12 @@ from utils.parser import BUILD_IN_VARS, SUPPORTED_DENOM
 from utils.specs import MachineSpecs
 from utils.utils import (
    add_counter_extra_config_input_yaml,
-    capture_subprocess_output,
    convert_metric_id_to_panel_info,
    detect_rocprof,
    get_submodules,
    is_tcc_channel_counter,
    mibench,
    parse_sets_yaml,
-    using_v3,
 )


@@ -370,8 +368,7 @@ class OmniSoC_Base:
        # Handle TCC channel counters: if hw_counter_matches has elems ending with '['
        # Expand and interleve the TCC channel counters
        # e.g.  TCC_HIT[0] TCC_ATOMIC[0] ... TCC_HIT[1] TCC_ATOMIC[1] ...
-        num_xcd_for_pmc_file = int(self._mspec.num_xcd) if using_v3() else 1
-
+        num_xcd_for_pmc_file = int(self._mspec.num_xcd)
        for counter_name in counters.copy():
            if counter_name.startswith("TCC") and counter_name.endswith("["):
                counters.remove(counter_name)
@@ -388,18 +385,6 @@ class OmniSoC_Base:
        """Filter default performance counter set based on user arguments"""
        counters, filter_blocks = self.detect_counters()

-        if not using_v3():
-            # Counters not supported in rocprof v1 / v2
-            counters = counters - {
-                "SQ_INSTS_VALU_MFMA_F8",
-                "SQ_INSTS_VALU_MFMA_MOPS_F8",
-                "SQC_DCACHE_INFLIGHT_LEVEL",
-                "SQC_ICACHE_INFLIGHT_LEVEL",
-                "SQ_VMEM_WR_TA_DATA_FIFO_FULL",
-                "SQ_VMEM_TA_ADDR_FIFO_FULL",
-                "SQ_VMEM_TA_CMD_FIFO_FULL",
-            }
-
        # TCP_TCP_LATENCY_sum not supported for MI300 (gfx940, gfx941, gfx942)
        if self.__arch in ("gfx940", "gfx941", "gfx942"):
            counters = counters - {"TCP_TCP_LATENCY_sum"}
@@ -467,84 +452,52 @@ class OmniSoC_Base:

        if rocprof_cmd != "rocprofiler-sdk":
            console_warning(
-                "rocprof v1/v2/v3 interfaces will be removed in favor of "
-                "rocprofiler-sdk interface in a future release. To use "
-                "rocprofiler-sdk, set ROCPROF to 'rocprofiler-sdk' and "
-                "optionally provide the path to librocprofiler-sdk.so via "
-                "--rocprofiler-sdk-library-path."
+                "rocprofv3 interface is deprecated and will be removed "
+                "in a future release."
            )

        rocprof_counters: set[str] = set()

-        if rocprof_cmd.endswith("rocprof"):
-            for list_type in ["--list-basic", "--list-derived"]:
-                command = [rocprof_cmd, list_type]
-                success, output = capture_subprocess_output(
-                    command, enable_logging=False
-                )
-                # return code should be 1 so success should be False
-                if success:
-                    console_error(
-                        "Failed to list rocprof supported counters using command: "
-                        f"{command}"
-                    )
-
-                for line in output.splitlines():
-                    if "gpu-agent" in line:
-                        counters, _ = self.parse_counters_text(
-                            line.split(":")[1].strip()
-                        )
-                        rocprof_counters.update(counters)
-        elif rocprof_cmd.endswith("rocprofv2"):
-            command = [rocprof_cmd, "--list-counters"]
-            success, output = capture_subprocess_output(command, enable_logging=False)
-            # return code should be 1 so success should be False
-            if success:
-                console_error(
-                    "Failed to list rocprof supported counters using command: "
-                    f"{command}"
-                )
-
-            for line in output.splitlines():
-                if "gfx" in line:
-                    counters, _ = self.parse_counters_text(line.split(":")[2].strip())
-                    rocprof_counters.update(counters)
-        elif rocprof_cmd.endswith("rocprofv3") or rocprof_cmd == "rocprofiler-sdk":
-            # Point to counter definition
-            old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH")
-            os.environ["ROCPROFILER_METRICS_PATH"] = str(
-                config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
-            )
-            sys.path.append(
-                str(Path(args.rocprofiler_sdk_library_path).parent.parent / "bin")
-            )
-
-            from rocprofv3_avail_module import avail
-
-            avail.loadLibrary.libname = str(
-                Path(args.rocprofiler_sdk_library_path).parent.parent
-                / "lib"
-                / "rocprofiler-sdk"
-                / "librocprofv3-list-avail.so"
-            )
-            counters = avail.get_counters()
-            rocprof_counters = {
-                counter.name
-                for counter in counters[list(counters.keys())[0]]
-                if hasattr(counter, "block") or hasattr(counter, "expression")
-            }
-            # Reset env. var.
-            if old_rocprofiler_metrics_path is None:
-                del os.environ["ROCPROFILER_METRICS_PATH"]
-            else:
-                os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path
-
-        else:
+        if not (
+            str(rocprof_cmd).endswith("rocprofv3")
+            or str(rocprof_cmd) == "rocprofiler-sdk"
+        ):
            console_error(
-                f"Incompatible profiler: {rocprof_cmd}. Supported profilers include: "
+                f"Incompatible profiler: {rocprof_cmd}. "
+                "Supported profilers include: "
                f"{get_submodules('rocprof_compute_profile')}"
            )

+        # Point to counter definition
+        old_rocprofiler_metrics_path = os.environ.get("ROCPROFILER_METRICS_PATH")
+        os.environ["ROCPROFILER_METRICS_PATH"] = str(
+            config.rocprof_compute_home / "rocprof_compute_soc" / "profile_configs"
+        )
+        sys.path.append(
+            str(
+                Path(self.get_args().rocprofiler_sdk_library_path).parent.parent / "bin"
+            )
+        )
+        from rocprofv3_avail_module import avail
+
+        avail.loadLibrary.libname = str(
+            Path(self.get_args().rocprofiler_sdk_library_path).parent.parent
+            / "lib"
+            / "rocprofiler-sdk"
+            / "librocprofv3-list-avail.so"
+        )
+        counters = avail.get_counters()
+        rocprof_counters = {
+            counter.name
+            for counter in counters[list(counters.keys())[0]]
+            if hasattr(counter, "block") or hasattr(counter, "expression")
+        }
+        # Reset env. var.
+        if old_rocprofiler_metrics_path is None:
+            del os.environ["ROCPROFILER_METRICS_PATH"]
+        else:
+            os.environ["ROCPROFILER_METRICS_PATH"] = old_rocprofiler_metrics_path
+
        return rocprof_counters

    @demarcate
@@ -600,14 +553,7 @@ class OmniSoC_Base:
                    CounterFile(counter + ".txt", self.__perfmon_config)
                )
                output_files[-1].add(counter)
-
-                if using_v3():
-                    # v3 does not support SQ_ACCUM_PREV_HIRES. Use custom counters
-                    # defined in counter_defs.yaml that utilize accumulate(),
-                    # with _ACCUM suffix.
-                    output_files[-1].add(f"{counter}_ACCUM")
-                else:
-                    output_files[-1].add("SQ_ACCUM_PREV_HIRES")
+                output_files[-1].add(f"{counter}_ACCUM")
                accu_file_count += 1

        file_count = 0
@@ -708,12 +654,12 @@ class OmniSoC_Base:
                    for ctr in f.blocks[block_name].elements
                ]:
                    pmc.append(ctr)
-                    if using_v3() and is_tcc_channel_counter(ctr):
+                    # Add TCC channel counters definitions
+                    if is_tcc_channel_counter(ctr):
                        counter_name = ctr.split("[")[0]
                        idx = int(ctr.split("[")[1].split("]")[0])
                        xcd_idx = idx // int(self._mspec.l2_banks)
                        channel_idx = idx % int(self._mspec.l2_banks)
-
                        expression = (
                            f"select({counter_name},"
                            f"[DIMENSION_XCC=[{xcd_idx}], "
@@ -743,16 +689,6 @@ class OmniSoC_Base:
                    with open(file_name_yaml, "w") as fp:
                        fp.write(yaml.dump(counter_def, sort_keys=False))

-        # Add a timestamp file
-        # TODO: Does v3 need this?
-        if not using_v3():
-            timestamp_file = workload_perfmon_dir / "timestamps.txt"
-            with open(timestamp_file, "w") as fd:
-                fd.write("pmc:\n\n")
-                fd.write("gpu:\n")
-                fd.write("range:\n")
-                fd.write("kernel:\n")
-
    # ----------------------------------------------------
    # Required methods to be implemented by child classes
    # ----------------------------------------------------
@@ -36,8 +36,7 @@ class gfx908_soc(OmniSoC_Base):
    def __init__(self, args: argparse.Namespace, mspec: MachineSpecs) -> None:
        super().__init__(args, mspec)
        self.set_arch("gfx908")
-
-        self.set_compatible_profilers(["rocprofv1", "rocprofv3", "rocprofiler-sdk"])
+        self.set_compatible_profilers(["rocprofv3", "rocprofiler-sdk"])
        # Per IP block max number of simultaneous counters. GFX IP Blocks
        self.set_perfmon_config(mi_gpu_specs.get_perfmon_config("gfx908"))

@@ -37,8 +37,6 @@ class gfx90a_soc(OmniSoC_Base):
        super().__init__(args, mspec)
        self.set_arch("gfx90a")
        self.set_compatible_profilers([
-            "rocprofv1",
-            "rocprofv2",
            "rocprofv3",
            "rocprofiler-sdk",
        ])
@@ -37,8 +37,6 @@ class gfx940_soc(OmniSoC_Base):
        super().__init__(args, mspec)
        self.set_arch("gfx940")
        self.set_compatible_profilers([
-            "rocprofv1",
-            "rocprofv2",
            "rocprofv3",
            "rocprofiler-sdk",
        ])
@@ -37,8 +37,6 @@ class gfx941_soc(OmniSoC_Base):
        super().__init__(args, mspec)
        self.set_arch("gfx941")
        self.set_compatible_profilers([
-            "rocprofv1",
-            "rocprofv2",
            "rocprofv3",
            "rocprofiler-sdk",
        ])
@@ -37,8 +37,6 @@ class gfx942_soc(OmniSoC_Base):
        super().__init__(args, mspec)
        self.set_arch("gfx942")
        self.set_compatible_profilers([
-            "rocprofv1",
-            "rocprofv2",
            "rocprofv3",
            "rocprofiler-sdk",
        ])
@@ -58,7 +58,6 @@ from utils.logger import (
    console_warning,
    demarcate,
 )
-from utils.mi_gpu_spec import mi_gpu_specs

 rocprof_cmd = ""
 rocprof_args = ""
@@ -144,40 +143,6 @@ def add_counter_extra_config_input_yaml(
    return data


-def extract_counter_info_extra_config_input_yaml(
-    data: dict[str, Any], counter_name: str
-) -> Optional[dict]:
-    """
-    Extract the full counter dictionary from 'data' for the given counter_name.
-
-    Args:
-        data (dict): The source YAML dict.
-        counter_name (str): The counter to find.
-
-    Returns:
-        Optional[dict]: The full counter dict if found, else None.
-    """
-    counters = data.get("rocprofiler-sdk", {}).get("counters", [])
-    for counter in counters:
-        if counter.get("name") == counter_name:
-            return counter
-    return None
-
-
-def using_v1() -> bool:
-    return "ROCPROF" in os.environ.keys() and os.environ["ROCPROF"].endswith("rocprof")
-
-
-def using_v3() -> bool:
-    return "ROCPROF" not in os.environ.keys() or (
-        "ROCPROF" in os.environ.keys()
-        and (
-            os.environ["ROCPROF"].endswith("rocprofv3")
-            or os.environ["ROCPROF"] == "rocprofiler-sdk"
-        )
-    )
-
-
 def get_version(rocprof_compute_home: Path) -> dict[str, str]:
    """Return ROCm Compute Profiler versioning info"""

@@ -240,7 +205,8 @@ def detect_rocprof(args: argparse.Namespace) -> str:
    """Detect loaded rocprof version. Resolve path and set cmd globally."""
    global rocprof_cmd

-    if os.environ.get("ROCPROF") == "rocprofiler-sdk":
+    # Default is rocprofiler-sdk
+    if os.environ.get("ROCPROF", "rocprofiler-sdk") == "rocprofiler-sdk":
        if not Path(args.rocprofiler_sdk_library_path).exists():
            console_error(
                "Could not find rocprofiler-sdk library at "
@@ -249,45 +215,22 @@ def detect_rocprof(args: argparse.Namespace) -> str:
        rocprof_cmd = "rocprofiler-sdk"
        console_debug(f"rocprof_cmd is {rocprof_cmd}")
        console_debug(f"rocprofiler_sdk_path is {args.rocprofiler_sdk_library_path}")
-        return rocprof_cmd
-
-    # detect rocprof
-    if not "ROCPROF" in os.environ.keys():
-        # default rocprof
-        rocprof_cmd = "rocprofv3"
    else:
+        # If ROCPROF is not set to rocprofiler-sdk
        rocprof_cmd = os.environ["ROCPROF"]
-
-    # resolve rocprof path
-    rocprof_path = shutil.which(rocprof_cmd)
-
-    if not rocprof_path:
-        rocprof_cmd = "rocprofv3"
-        console_warning(
-            f"Unable to resolve path to {rocprof_cmd} binary. Reverting to default."
-        )
        rocprof_path = shutil.which(rocprof_cmd)
        if not rocprof_path:
            console_error(
-                "Please verify installation or set ROCPROF environment variable "
-                "with full path."
+                f"Unable to resolve path to {rocprof_cmd} binary. "
+                "Please verify installation or set ROCPROF "
+                "environment variable with full path."
            )
-    else:
-        # Resolve any sym links in file path
        rocprof_path = str(Path(rocprof_path.rstrip("\n")).resolve())
+        console_debug(f"rocprof_cmd is {str(rocprof_cmd)}")
        console_debug(f"ROC Profiler: {rocprof_path}")
-
-    console_debug(f"rocprof_cmd is {rocprof_cmd}")
    return rocprof_cmd


-# TODO: v1/v2 function, to be removed
-def store_app_cmd(args: argparse.Namespace) -> None:
-    global rocprof_args
-    rocprof_args = args
-
-
-@demarcate
 def capture_subprocess_output(
    subprocess_args: list[str],
    new_env: Optional[dict[str, str]] = None,
@@ -766,47 +709,40 @@ def run_prof(
        default_options = ["-i", fname]
        options = default_options + cast(list[str], profiler_options)

-    if using_v3():
-        if rocprof_cmd == "rocprofiler-sdk":
-            options["ROCPROF_AGENT_INDEX"] = "absolute"
-        else:
-            options = ["-A", "absolute"] + options
+    if rocprof_cmd == "rocprofiler-sdk":
+        options["ROCPROF_AGENT_INDEX"] = "absolute"
    else:
-        if is_mode_live_attach:
-            console_error(
-                "The live attach/detach only supports rocprofv3 or rocprofiler-sdk"
-            )
+        options = ["-A", "absolute"] + options

    new_env = os.environ.copy()

-    if using_v3():
-        # Counter definitions
-        with open(
-            config.rocprof_compute_home
-            / "rocprof_compute_soc"
-            / "profile_configs"
-            / "counter_defs.yaml",
-        ) as file:
-            counter_defs = yaml.safe_load(file)
-        # Extra counter definitions
-        if fpath.with_suffix(".yaml").exists():
-            with open(fpath.with_suffix(".yaml")) as file:
-                counter_defs["rocprofiler-sdk"]["counters"].extend(
-                    yaml.safe_load(file)["rocprofiler-sdk"]["counters"]
-                )
-        # Write counter definitions to a temporary file
-        tmpfile_path = (
-            Path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp"))
-            / "counter_defs.yaml"
-        )
-        with open(tmpfile_path, "w") as tmpfile:
-            yaml.dump(counter_defs, tmpfile, default_flow_style=False, sort_keys=False)
-        # Set counter definitions
-        new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent)
-        console_debug(
-            "Adding env var for counter definitions: "
-            f"ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}"
-        )
+    # Counter definitions
+    with open(
+        config.rocprof_compute_home
+        / "rocprof_compute_soc"
+        / "profile_configs"
+        / "counter_defs.yaml",
+    ) as file:
+        counter_defs = yaml.safe_load(file)
+    # Extra counter definitions
+    if Path(fname).with_suffix(".yaml").exists():
+        with open(Path(fname).with_suffix(".yaml")) as file:
+            counter_defs["rocprofiler-sdk"]["counters"].extend(
+                yaml.safe_load(file)["rocprofiler-sdk"]["counters"]
+            )
+    # Write counter definitions to a temporary file
+    tmpfile_path = (
+        Path(tempfile.mkdtemp(prefix="rocprof_counter_defs_", dir="/tmp"))
+        / "counter_defs.yaml"
+    )
+    with open(tmpfile_path, "w") as tmpfile:
+        yaml.dump(counter_defs, tmpfile, default_flow_style=False, sort_keys=False)
+    # Set counter definitions
+    new_env["ROCPROFILER_METRICS_PATH"] = str(tmpfile_path.parent)
+    console_debug(
+        "Adding env var for counter definitions: "
+        f"ROCPROFILER_METRICS_PATH={new_env['ROCPROFILER_METRICS_PATH']}"
+    )

    # set required env var for >= mi300
    if mspec.gpu_model.lower() not in (
@@ -910,92 +846,59 @@ def run_prof(
    results_files: list[str] = []

    if format_rocprof_output == "rocpd":
-        if rocprof_cmd == "rocprofiler-sdk" or rocprof_cmd.endswith("v3"):
-            # Write results_fbase.csv
-            rocpd_data.convert_db_to_csv(
-                glob.glob(f"{workload_dir}/out/pmc_1/*/*.db")[0],
-                f"{workload_dir}/results_{fbase}.csv",
+        # Write results_fbase.csv
+        rocpd_data.convert_db_to_csv(
+            glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
+            workload_dir + f"/results_{fbase}.csv",
+        )
+        if retain_rocpd_output:
+            shutil.copyfile(
+                glob.glob(workload_dir + "/out/pmc_1/*/*.db")[0],
+                workload_dir + "/" + fbase + ".db",
            )
-            if retain_rocpd_output:
-                shutil.copyfile(
-                    glob.glob(f"{workload_dir}/out/pmc_1/*/*.db")[0],
-                    f"{workload_dir}/{fbase}.db",
-                )
-                console_warning(
-                    f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
-                )
-            # Remove temp directory
-            shutil.rmtree(f"{workload_dir}/out")
-            return
-        else:
-            console_error(
-                "rocpd output format is only supported with "
-                "rocprofiler-sdk or rocprofv3."
+            console_warning(
+                f"Retaining large raw rocpd database: {workload_dir}/{fbase}.db"
            )
-    elif rocprof_cmd.endswith("v2"):
-        # rocprofv2 has separate csv files for each process
-        results_files = glob.glob(f"{workload_dir}/out/pmc_1/results_*.csv")
+        # Remove temp directory
+        shutil.rmtree(workload_dir + "/" + "out")
+        return

-        if len(results_files) == 0:
-            return
+    # rocprofv3 requires additional processing for each process
+    results_files = process_rocprofv3_output(
+        format_rocprof_output, workload_dir, is_timestamps
+    )

-        # Combine results into single CSV file
-        combined_results = pd.concat(
-            [pd.read_csv(f) for f in results_files], ignore_index=True
-        )
-
-        # Overwrite column to ensure unique IDs.
-        combined_results["Dispatch_ID"] = range(0, len(combined_results))
-
-        combined_results.to_csv(
-            f"{workload_dir}/out/pmc_1/results_{fbase}.csv", index=False
-        )
-    elif rocprof_cmd.endswith("v3") or rocprof_cmd == "rocprofiler-sdk":
-        # rocprofv3 requires additional processing for each process
-        results_files = process_rocprofv3_output(
-            format_rocprof_output, workload_dir, is_timestamps
-        )
-
-        if rocprof_cmd == "rocprofiler-sdk":
+    if rocprof_cmd == "rocprofiler-sdk":
+        # TODO: as rocprofv3 --kokkos-trace feature improves,
+        # rocprof-compute should make updates accordingly
+        if "ROCPROF_HIP_RUNTIME_API_TRACE" in options:
+            process_hip_trace_output(workload_dir, fbase)
+    else:
+        if "--kokkos-trace" in options:
            # TODO: as rocprofv3 --kokkos-trace feature improves,
            # rocprof-compute should make updates accordingly
-            if "ROCPROF_HIP_RUNTIME_API_TRACE" in options:
-                process_hip_trace_output(workload_dir, fbase)
-        else:
-            if "--kokkos-trace" in options:
-                # TODO: as rocprofv3 --kokkos-trace feature improves,
-                # rocprof-compute should make updates accordingly
-                process_kokkos_trace_output(workload_dir, fbase)
-            elif "--hip-trace" in options:
-                process_hip_trace_output(workload_dir, fbase)
+            process_kokkos_trace_output(workload_dir, fbase)
+        elif "--hip-trace" in options:
+            process_hip_trace_output(workload_dir, fbase)

-        if not results_files:
-            console_warning(
-                f"Cannot write results for {fbase}.csv due to no counter "
-                "csv files generated."
-            )
-            return
-
-        # Combine results into single CSV file
+    # Combine results into single CSV file
+    if results_files:
        combined_results = pd.concat(
            [pd.read_csv(f) for f in results_files], ignore_index=True
        )
-
-        # Overwrite column to ensure unique IDs.
-        combined_results["Dispatch_ID"] = range(0, len(combined_results))
-
-        combined_results.to_csv(
-            f"{workload_dir}/out/pmc_1/results_{fbase}.csv", index=False
+    else:
+        console_warning(
+            f"Cannot write results for {fbase}.csv due to no counter "
+            "csv files generated."
        )
+        return

-    if not using_v3() and not using_v1():
-        # flatten tcc for applicable mi300 input
-        f = f"{workload_dir}/out/pmc_1/results_{fbase}.csv"
-        xcds = mi_gpu_specs.get_num_xcds(
-            mspec.gpu_arch, mspec.gpu_model, mspec.compute_partition
-        )
-        df = flatten_tcc_info_across_xcds(f, xcds, int(mspec.l2_banks))
-        df.to_csv(f, index=False)
+    # Overwrite column to ensure unique IDs.
+    combined_results["Dispatch_ID"] = range(0, len(combined_results))
+
+    combined_results.to_csv(
+        workload_dir + "/out/pmc_1/results_" + fbase + ".csv", index=False
+    )

    if Path(f"{workload_dir}/out").exists():
        # copy and remove out directory if needed
@@ -1226,26 +1129,6 @@ def process_hip_trace_output(workload_dir: str, fbase: str) -> None:
        )


-def replace_timestamps(workload_dir: str) -> None:
-    ts_path = Path(workload_dir) / "timestamps.csv"
-    if not ts_path.is_file():
-        return
-
-    df_stamps = pd.read_csv(ts_path)
-    if "Start_Timestamp" in df_stamps.columns and "End_Timestamp" in df_stamps.columns:
-        # Update timestamps for all *.csv output files
-        for fname in glob.glob(f"{workload_dir}/*.csv"):
-            if Path(fname).name != "sysinfo.csv":
-                df_pmc_perf = pd.read_csv(fname)
-                df_pmc_perf["Start_Timestamp"] = df_stamps["Start_Timestamp"]
-                df_pmc_perf["End_Timestamp"] = df_stamps["End_Timestamp"]
-                df_pmc_perf.to_csv(fname, index=False)
-    else:
-        console_warning(
-            "Incomplete profiling data detected. Unable to update timestamps.\n"
-        )
-
-
@demarcate
 def gen_sysinfo(
    workload_name: str,
@@ -1383,62 +1266,6 @@ def mibench(args: argparse.Namespace, mspec: Any) -> None:  # noqa: ANN401
    subprocess.run(my_args, check=True)


-def flatten_tcc_info_across_xcds(
-    file: str, xcds: int, tcc_channel_per_xcd: int
-) -> pd.DataFrame:
-    """
-    Flatten TCC per channel counters across all XCDs in partition.
-    NB: This func highly depends on the default behavior of rocprofv2 on MI300,
-        which might be broken anytime in the future!
-    """
-    df_orig = pd.read_csv(file)
-
-    ### prepare column headers
-    tcc_cols_orig = []
-    non_tcc_cols_orig = []
-    for c in df_orig.columns.to_list():
-        if "TCC" in c:
-            tcc_cols_orig.append(c)
-        else:
-            non_tcc_cols_orig.append(c)
-
-    cols = non_tcc_cols_orig[:]
-    tcc_cols_in_group: dict[int, list[str]] = {i: [] for i in range(xcds)}
-
-    for col in tcc_cols_orig:
-        for i in range(xcds):
-            # filter the channel index only
-            p = re.compile(r"\[(\d+)\]")
-
-            # pick up the 1st element only
-            def replacement(match: re.Match[str]) -> str:
-                return f"[{int(match.group(1)) + i * tcc_channel_per_xcd}]"
-
-            tcc_cols_in_group[i].append(re.sub(pattern=p, repl=replacement, string=col))
-
-    for i in range(xcds):
-        cols += tcc_cols_in_group[i]
-
-    df = pd.DataFrame(columns=cols)
-
-    ### Rearrange data with extended column names
-    for idx in range(0, len(df_orig.index), xcds):
-        # assume the front none TCC columns are the same for all XCCs
-        df_non_tcc = df_orig.iloc[idx].filter(regex=r"^(?!.*TCC).*$")
-        flatten_list = df_non_tcc.tolist()
-
-        # extract all tcc from one dispatch
-        # NB: assuming default contiguous order might not be safe!
-        df_tcc_all = df_orig.iloc[idx : (idx + xcds)].filter(regex="TCC")
-
-        for idx, row in df_tcc_all.iterrows():
-            flatten_list += row.tolist()
-        # NB: It is not the best perf to append a row once a time
-        df.loc[len(df.index)] = flatten_list
-
-    return df
-
-
 def get_submodules(package_name: str) -> list[str]:
    """List all submodules for a target package"""
    import importlib
@@ -108,7 +108,6 @@ ALL_CSVS_MI200 = sorted([
    "pmc_perf_4.csv",
    "pmc_perf_5.csv",
    "sysinfo.csv",
-    "timestamps.csv",
 ])
 ALL_CSVS_MI300 = sorted([
    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
@@ -126,7 +125,6 @@ ALL_CSVS_MI300 = sorted([
    "pmc_perf_4.csv",
    "pmc_perf_5.csv",
    "sysinfo.csv",
-    "timestamps.csv",
 ])
 ALL_CSVS_MI350 = sorted([
    "SQC_DCACHE_INFLIGHT_LEVEL.csv",
@@ -155,13 +153,13 @@ ALL_CSVS_MI350 = sorted([

 ROOF_ONLY_FILES = sorted([
    "empirRoof_gpu-0_FP32.pdf",
+    "kernelName_legend.pdf",
    "pmc_perf.csv",
    "pmc_perf_0.csv",
    "pmc_perf_1.csv",
    "pmc_perf_2.csv",
    "roofline.csv",
    "sysinfo.csv",
-    "timestamps.csv",
 ])

 PC_SAMPLING_HOST_TRAP_FILES = sorted([
@@ -364,18 +362,7 @@ def gpu_soc():

 soc = gpu_soc()

-os.environ["ROCPROF"] = "rocprofv3"
-
-
-def using_v3():
-    return "ROCPROF" not in os.environ.keys() or (
-        "ROCPROF" in os.environ.keys()
-        and (
-            os.environ["ROCPROF"].endswith("rocprofv3")
-            or os.environ["ROCPROF"] == "rocprofiler-sdk"
-        )
-    )
-
+os.environ["ROCPROF"] = "rocprofiler-sdk"

 Baseline_dir = str(Path("tests/workloads/vcopy/" + soc).resolve())

@@ -568,19 +555,11 @@ def test_path(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"This test is not supported for {soc}")
        assert 0
@@ -628,15 +607,7 @@ def test_roof_kernel_names(binary_handler_profile_rocprof_compute):
    assert returncode == 0
    file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)

-    if soc == "MI100":
-        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
-    else:
-        expected_files = (
-            [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-            if using_v3()
-            else ROOF_ONLY_FILES
-        ) + ["kernelName_legend.pdf"]
-        assert sorted(list(file_dict.keys())) == sorted(expected_files)
+    assert sorted(list(file_dict.keys())) == ROOF_ONLY_FILES

    validate(
        inspect.stack()[0][3],
@@ -678,12 +649,7 @@ def test_roof_multiple_data_types(binary_handler_profile_rocprof_compute):
                assert os.path.exists(f"{workload_dir}/pmc_perf.csv")

                file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
-                expected_files = (
-                    [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-                    if using_v3()
-                    else ROOF_ONLY_FILES
-                ) + ["kernelName_legend.pdf"]
-                assert sorted(list(file_dict.keys())) == sorted(expected_files)
+                assert sorted(list(file_dict.keys())) == ROOF_ONLY_FILES
            else:
                pass
        finally:
@@ -1200,19 +1166,11 @@ def test_device_filter(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1238,19 +1196,11 @@ def test_kernel(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1274,19 +1224,11 @@ def test_dispatch_0(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1314,19 +1256,11 @@ def test_dispatch_0_1(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1351,19 +1285,11 @@ def test_dispatch_2(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1391,19 +1317,11 @@ def test_join_type_grid(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1428,19 +1346,11 @@ def test_join_type_kernel(binary_handler_profile_rocprof_compute):
    if soc == "MI100":
        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI100
    elif soc == "MI200":
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI200 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI200
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI200
    elif "MI300" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(
-            [f for f in ALL_CSVS_MI300 if f != "timestamps.csv"]
-            if using_v3()
-            else ALL_CSVS_MI300
-        )
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI300
    elif "MI350" in soc:
-        assert sorted(list(file_dict.keys())) == sorted(ALL_CSVS_MI350)
+        assert sorted(list(file_dict.keys())) == ALL_CSVS_MI350
    else:
        print(f"Testing isn't supported yet for {soc}")
        assert 0
@@ -1473,12 +1383,11 @@ def test_roof_sort_dispatches(binary_handler_profile_rocprof_compute):
    assert returncode == 0

    file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)
-    assert (
-        sorted(list(file_dict.keys()))
-        == [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-        if using_v3()
-        else ROOF_ONLY_FILES
-    )
+
+    expected_files = ROOF_ONLY_FILES.copy()
+    expected_files.remove("kernelName_legend.pdf")
+    expected_files = sorted(expected_files)
+    assert sorted(list(file_dict.keys())) == expected_files

    validate(
        inspect.stack()[0][3],
@@ -1508,12 +1417,10 @@ def test_roof_sort_kernels(binary_handler_profile_rocprof_compute):
    assert returncode == 0
    file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)

-    assert (
-        sorted(list(file_dict.keys()))
-        == [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-        if using_v3()
-        else ROOF_ONLY_FILES
-    )
+    expected_files = ROOF_ONLY_FILES.copy()
+    expected_files.remove("kernelName_legend.pdf")
+    expected_files = sorted(expected_files)
+    assert sorted(list(file_dict.keys())) == expected_files

    validate(
        inspect.stack()[0][3],
@@ -1543,12 +1450,10 @@ def test_roof_mem_levels_vL1D(binary_handler_profile_rocprof_compute):
    assert returncode == 0
    file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)

-    assert (
-        sorted(list(file_dict.keys()))
-        == [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-        if using_v3()
-        else ROOF_ONLY_FILES
-    )
+    expected_files = ROOF_ONLY_FILES.copy()
+    expected_files.remove("kernelName_legend.pdf")
+    expected_files = sorted(expected_files)
+    assert sorted(list(file_dict.keys())) == expected_files

    validate(
        inspect.stack()[0][3],
@@ -1578,12 +1483,10 @@ def test_roof_mem_levels_LDS(binary_handler_profile_rocprof_compute):
    assert returncode == 0
    file_dict = test_utils.check_csv_files(workload_dir, 1, num_kernels)

-    assert (
-        sorted(list(file_dict.keys()))
-        == [f for f in ROOF_ONLY_FILES if f != "timestamps.csv"]
-        if using_v3()
-        else ROOF_ONLY_FILES
-    )
+    expected_files = ROOF_ONLY_FILES.copy()
+    expected_files.remove("kernelName_legend.pdf")
+    expected_files = sorted(expected_files)
+    assert sorted(list(file_dict.keys())) == expected_files

    validate(
        inspect.stack()[0][3],
@@ -1873,10 +1776,6 @@ def test_pc_sampling_stochastic(binary_handler_profile_rocprof_compute):

@pytest.mark.live_attach_detach
 def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
-    if not using_v3():
-        assert True
-        return
-
    options = ["--block", "3.1.1", "4.1.1", "5.1.1"]
    workload_dir = test_utils.get_output_dir()
    process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
@@ -1930,10 +1829,6 @@ def test_live_attach_detach_block(binary_handler_profile_rocprof_compute):
 def test_live_attach_detach_singlepath_launch_stats(
    binary_handler_profile_rocprof_compute,
 ):
-    if not using_v3():
-        assert True
-        return
-
    options = ["--set", "launch_stats"]
    workload_dir = test_utils.get_output_dir()
    process_workload = subprocess.Popen(config["app_hip_dynamic_shared"])
@@ -384,7 +384,7 @@ def test_detect_rocprof_env_rocprof_not_found(monkeypatch):
        rocprofiler_sdk_library_path = "/fake/path"

    # Set ROCPROF to 'rocprof'
-    monkeypatch.setenv("ROCPROF", "rocprof")
+    monkeypatch.setenv("ROCPROF", "rocprofv3")
    # shutil.which returns None for 'rocprof'
    monkeypatch.setattr("shutil.which", lambda cmd: None)
    # Track calls to console_warning and console_error
@@ -403,7 +403,6 @@ def test_detect_rocprof_env_rocprof_not_found(monkeypatch):

    with pytest.raises(RuntimeError, match="console_error called"):
        utils_mod.detect_rocprof(DummyArgs())
-    assert any("Unable to resolve path to rocprofv3 binary" in w for w in warnings)
    assert any(
        "Please verify installation or set ROCPROF environment variable" in e
        for e in errors
@@ -452,10 +451,7 @@ def test_detect_rocprof_env_not_set(monkeypatch):
        rocprofiler_sdk_library_path = "/fake/path"

    monkeypatch.delenv("ROCPROF", raising=False)
-    monkeypatch.setattr(
-        "shutil.which", lambda cmd: "/usr/bin/rocprofv3" if cmd == "rocprofv3" else None
-    )
-    monkeypatch.setattr("pathlib.Path.resolve", lambda self: self)
+    monkeypatch.setattr("pathlib.Path.exists", lambda _: True)
    logs = []
    monkeypatch.setattr(
        "utils.utils.console_debug", lambda msg, *a, **k: logs.append(str(msg))
@@ -463,10 +459,10 @@ def test_detect_rocprof_env_not_set(monkeypatch):
    import utils.utils as utils_mod

    result = utils_mod.detect_rocprof(DummyArgs())
-    assert result == "rocprofv3"
+    assert result == "rocprofiler-sdk"
    assert any(
-        "ROC Profiler: /usr/bin/rocprofv3" in log_entry
-        or "rocprof_cmd is rocprofv3" in log_entry
+        "rocprofiler_sdk_path is /fake/path" in log_entry
+        or "rocprof_cmd is rocprofiler-sdk" in log_entry
        for log_entry in logs
    )

@@ -2379,9 +2375,9 @@ def test_parse_text_file_not_found():
 # =============================================================================


-def test_run_prof_success_v2(tmp_path, monkeypatch):
+def test_run_prof_success_v3(tmp_path, monkeypatch):
    """
-    Test run_prof with rocprofv2 successful execution.
+    Test run_prof with rocprofv3 successful execution.

    Args:
        tmp_path (Path): Temporary directory for test files.
@@ -2395,7 +2391,13 @@ def test_run_prof_success_v2(tmp_path, monkeypatch):
    workload_dir = str(tmp_path / "workload")
    os.makedirs(workload_dir + "/out/pmc_1", exist_ok=True)

-    csv_content = "Dispatch_ID,GPU_ID,Kernel_Name\n0,0,test_kernel"
+    csv_content = (
+        "Agent_Type,Node_Id,Wave_Front_Size,Correlation_Id,Dispatch_Id,Agent_Id,Queue_Id,Process_Id,Thread_Id,"
+        "Grid_Size,Kernel_Id,Kernel_Name,Workgroup_Size,LDS_Block_Size,"
+        "Scratch_Size,VGPR_Count,Accum_VGPR_Count,SGPR_Count,Start_Timestamp,"
+        "End_Timestamp,Counter_Name,Counter_Value\n"
+        "GPU,0,0,0,0,0,0,0,0,0,0,test_kernel,0,0,0,0,0,0,0,1,SQ_WAVES,100"
+    )
    with open(workload_dir + "/out/pmc_1/results_0.csv", "w") as f:
        f.write(csv_content)

@@ -2408,12 +2410,10 @@ def test_run_prof_success_v2(tmp_path, monkeypatch):

    mspec = MockSpec()

-    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv2")
+    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv3")
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: False)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)
    monkeypatch.setattr(
@@ -2458,8 +2458,6 @@ def test_run_prof_success_v3_csv(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)
    monkeypatch.setattr(
@@ -2510,7 +2508,6 @@ def test_run_prof_success_rocprofiler_sdk(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
    monkeypatch.setattr("utils.utils.parse_text", lambda f: ["SQ_WAVES"])
    monkeypatch.setattr("utils.utils.process_rocprofv3_output", lambda *a, **k: [])
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
@@ -2554,8 +2551,6 @@ def test_run_prof_with_yaml_config(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("utils.utils.process_rocprofv3_output", lambda *a, **k: [])
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)
@@ -2597,8 +2592,6 @@ def test_run_prof_failure_subprocess(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (False, "error output")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)

@@ -2651,8 +2644,6 @@ def test_run_prof_mi300_environment_setup(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", mock_capture_subprocess_output
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("utils.utils.process_rocprofv3_output", lambda *a, **k: [])
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)
@@ -2692,7 +2683,13 @@ def test_run_prof_timestamps_special_case(tmp_path, monkeypatch):

    mspec = MockSpec()

-    csv_content = "Dispatch_ID,Start_Timestamp,End_Timestamp\n0,100,200"
+    csv_content = (
+        "Agent_Type,Node_Id,Wave_Front_Size,Correlation_Id,Dispatch_Id,Agent_Id,Queue_Id,Process_Id,Thread_Id,"
+        "Grid_Size,Kernel_Id,Kernel_Name,Workgroup_Size,LDS_Block_Size,"
+        "Scratch_Size,VGPR_Count,Accum_VGPR_Count,SGPR_Count,Start_Timestamp,"
+        "End_Timestamp,Counter_Name,Counter_Value\n"
+        "GPU,0,0,0,0,0,0,0,0,0,0,test_kernel,0,0,0,0,0,0,0,1,SQ_WAVES,100"
+    )
    with open(workload_dir + "/kernel_trace.csv", "w") as f:
        f.write(csv_content)

@@ -2702,8 +2699,6 @@ def test_run_prof_timestamps_special_case(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr(
        "utils.utils.process_rocprofv3_output", lambda *a, **k: csv_files
    )
@@ -2752,8 +2747,6 @@ def test_run_prof_no_results_files(tmp_path, monkeypatch):
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: False)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr("glob.glob", lambda pattern: [])  # No files found
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)
@@ -2790,46 +2783,31 @@ def test_run_prof_header_standardization(tmp_path, monkeypatch):
    mspec = MockSpec()

    csv_content = (
-        "KernelName,Index,grd,gpu-id,BeginNs,EndNs\ntest_kernel,0,64,0,100,200"
+        "Agent_Type,Node_Id,Wave_Front_Size,Correlation_Id,Dispatch_Id,Agent_Id,Queue_Id,Process_Id,Thread_Id,"
+        "Grid_Size,Kernel_Id,Kernel_Name,Workgroup_Size,LDS_Block_Size,"
+        "Scratch_Size,VGPR_Count,Accum_VGPR_Count,SGPR_Count,Start_Timestamp,"
+        "End_Timestamp,Counter_Name,Counter_Value\n"
+        "GPU,0,0,0,0,0,0,0,0,0,0,test_kernel,0,0,0,0,0,0,0,1,SQ_WAVES,100"
    )
    with open(workload_dir + "/out/pmc_1/results_test.csv", "w") as f:
        f.write(csv_content)

-    old_headers_df = pd.DataFrame({
-        "KernelName": ["test_kernel"],
-        "Index": [0],
-        "grd": [64],
-        "gpu-id": [0],
-        "BeginNs": [100],
-        "EndNs": [200],
-    })
-
-    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv2")
+    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv3")
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: False)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
    monkeypatch.setattr(
        "glob.glob", lambda pattern: [workload_dir + "/out/pmc_1/results_test.csv"]
    )
    monkeypatch.setattr("utils.utils.console_debug", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_log", lambda *a, **k: None)

-    read_calls = []
-
-    def mock_read_csv(path, **kwargs):
-        read_calls.append(path)
-        return old_headers_df.copy()
-
    write_calls = []

    def mock_to_csv(self, path, **kwargs):
        write_calls.append((path, self.columns.tolist()))

-    monkeypatch.setattr("pandas.read_csv", mock_read_csv)
    monkeypatch.setattr("pandas.DataFrame.to_csv", mock_to_csv)
-    monkeypatch.setattr("pandas.concat", lambda dfs, **k: old_headers_df.copy())

    import utils.utils as utils_mod

@@ -2837,9 +2815,8 @@ def test_run_prof_header_standardization(tmp_path, monkeypatch):

    final_headers = write_calls[-1][1] if write_calls else []
    assert "Kernel_Name" in final_headers
-    assert "Dispatch_ID" in final_headers
+    assert "Dispatch_Id" in final_headers
    assert "Grid_Size" in final_headers
-    assert "GPU_ID" in final_headers
    assert "Start_Timestamp" in final_headers
    assert "End_Timestamp" in final_headers

@@ -2868,28 +2845,12 @@ def test_run_prof_tcc_flattening_mi300(tmp_path, monkeypatch):

    mspec = MockSpec()

-    flatten_called = False
-
-    def mock_flatten_tcc_info_across_xcds(file, xcds, l2_banks):
-        nonlocal flatten_called
-        flatten_called = True
-        return pd.DataFrame({
-            "Dispatch_ID": [0],
-            "TCC_HIT[0]": [100],
-            "TCC_HIT[16]": [200],
-        })
-
    # Mock functions
-    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv2")
+    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofv3")
    monkeypatch.setattr(
        "utils.utils.capture_subprocess_output", lambda *a, **k: (True, "success")
    )
-    monkeypatch.setattr("utils.utils.using_v3", lambda: False)
-    monkeypatch.setattr("utils.utils.using_v1", lambda: False)
-    monkeypatch.setattr(
-        "utils.utils.flatten_tcc_info_across_xcds", mock_flatten_tcc_info_across_xcds
-    )
-    monkeypatch.setattr("utils.utils.mi_gpu_specs.get_num_xcds", lambda *a: 2)
+    monkeypatch.setattr("utils.mi_gpu_spec.mi_gpu_specs.get_num_xcds", lambda *a: 2)
    monkeypatch.setattr(
        "glob.glob", lambda pattern: [workload_dir + "/results_test.csv"]
    )
@@ -2907,8 +2868,6 @@ def test_run_prof_tcc_flattening_mi300(tmp_path, monkeypatch):
    # Execute function
    utils_mod.run_prof(str(fname), ["--arg"], workload_dir, mspec, logging.INFO, "csv")

-    assert flatten_called
-

 import utils.utils as utils_mod  # noqa

@@ -2934,7 +2893,6 @@ def test_run_prof_sdk_creates_new_env_copy(tmp_path, monkeypatch):
    workload_dir_str = str(tmp_path)

    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofiler-sdk")
-    monkeypatch.setattr("utils.utils.using_v3", lambda: False)
    monkeypatch.setattr("utils.utils.process_rocprofv3_output", lambda *a, **k: [])

    capture_subprocess_called_with_env = None
@@ -2957,10 +2915,12 @@ def test_run_prof_sdk_creates_new_env_copy(tmp_path, monkeypatch):
        "utils.utils.parse_text", lambda *a, **k: ["COUNTER1", "COUNTER2"]
    )

-    mock_fname_path_obj = mock.Mock(spec=Path)
+    mock_fname_path_obj = mock.MagicMock(spec=Path)
    mock_fname_path_obj.stem = "counters"
    mock_fname_path_obj.name = "counters.txt"
    mock_fname_path_obj.with_suffix.return_value.exists.return_value = False
+    mock_fname_path_obj.__truediv__.return_value = mock.Mock(spec=Path)
+
    mock_out_path_obj = mock.Mock(spec=Path)
    mock_out_path_obj.exists.return_value = False

@@ -2999,6 +2959,7 @@ def test_run_prof_sdk_creates_new_env_copy(tmp_path, monkeypatch):
    monkeypatch.setattr("shutil.copyfile", lambda *a, **k: None)
    monkeypatch.setattr("shutil.rmtree", lambda *a, **k: None)
    monkeypatch.setattr("utils.utils.console_warning", lambda *a, **k: None)
+    monkeypatch.setattr("builtins.open", lambda *a, **k: io.StringIO(""))

    utils_mod.run_prof(
        fname_str,
@@ -3030,7 +2991,7 @@ def test_run_prof_v3_sdk_and_cli_calls_trace_processing(tmp_path, monkeypatch):
    Line 5 (CLI): elif "--hip-trace" in options:
        process_hip_trace_output(...)
    """
-    fname_str = str(tmp_path / "counters.txt")
+    fname_str = str(tmp_path) + "/counters.txt"
    Path(fname_str).touch()
    fbase_str = "counters"
    workload_dir_str = str(tmp_path)
@@ -3041,7 +3002,7 @@ def test_run_prof_v3_sdk_and_cli_calls_trace_processing(tmp_path, monkeypatch):
    )
    monkeypatch.setattr(
        "utils.utils.process_rocprofv3_output",
-        lambda *a, **k: [str(tmp_path / "results1.csv")],
+        lambda *a, **k: [str(tmp_path) + "/results1.csv"],
    )

    hip_trace_called_with = None
@@ -3096,15 +3057,13 @@ def test_run_prof_v3_sdk_and_cli_calls_trace_processing(tmp_path, monkeypatch):
    monkeypatch.setattr("shutil.copyfile", lambda *a, **k: None)
    monkeypatch.setattr("shutil.rmtree", lambda *a, **k: None)
    monkeypatch.setattr("builtins.open", lambda *a, **k: io.StringIO(""))
-    monkeypatch.setattr("utils.utils.flatten_tcc_info_across_xcds", lambda df, *a: df)
-    monkeypatch.setattr("utils.utils.mi_gpu_specs.get_num_xcds", lambda *a: 1)
+    monkeypatch.setattr("utils.mi_gpu_spec.mi_gpu_specs.get_num_xcds", lambda *a: 1)

    mspec = MockMSpec()
    loglevel = logging.INFO
    format_rocprof_output = True

    monkeypatch.setattr("utils.utils.rocprof_cmd", "rocprofiler-sdk")
-    monkeypatch.setattr("utils.utils.using_v3", lambda: True)

    profiler_options_sdk_hip = {
        "APP_CMD": "my_app",
@@ -5458,306 +5417,6 @@ def test_mibench_console_log_called(tmp_path, monkeypatch):
    assert console_log_calls[0][1] == "No roofline data found. Generating..."


-# =============================================================================
-# TESTS FOR flatten_tcc_info_across_xcds
-# =============================================================================
-"""
-Normal Functionality:
-
-Basic single XCD operation
-Multiple XCD channel renumbering
-Complex channel index patterns
-Multiple dispatch handling
-Edge Cases:
-
-Empty dataframes
-Zero XCDs
-Insufficient data
-Large channel numbers
-Column Handling:
-
-No TCC columns
-TCC-only columns
-Mixed TCC/non-TCC columns
-Irregular TCC naming patterns
-Error Conditions:
-
-File not found errors
-Invalid input validation
-Performance & Data Integrity:
-
-Large dataset handling
-Data preservation validation
-Regex pattern validation
-"""
-
-
-def test_flatten_tcc_info_across_xcds_zero_xcds(tmp_path):
-    """
-    Test edge case with zero XCDs.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts function handles zero XCDs edge case by raising ValueError.
-    """
-    columns = ["Kernel_Name", "TCC_HIT[0]"]
-    data = [["kernel1", 100]]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_zero_xcds.csv"
-    df.to_csv(csv_file, index=False)
-
-    import utils.utils as utils_mod
-
-    with pytest.raises(ValueError, match="range\\(\\) arg 3 must not be zero"):
-        utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=0, tcc_channel_per_xcd=4
-        )
-
-
-def test_flatten_tcc_info_across_xcds_insufficient_data(tmp_path):
-    """
-    Test when there's insufficient data for the specified XCDs.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts function raises ValueError when trying
-        to process insufficient data.
-    """
-    columns = ["Kernel_Name", "TCC_HIT[0]"]
-    data = [["kernel1", 100]]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_insufficient.csv"
-    df.to_csv(csv_file, index=False)
-
-    import utils.utils as utils_mod
-
-    with pytest.raises(ValueError, match="cannot set a row with mismatched columns"):
-        utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=3, tcc_channel_per_xcd=4
-        )
-
-
-def test_flatten_tcc_info_across_xcds_irregular_tcc_column_names(tmp_path):
-    """
-    Test with irregular TCC column naming patterns.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts function handles various TCC column name
-        patterns but may fail with pandas Series ambiguity.
-    """
-    columns = [
-        "Kernel_Name",
-        "TCC_HIT_SPECIAL[0]",
-        "NOT_TCC_BUT_HAS_TCC",
-        "TCC_MISS[0]",
-    ]
-    data = [
-        ["kernel1", 100, 50, 10],
-        ["kernel1", 200, 60, 20],
-    ]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_irregular.csv"
-    df.to_csv(csv_file, index=False)
-
-    import utils.utils as utils_mod
-
-    try:
-        result = utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=2, tcc_channel_per_xcd=4
-        )
-
-        assert len(result) == 1
-        assert "TCC_HIT_SPECIAL[0]" in result.columns
-        assert "TCC_HIT_SPECIAL[4]" in result.columns
-        assert "TCC_MISS[0]" in result.columns
-        assert "TCC_MISS[4]" in result.columns
-        assert result.iloc[0]["NOT_TCC_BUT_HAS_TCC"] == 50
-
-    except ValueError as e:
-        if "The truth value of a Series is ambiguous" in str(e):
-            pytest.skip(
-                "Function has pandas Series ambiguity issue in boolean evaluation"
-            )
-        else:
-            raise
-
-
-def test_flatten_tcc_info_across_xcds_regex_pattern_validation(tmp_path):
-    """
-    Test that regex pattern correctly identifies channel indices.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts regex pattern works for various channel
-        index formats but may fail with pandas Series ambiguity.
-    """
-    columns = ["TCC_HIT[0]", "TCC_MISS[10]", "TCC_REQ[255]", "TCC_INVALID_NO_BRACKET"]
-    data = [
-        [100, 200, 300, 400],  # XCD 0
-        [500, 600, 700, 800],  # XCD 1
-    ]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_regex.csv"
-    df.to_csv(csv_file, index=False)
-
-    import utils.utils as utils_mod
-
-    try:
-        result = utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=2, tcc_channel_per_xcd=128
-        )
-
-        assert len(result) == 1
-        assert "TCC_HIT[0]" in result.columns
-        assert "TCC_HIT[128]" in result.columns  # 0 + 1*128
-        assert "TCC_MISS[10]" in result.columns
-        assert "TCC_MISS[138]" in result.columns  # 10 + 1*128
-        assert "TCC_REQ[255]" in result.columns
-        assert "TCC_REQ[383]" in result.columns  # 255 + 1*128
-
-        assert result.iloc[0]["TCC_INVALID_NO_BRACKET"] == 400
-
-    except ValueError as e:
-        if "The truth value of a Series is ambiguous" in str(e):
-            pytest.skip(
-                "Function has pandas Series ambiguity issue in boolean evaluation"
-            )
-        else:
-            raise
-
-
-def test_flatten_tcc_info_across_xcds_edge_case_validation(tmp_path):
-    """
-    Test edge cases and validation scenarios for
-    flatten_tcc_info_across_xcds.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts function behavior with various edge cases.
-    """
-    import utils.utils as utils_mod
-
-    columns = ["Kernel_Name", "TCC_HIT[0]"]
-    data = [["kernel1", 100]]
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_zero_xcds.csv"
-    df.to_csv(csv_file, index=False)
-
-    with pytest.raises(ValueError):
-        utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=0, tcc_channel_per_xcd=4
-        )
-
-    try:
-        result = utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=-1, tcc_channel_per_xcd=4
-        )
-        assert len(result) == 0
-    except ValueError:
-        pass
-
-    with pytest.raises(FileNotFoundError):
-        utils_mod.flatten_tcc_info_across_xcds(
-            "nonexistent.csv", xcds=2, tcc_channel_per_xcd=4
-        )
-
-
-def test_flatten_tcc_info_across_xcds_pandas_filter_issue(tmp_path):
-    """
-    Test demonstrating the pandas filter regex issue that causes Series ambiguity error.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Documents the pandas boolean evaluation issue in the function.
-    """
-    columns = ["Kernel_Name", "TCC_HIT[0]", "SQ_WAVES"]
-    data = [
-        ["kernel1", 100, 50],
-        ["kernel1", 200, 60],
-    ]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_pandas_issue.csv"
-    df.to_csv(csv_file, index=False)
-
-    import utils.utils as utils_mod
-
-    try:
-        result = utils_mod.flatten_tcc_info_across_xcds(
-            str(csv_file), xcds=2, tcc_channel_per_xcd=4
-        )
-
-        assert len(result) == 1
-        assert "Kernel_Name" in result.columns
-        assert "TCC_HIT[0]" in result.columns
-        assert "TCC_HIT[4]" in result.columns
-        assert "SQ_WAVES" in result.columns
-
-    except ValueError as e:
-        if "The truth value of a Series is ambiguous" in str(e):
-            pytest.skip(
-                "Known issue: pandas .filter() with regex causes "
-                "Series boolean ambiguity"
-            )
-        else:
-            raise
-
-
-def test_flatten_tcc_info_across_xcds_successful_cases_only(tmp_path):
-    """
-    Test only the cases that are expected to work successfully.
-
-    Args:
-        tmp_path (Path): Temporary directory for test files.
-
-    Returns:
-        None: Asserts successful operation for known working scenarios.
-    """
-    import utils.utils as utils_mod
-
-    columns = ["TCC_HIT[0]", "TCC_MISS[0]"]
-    data = [
-        [100, 10],  # XCD 0
-        [200, 20],  # XCD 1
-    ]
-
-    df = pd.DataFrame(data, columns=columns)
-    csv_file = tmp_path / "test_simple_success.csv"
-    df.to_csv(csv_file, index=False)
-
-    result = utils_mod.flatten_tcc_info_across_xcds(
-        str(csv_file), xcds=2, tcc_channel_per_xcd=4
-    )
-
-    assert len(result) == 1
-    assert "TCC_HIT[0]" in result.columns
-    assert "TCC_HIT[4]" in result.columns
-    assert "TCC_MISS[0]" in result.columns
-    assert "TCC_MISS[4]" in result.columns
-    assert result.iloc[0]["TCC_HIT[0]"] == 100
-    assert result.iloc[0]["TCC_HIT[4]"] == 200
-
-
-# =============================================================================
-# TESTS FOR flatten_tcc_info_across_xcds
-# =============================================================================
 """
 Normal Functionality:

@@ -8517,173 +8176,6 @@ def test_add_counter_overwrite_existing():
    updated_properties = ["P_UPDATED", "P_NEW"]  # noqa


-# =================================================================================
-# Test extract counter info extra config input yaml
-# =================================================================================
-
-
-def test_extract_counter_info_returns_none_when_not_found():
-    """
-    Test that extract_counter_info_extra_config_input_yaml returns None
-    when the counter is not found or data structure is incomplete.
-    """
-    data_empty = {}
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(data_empty, "ANY_COUNTER")
-        is None
-    )
-
-    data_no_counters_key = {"rocprofiler-sdk": {}}
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(
-            data_no_counters_key, "ANY_COUNTER"
-        )
-        is None
-    )
-
-    data_empty_counters_list = {"rocprofiler-sdk": {"counters": []}}
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(
-            data_empty_counters_list, "ANY_COUNTER"
-        )
-        is None
-    )
-
-    data_with_other_counters = {
-        "rocprofiler-sdk": {
-            "counters": [
-                {"name": "EXISTING_COUNTER_1", "value": "val1"},
-                {"name": "EXISTING_COUNTER_2", "value": "val2"},
-            ]
-        }
-    }
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(
-            data_with_other_counters, "NON_EXISTENT_COUNTER"
-        )
-        is None
-    )
-
-    data_with_malformed_counter = {
-        "rocprofiler-sdk": {
-            "counters": [
-                {"value": "val1"},  # No 'name' key
-                {"name": "EXISTING_COUNTER_2", "value": "val2"},
-            ]
-        }
-    }
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(
-            data_with_malformed_counter, "EXISTING_COUNTER_1"
-        )
-        is None
-    )
-    assert (
-        utils.extract_counter_info_extra_config_input_yaml(
-            data_with_malformed_counter, "EXISTING_COUNTER_2"
-        )
-        is not None
-    )
-
-
-def test_extract_counter_info_returns_counter_when_found():
-    """
-    Test that extract_counter_info_extra_config_input_yaml returns the correct
-    counter dictionary when the counter is found.
-    """
-    counter1_details = {
-        "name": "MY_COUNTER_1",
-        "description": "Desc 1",
-        "expression": "expr1",
-    }
-    counter2_details = {
-        "name": "MY_COUNTER_2",
-        "description": "Desc 2",
-        "expression": "expr2",
-    }
-    data = {
-        "rocprofiler-sdk": {
-            "counters-schema-version": 1,
-            "counters": [
-                counter1_details,
-                counter2_details,
-            ],
-        }
-    }
-
-    extracted_counter1 = utils.extract_counter_info_extra_config_input_yaml(
-        data, "MY_COUNTER_1"
-    )
-    assert extracted_counter1 is not None
-    assert extracted_counter1 == counter1_details
-
-    extracted_counter2 = utils.extract_counter_info_extra_config_input_yaml(
-        data, "MY_COUNTER_2"
-    )
-    assert extracted_counter2 is not None
-    assert extracted_counter2 == counter2_details
-
-
-# =============================================================================
-# test using_v1 function
-# =============================================================================
-
-
-def test_using_v1_rocprof_set_and_ends_with_rocprof_returns_true():
-    """
-    Covers the case where "ROCPROF" is in os.environ and its value ends with "rocprof".
-    This makes the entire expression True, so the function returns True.
-    """
-    with mock.patch.dict(
-        os.environ, {"ROCPROF": "/opt/rocm/bin/rocprof", "OTHER_VAR": "value"}
-    ):
-        assert utils.using_v1() is True
-
-
-def test_using_v1_rocprof_set_but_not_ends_with_rocprof_returns_false():
-    """
-    Covers the case where "ROCPROF" is in os.environ, but its value does
-    NOT end with "rocprof".
-
-    The second part of the 'and' (os.environ["ROCPROF"].endswith("rocprof")) is False.
-    So the function returns False.
-    """
-    with mock.patch.dict(
-        os.environ, {"ROCPROF": "/opt/rocm/bin/rocprofv2", "OTHER_VAR": "value"}
-    ):
-        assert utils.using_v1() is False
-
-    with mock.patch.dict(
-        os.environ, {"ROCPROF": "some/path/to/rocprof_tool", "OTHER_VAR": "value"}
-    ):
-        assert utils.using_v1() is False
-
-
-def test_using_v1_rocprof_not_in_environ_returns_false():
-    """
-    Covers the case where "ROCPROF" is NOT in os.environ.
-    The first part of the 'and' ("ROCPROF" in os.environ.keys()) is False.
-    Due to short-circuiting, the second part is not evaluated.
-    So the function returns False.
-    """
-    current_env = os.environ.copy()
-    if "ROCPROF" in current_env:
-        del current_env["ROCPROF"]
-
-    with mock.patch.dict(os.environ, current_env, clear=True):
-        assert utils.using_v1() is False
-
-
-def test_using_v1_rocprof_is_empty_string_returns_false():
-    """
-    Covers the case where "ROCPROF" is in os.environ but is an empty string.
-    The second part (os.environ["ROCPROF"].endswith("rocprof")) will be False.
-    So the function returns False.
-    """
-    with mock.patch.dict(os.environ, {"ROCPROF": "", "OTHER_VAR": "value"}):
-        assert utils.using_v1() is False
-
-
 # =============================================================================
 # additional test detect_rocprof console error
 # =============================================================================
@@ -8732,27 +8224,6 @@ class MockArgs:  # noqa
        return self.__dict__ == other.__dict__


-def test_store_app_cmd_sets_global_rocprof_args():
-    """
-    Tests that store_app_cmd correctly assigns the passed 'args'
-    object to the global 'rocprof_args'.
-    """
-    sample_args_object = MockArgs(
-        rocprofiler_sdk_library_path="/path/to/sdk",
-        input_file="input.txt",
-        some_other_option=True,
-    )
-
-    if hasattr(utils, "rocprof_args"):
-        utils.rocprof_args = None
-    else:
-        pass
-    utils.store_app_cmd(sample_args_object)
-    assert utils.rocprof_args is sample_args_object, (
-        "Global rocprof_args should be the same object as the passed args"
-    )
-
-
 # =============================================================================
 # additional tests for v3_counter_csv_to_v2_csv function
 # =============================================================================
@@ -9112,155 +8583,6 @@ def test_pc_sampling_prof_empty_appcmd(
        mock_console_error.assert_not_called()


-# =============================================================================
-# test replace_timestamps function
-# =============================================================================
-
-
-def create_dummy_csv(filepath, data_dict):
-    df = pd.DataFrame(data_dict)
-    df.to_csv(filepath, index=False)
-
-
-@mock.patch("utils.utils.console_warning")
-def test_replace_timestamps_no_timestamps_csv_returns_early(
-    mock_console_warning, tmp_path
-):
-    """
-    Edge Case: timestamps.csv does not exist in workload_dir.
-    The function should return early.
-    Covers: if not path(workload_dir, "timestamps.csv").is_file(): return
-    """
-    workload_dir = str(tmp_path)
-
-    utils.replace_timestamps(workload_dir)
-
-    # Since there's no timestamps.csv, function should return early
-    # and console_warning should not be called
-    mock_console_warning.assert_not_called()
-
-
-@mock.patch("utils.utils.console_warning")
-@mock.patch("glob.glob")
-def test_replace_timestamps_timestamps_csv_missing_columns_warns(
-    mock_glob, mock_console_warning, tmp_path
-):
-    """
-    Edge Case: timestamps.csv exists but is missing
-    'Start_Timestamp' or 'End_Timestamp'.
-    The function should call console_warning.
-    Covers: else: console_warning(...)
-    """
-    workload_dir = str(tmp_path)
-    timestamps_csv_path_str = os.path.join(workload_dir, "timestamps.csv")
-
-    # Create the actual CSV file with missing columns
-    create_dummy_csv(timestamps_csv_path_str, {"Some_Other_Column": [123]})
-
-    utils.replace_timestamps(workload_dir)
-
-    # Verify console_warning was called
-    mock_console_warning.assert_called_once_with(
-        "Incomplete profiling data detected. Unable to update timestamps.\n"
-    )
-    # Verify glob wasn't called (since we return early due to missing columns)
-    mock_glob.assert_not_called()
-
-
-@mock.patch("utils.utils.console_warning")
-@mock.patch("glob.glob")
-def test_replace_timestamps_updates_other_csvs_skips_sysinfo(
-    mock_glob, mock_console_warning, tmp_path
-):
-    """
-    Edge Case: timestamps.csv is valid. Other CSVs exist, including sysinfo.csv.
-    Only non-sysinfo.csv files should be updated.
-    Covers: for fname in glob.glob(...): if path(fname).name != "sysinfo.csv": ...
-    """
-    workload_dir = str(tmp_path)
-    timestamps_csv_path_str = os.path.join(workload_dir, "timestamps.csv")
-    data_csv_path_str = os.path.join(workload_dir, "data.csv")
-    sysinfo_csv_path_str = os.path.join(workload_dir, "sysinfo.csv")
-
-    new_start_ts = [1000, 2000]
-    new_end_ts = [1500, 2500]
-    create_dummy_csv(
-        timestamps_csv_path_str,
-        {"Start_Timestamp": new_start_ts, "End_Timestamp": new_end_ts},
-    )
-
-    create_dummy_csv(
-        data_csv_path_str,
-        {"Kernel_Name": ["A", "B"], "Start_Timestamp": [1, 2], "End_Timestamp": [3, 4]},
-    )
-    create_dummy_csv(
-        sysinfo_csv_path_str,
-        {"Info": ["CPU", "MEM"], "Start_Timestamp": [5, 6], "End_Timestamp": [7, 8]},
-    )
-
-    # Mock glob to return the CSV files we created
-    mock_glob.return_value = [
-        data_csv_path_str,
-        sysinfo_csv_path_str,
-        timestamps_csv_path_str,
-    ]
-
-    utils.replace_timestamps(workload_dir)
-
-    mock_console_warning.assert_not_called()
-
-    # Verify data.csv was updated with new timestamps
-    df_data_updated = pd.read_csv(data_csv_path_str)
-    pd.testing.assert_series_equal(
-        df_data_updated["Start_Timestamp"],
-        pd.Series(new_start_ts, name="Start_Timestamp"),
-    )
-    pd.testing.assert_series_equal(
-        df_data_updated["End_Timestamp"], pd.Series(new_end_ts, name="End_Timestamp")
-    )
-
-    # Verify sysinfo.csv was NOT updated (timestamps should remain original)
-    df_sysinfo_original = pd.read_csv(sysinfo_csv_path_str)
-    assert list(df_sysinfo_original["Start_Timestamp"]) == [5, 6]
-    assert list(df_sysinfo_original["End_Timestamp"]) == [7, 8]
-
-
-@mock.patch("utils.utils.console_warning")
-@mock.patch("glob.glob")
-def test_replace_timestamps_no_other_csvs_to_update(
-    mock_glob, mock_console_warning, tmp_path
-):
-    """
-    Edge Case: timestamps.csv is valid, but no other *.csv files
-    (or only sysinfo.csv) exist.
-    The loop for updating files should not do anything or not run.
-    Covers: The for loop not iterating if glob returns empty or only sysinfo.
-    """
-    workload_dir = str(tmp_path)
-    timestamps_csv_path_str = os.path.join(workload_dir, "timestamps.csv")
-    sysinfo_csv_path_str = os.path.join(workload_dir, "sysinfo.csv")
-
-    create_dummy_csv(
-        timestamps_csv_path_str, {"Start_Timestamp": [100], "End_Timestamp": [200]}
-    )
-    create_dummy_csv(
-        sysinfo_csv_path_str,
-        {"Info": ["CPU"], "Start_Timestamp": [5], "End_Timestamp": [7]},
-    )
-
-    # Mock glob to return only timestamps.csv and sysinfo.csv
-    mock_glob.return_value = [timestamps_csv_path_str, sysinfo_csv_path_str]
-
-    utils.replace_timestamps(workload_dir)
-
-    mock_console_warning.assert_not_called()
-
-    # Verify sysinfo.csv was NOT updated (timestamps should remain original)
-    df_sysinfo_original = pd.read_csv(sysinfo_csv_path_str)
-    assert list(df_sysinfo_original["Start_Timestamp"]) == [5]
-    assert list(df_sysinfo_original["End_Timestamp"]) == [7]
-
-
 def test_set_parser():
    from utils.utils import parse_sets_yaml