[rocprofiler-compute] Counter accuracy tests and improvements for iteration multiplexing (#2011)

* Added laplace solver in samples * Add laplace eqn in CMake * Added counter accuracy test * Add iteration CLI arg for laplace eq * Unnest profile method * Missing counter warning * Updated insufficient kernel warning * Added reference for laplace equation * variable name change * Added comments for data comparison * Included scipy as test requirement * Added line number for ref * split stochastic and deterministic tests * Added order cli option for laplace_eqn * Install laplace eqn * Missing counter warning * Warn about missing kernels during analysis * Update tests * Split iteration multiplexing ctests * Updated warning * Incorporated copilot's suggestions
2025-12-17 18:26:39 +05:30
@@ -361,10 +361,18 @@ add_test(
 )

 add_test(
-    NAME test_profile_iteration_multiplexing
+    NAME test_profile_iteration_multiplexing_1
    COMMAND
-        ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing
-        --junitxml=tests/test_profile_iteration_multiplexing.xml ${COV_OPTION}
+        ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_1
+        --junitxml=tests/test_profile_iteration_multiplexing_1.xml ${COV_OPTION}
+        tests/test_profile_general.py ${WORKING_DIR_OPTION}
+)
+
+add_test(
+    NAME test_profile_iteration_multiplexing_2
+    COMMAND
+        ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_2
+        --junitxml=tests/test_profile_iteration_multiplexing_2.xml ${COV_OPTION}
        tests/test_profile_general.py ${WORKING_DIR_OPTION}
 )

@@ -382,7 +390,8 @@ set_tests_properties(
    test_profile_pc_sampling
    test_profile_sets_func
    test_profile_live_attach_detach
-    test_profile_iteration_multiplexing
+    test_profile_iteration_multiplexing_1
+    test_profile_iteration_multiplexing_2
    PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800
 )

@@ -723,6 +732,7 @@ if(INSTALL_TESTS)
            tests/vsequential_access
            tests/occupancy
            tests/hip_dynamic_shared
+            tests/laplace_eqn
            tests/mat_mul_max
        DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
        COMPONENT tests
@@ -107,5 +107,7 @@ markers = [
    "roofline_2",
    "path",
    "sci_notion",
-    "iteration_multiplexing",
+    "iteration_multiplexing_1",
+    "iteration_multiplexing_2",
+    "iteration_multiplexing_stochastic",
 ]
@@ -2,3 +2,4 @@ mock
 pytest
 pytest-cov
 pytest-xdist
+scipy
@@ -0,0 +1,217 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+// Reference:  Finite Difference Methods for Ordinary and Partial Differential
+// Equations by Randall J. LeVeque, Page 69-71
+
+#include <algorithm>
+#include <hip/hip_runtime.h>
+#include <iostream>
+#include <vector>
+
+// Helper macro for HIP error checking
+#define HIP_CHECK(call)                                                        \
+  do {                                                                         \
+    hipError_t err = call;                                                     \
+    if (err != hipSuccess) {                                                   \
+      std::cerr << "HIP error: " << hipGetErrorString(err) << " at "           \
+                << __FILE__ << ":" << __LINE__ << std::endl;                   \
+      std::exit(EXIT_FAILURE);                                                 \
+    }                                                                          \
+  } while (0)
+
+void initialize(float *U, int N) {
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      float x = 2.0f * (float)i / (float)N - 1.0f;
+      float y = 2.0f * (float)j / (float)N - 1.0f;
+      U[i * N + j] = 1.0f / (1.0f + x * x + y * y); // Initial guess
+    }
+  }
+}
+
+__global__ void jacobi_iteration(float *__restrict__ U_new,
+                                 const float *__restrict__ U_old, int N) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int j = blockIdx.y * blockDim.y + threadIdx.y;
+
+  int idx = j + i * N; // Flattened index for 2D grid
+
+  extern __shared__ float U[];
+  int s_i = threadIdx.x + 1;
+  int s_j = threadIdx.y + 1;
+
+  int tile_height = blockDim.y + 2;
+  int s_idx = s_j + s_i * tile_height;
+
+  // Load data into shared memory with halo regions
+  if (i < N && j < N) {
+    U[s_idx] = U_old[idx];
+
+    // Load halo regions
+    if (threadIdx.x == 0) {
+      U[s_j] = (i > 0) ? U_old[(i - 1) * N + j] : U_old[(N - 1) * N + j];
+    }
+
+    if (threadIdx.x == blockDim.x - 1) {
+      U[s_j + (blockDim.x + 1) * tile_height] =
+          (i < N - 1) ? U_old[(i + 1) * N + j] : U_old[j];
+    }
+
+    if (threadIdx.y == 0) {
+      U[s_i * tile_height] =
+          (j > 0) ? U_old[i * N + (j - 1)] : U_old[i * N + (N - 1)];
+    }
+
+    if (threadIdx.y == blockDim.y - 1) {
+      U[s_i * tile_height + blockDim.y + 1] =
+          (j < N - 1) ? U_old[i * N + (j + 1)] : U_old[i * N];
+    }
+  }
+
+  __syncthreads();
+
+  if (i < N && j < N) {
+    int east = s_idx - tile_height;
+    int west = s_idx + tile_height;
+    int north = s_idx + 1;
+    int south = s_idx - 1;
+    U_new[idx] = 0.25f * (U[east] + U[west] + U[north] + U[south]);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  int iter = 1000;
+  std::vector<int> launch_order;
+
+  for (int i = 0; i < argc; i++) {
+    if (std::string(argv[i]) == "-i" && i + 1 < argc) {
+      iter = std::atoi(argv[i + 1]);
+    } else if (std::string(argv[i]) == "-o" && i + 1 < argc) {
+      int n_order = std::atoi(argv[i + 1]);
+      if ((i + n_order + 1) >= argc) {
+        std::cerr << "Insufficient arguments for -o option" << std::endl;
+        return EXIT_FAILURE;
+      } else {
+        for (int j = 0; j < n_order; j++) {
+          int kernel_id = std::atoi(argv[i + 2 + j]);
+          if (kernel_id < 0 || kernel_id > 2) {
+            std::cerr << "Invalid kernel ID: " << kernel_id << std::endl;
+            return EXIT_FAILURE;
+          }
+          launch_order.push_back(kernel_id);
+        }
+        i += n_order;
+      }
+    }
+  }
+
+  if (launch_order.empty()) {
+    launch_order = {0, 1, 2};
+  }
+
+  // Problem size
+  const int n_small = 1024;
+  const int n_mid = 2048;
+  const int n_large = 4096;
+
+  // Define block and grid sizes
+  dim3 blockSize_small(16, 16);
+  dim3 gridSize_small((n_small + blockSize_small.x - 1) / blockSize_small.x,
+                      (n_small + blockSize_small.y - 1) / blockSize_small.y);
+  dim3 blockSize_mid(24, 24);
+  dim3 gridSize_mid((n_mid + blockSize_mid.x - 1) / blockSize_mid.x,
+                    (n_mid + blockSize_mid.y - 1) / blockSize_mid.y);
+  dim3 blockSize_large(32, 32);
+  dim3 gridSize_large((n_large + blockSize_large.x - 1) / blockSize_large.x,
+                      (n_large + blockSize_large.y - 1) / blockSize_large.y);
+
+  // Host memory pointers
+  float *h_U_small = new float[n_small * n_small];
+  float *h_U_mid = new float[n_mid * n_mid];
+  float *h_U_large = new float[n_large * n_large];
+
+  // Initialize host arrays
+  initialize(h_U_small, n_small);
+  initialize(h_U_mid, n_mid);
+  initialize(h_U_large, n_large);
+
+  // Device memory pointers
+  float *d_U_small_new, *d_U_small_old;
+  float *d_U_mid_new, *d_U_mid_old;
+  float *d_U_large_new, *d_U_large_old;
+
+  // Allocate device memory
+  HIP_CHECK(hipMalloc(&d_U_small_new, n_small * n_small * sizeof(float)));
+  HIP_CHECK(hipMalloc(&d_U_small_old, n_small * n_small * sizeof(float)));
+
+  HIP_CHECK(hipMalloc(&d_U_mid_new, n_mid * n_mid * sizeof(float)));
+  HIP_CHECK(hipMalloc(&d_U_mid_old, n_mid * n_mid * sizeof(float)));
+
+  HIP_CHECK(hipMalloc(&d_U_large_new, n_large * n_large * sizeof(float)));
+  HIP_CHECK(hipMalloc(&d_U_large_old, n_large * n_large * sizeof(float)));
+
+  // Copy host arrays to device
+  HIP_CHECK(hipMemcpy(d_U_small_old, h_U_small,
+                      n_small * n_small * sizeof(float),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(d_U_mid_old, h_U_mid, n_mid * n_mid * sizeof(float),
+                      hipMemcpyHostToDevice));
+  HIP_CHECK(hipMemcpy(d_U_large_old, h_U_large,
+                      n_large * n_large * sizeof(float),
+                      hipMemcpyHostToDevice));
+
+  std::vector<dim3> grid_sizes = {gridSize_small, gridSize_mid, gridSize_large};
+  std::vector<dim3> block_sizes = {blockSize_small, blockSize_mid,
+                                   blockSize_large};
+  std::vector<float *> d_U_news = {d_U_small_new, d_U_mid_new, d_U_large_new};
+  std::vector<float *> d_U_olds = {d_U_small_old, d_U_mid_old, d_U_large_old};
+  std::vector<int> n_sizes = {n_small, n_mid, n_large};
+
+  // Perform Jacobi iterations
+  for (int it = 0; it < iter; ++it) {
+    int i = launch_order[it % launch_order.size()];
+    jacobi_iteration<<<grid_sizes[i], block_sizes[i],
+                       (block_sizes[i].x + 2) * (block_sizes[i].y + 2) *
+                           sizeof(float)>>>(d_U_news[i], d_U_olds[i],
+                                            n_sizes[i]);
+
+    HIP_CHECK(hipDeviceSynchronize());
+
+    // Swap pointers
+    std::swap(d_U_news[i], d_U_olds[i]);
+  }
+
+  // Free device memory
+  HIP_CHECK(hipFree(d_U_small_new));
+  HIP_CHECK(hipFree(d_U_small_old));
+  HIP_CHECK(hipFree(d_U_mid_new));
+  HIP_CHECK(hipFree(d_U_mid_old));
+  HIP_CHECK(hipFree(d_U_large_new));
+  HIP_CHECK(hipFree(d_U_large_old));
+
+  delete[] h_U_small;
+  delete[] h_U_mid;
+  delete[] h_U_large;
+
+  return 0;
+}
@@ -430,6 +430,23 @@ class OmniAnalyze_Base:
                "with profiling data collected with iteration multiplexing.",
            )

+        # Check if any kernel's counters are missing due to iteration multiplexing
+        if (
+            self._profiling_config.get("iteration_multiplexing") is not None
+            and self._profiling_config.get("kernels_with_missing_counters") is not None
+        ):
+            missing_kernels = self._profiling_config.get(
+                "kernels_with_missing_counters"
+            )
+            console_warning(
+                "analysis",
+                (
+                    "The following kernels have missing counter data "
+                    "due to iteration multiplexing and should be filtered out: "
+                    f"{', '.join(missing_kernels)}"
+                ),
+            )
+
        # initalize runs
        self._runs = self.initalize_runs()

@@ -122,6 +122,37 @@ class RocProfCompute_Base:
                "./vcopy -n 1048576 -b 256"
            )

+    def detect_missing_counters(self, df: pd.DataFrame) -> None:
+        """Detect missing counter values in joined dataframe"""
+        args = self.get_args()
+        group_labels = ["Kernel_Name"]
+        if args.join_type == "grid":
+            group_labels.append("Grid_Size")
+
+        num_files = len(list(Path(args.path).glob("perfmon/*.txt")))
+        kernels_with_missing_counters = []
+        for _, groups in df.groupby(group_labels):
+            if groups["Dispatch_ID"].nunique() < num_files:
+                kernel_name = groups.iloc[0]["Kernel_Name"]
+                kernels_with_missing_counters.append(kernel_name)
+
+        if kernels_with_missing_counters:
+            kernels_with_missing_counters = list(set(kernels_with_missing_counters))
+            console_warning(
+                "join_prof",
+                (
+                    f"Insufficient number of kernel calls for kernels: "
+                    f"{', '.join(kernels_with_missing_counters)} "
+                    f"to collect all counters using iteration multiplexing. "
+                    f"Please use kernel filtering and exclude the above kernels "
+                    f"or turn off iteration multiplexing."
+                ),
+            )
+            with open(f"{args.path}/profiling_config.yaml", "a") as f:
+                yaml.dump(
+                    {"kernels_with_missing_counters": kernels_with_missing_counters}, f
+                )
+
    @demarcate
    def join_prof(self, out: Optional[str] = None) -> Optional[pd.DataFrame]:
        """Manually join separated rocprof runs"""
@@ -148,6 +179,10 @@ class RocProfCompute_Base:

            console_debug(f"Created file: {output_file}")

+            if args.iteration_multiplexing is not None:
+                df = pd.read_csv(output_file)
+                self.detect_missing_counters(df)
+
            # Delete results_*.csv files
            for file in result_files:
                Path(file).unlink()
@@ -340,6 +375,11 @@ class RocProfCompute_Base:
        if "key" in df.columns:
            df = df.drop(columns=["key"])

+        console_debug("join_prof", "Checking for missing counter values...")
+
+        if args.iteration_multiplexing is not None:
+            self.detect_missing_counters(df)
+
        # save to file and delete old file(s)
        # skip if we're being called outside of rocprof-compute
        if isinstance(args.path, str):
@@ -391,6 +431,62 @@ class RocProfCompute_Base:
            soc=self._soc,
        )

+    def profile(
+        self,
+        fnames: Union[list[Path], Path],
+        options: Union[list[str], dict[str, Any]],
+        total_runs: int = 1,
+    ) -> float:
+        args = self.get_args()
+
+        if isinstance(fnames, list):
+            console_log(
+                "profiling", f"Current input files: {', '.join(map(str, fnames))}"
+            )
+            str_fnames = [str(fname) for fname in fnames]
+        else:
+            console_log("profiling", f"Current input file: {fnames}")
+            str_fnames = str(fnames)
+
+        start_time = time.time()
+
+        if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
+            # Only 1-run case is permitted for attach/detach
+            if (isinstance(options, list) and "--pid" in options) or (
+                isinstance(options, dict)
+                and (options.get("ROCPROF_ATTACH_PID") is not None)
+            ):
+                if total_runs > 1:
+                    console_error(
+                        f"Cannot attach process for profiling as the requested "
+                        f"performance counters exceed the collection capacity of "
+                        f"single pass counter collection. The current setup of "
+                        f"requested counter blocks needs {total_runs} number of "
+                        f'passes. Please use "--block" or "--set" '
+                        f"to adjust or reduce the requested performance metrics!"
+                    )
+            run_prof(
+                fnames=str_fnames,
+                profiler_options=options,
+                workload_dir=args.path,
+                mspec=self._soc._mspec,
+                loglevel=args.loglevel,
+                format_rocprof_output=args.format_rocprof_output,
+                retain_rocpd_output=args.retain_rocpd_output,
+            )
+
+            end_time = time.time()
+            duration = end_time - start_time
+
+            console_debug(
+                f"The time of run_prof of {str_fnames} is {int(duration / 60)} min"
+                f" {duration % 60} sec"
+            )
+            return duration
+        else:
+            console_error("Profiler not supported")
+            return 0.0
+
    @abstractmethod
    def run_profiling(self, version: str, prog: str) -> None:
        """Run profiling."""
@@ -516,57 +612,6 @@ class RocProfCompute_Base:
                else:
                    console_debug(output)

-        def profile(
-            fnames: Union[list[Path], Path], options: Union[list[str], dict[str, Any]]
-        ) -> float:
-            if isinstance(fnames, list):
-                console_log(
-                    "profiling", f"Current input files: {', '.join(map(str, fnames))}"
-                )
-                str_fnames = [str(fname) for fname in fnames]
-            else:
-                console_log("profiling", f"Current input file: {fnames}")
-                str_fnames = str(fnames)
-
-            start_time = time.time()
-
-            if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
-                # Only 1-run case is permitted for attach/detach
-                if (isinstance(options, list) and "--pid" in options) or (
-                    isinstance(options, dict)
-                    and (options.get("ROCPROF_ATTACH_PID") is not None)
-                ):
-                    if total_runs > 1:
-                        console_error(
-                            f"Cannot attach process for profiling as the requested "
-                            f"performance counters exceed the collection capacity of "
-                            f"single pass counter collection. The current setup of "
-                            f"requested counter blocks needs {total_runs} number of "
-                            f'passes. Please use "--block" or "--set" '
-                            f"to adjust or reduce the requested performance metrics!"
-                        )
-                run_prof(
-                    fnames=str_fnames,
-                    profiler_options=options,
-                    workload_dir=args.path,
-                    mspec=self._soc._mspec,
-                    loglevel=args.loglevel,
-                    format_rocprof_output=args.format_rocprof_output,
-                    retain_rocpd_output=args.retain_rocpd_output,
-                )
-
-                end_time = time.time()
-                duration = end_time - start_time
-
-                console_debug(
-                    f"The time of run_prof of {fname} is {int(duration / 60)} min"
-                    f" {duration % 60} sec"
-                )
-                return duration
-            else:
-                console_error("Profiler not supported")
-                return 0.0
-
        if args.iteration_multiplexing is not None:
            console_log(
                "profiling", f"Iteration multiplexing: {args.iteration_multiplexing}"
@@ -589,7 +634,7 @@ class RocProfCompute_Base:
                    ),
                )

-            profile(input_files, options)
+            self.profile(input_files, options)
        else:
            console_log("profiling", "Iteration multiplexing: Disabled")

@@ -615,7 +660,7 @@ class RocProfCompute_Base:
                        "pending first measurement...]"
                    )

-                duration = profile(fname, options)
+                duration = self.profile(fname, options, total_runs)
                total_profiling_time += duration

        # Delete temporary native tool if created
@@ -59,3 +59,11 @@ set_target_properties(
    hip_dynamic_shared
    PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
 )
+
+set(LAPLACE_EQN_SOURCES ../sample/laplace_eqn.hip)
+set_source_files_properties(${LAPLACE_EQN_SOURCES} PROPERTIES LANGUAGE HIP)
+add_executable(laplace_eqn ${LAPLACE_EQN_SOURCES})
+set_target_properties(
+    laplace_eqn
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
+)
@@ -31,9 +31,11 @@ import subprocess
 import sys
 from pathlib import Path

+import numpy as np
 import pandas as pd
 import pytest
 import test_utils
+from scipy.stats import zscore

 # Globals

@@ -64,6 +66,8 @@ config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"]
 config["app_occupancy"] = ["./tests/occupancy"]
 config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
 config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
+config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
+config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
 config["cleanup"] = True
 config["COUNTER_LOGGING"] = False
 config["METRIC_COMPARE"] = False
@@ -460,6 +464,150 @@ def validate(test_name, workload_dir, file_dict, args=[]):
        baseline_compare_metric(test_name, workload_dir, args)


+def are_stochastic_counters_similar(test_dfs, baseline_df):
+    """
+    Compares multiple test dataframes against a baseline dataframe to check
+    if the stochastic counter values are similar. Returns True if all test dataframes
+    have similar counter values to the baseline, otherwise returns False.
+    """
+    group_labels = [
+        "Kernel_Name",
+        "Grid_Size",
+        "Workgroup_Size",
+        "LDS_Per_Workgroup",
+        "Counter_Name",
+    ]
+
+    baseline_grouped = baseline_df.groupby(group_labels)
+    tests_grouped = [df.groupby(group_labels) for df in test_dfs]
+
+    baseline_group_keys = set(baseline_grouped.groups.keys())
+    tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
+
+    # Check if all test dataframes have the same group keys as the baseline
+    if not all(baseline_group_keys == keys for keys in tests_group_keys):
+        return False
+
+    stochastic_counter_patterns = list(
+        map(
+            re.compile,
+            [
+                ".*REQ_sum$",
+                ".*REQ_.*_sum$",
+                ".*READ_sum$",
+                ".*WRITE_sum$",
+            ],
+        )
+    )
+
+    for group_key, baseline_group in baseline_grouped:
+        test_groups = [
+            test_grouped.get_group(group_key) for test_grouped in tests_grouped
+        ]
+
+        baseline_counters = baseline_group["Counter_Value"]
+        test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
+
+        counter_name = group_key[4]
+
+        # Warmup values aren't ignored as they do not significantly impact
+        # the analysis for stochastic counters and leaves too few data points
+        # for baseline.
+        if any(
+            re.match(pattern, counter_name) for pattern in stochastic_counter_patterns
+        ):
+            # Remove outliers using Z-score method
+            z_score_threshold = 2.0
+
+            test_z_scores_list = [
+                np.abs(zscore(test_counters)) for test_counters in test_counters_list
+            ]
+            test_counters_list_trimmed = [
+                test_counters[test_z_scores < z_score_threshold]
+                for test_counters, test_z_scores in zip(
+                    test_counters_list, test_z_scores_list
+                )
+            ]
+
+            baseline_mean = baseline_counters.mean()
+            baseline_std = baseline_counters.std()
+            upper_bound = baseline_mean + 3 * baseline_std
+            lower_bound = baseline_mean - 3 * baseline_std
+
+            for test_counters in test_counters_list_trimmed:
+                if test_counters.between(lower_bound, upper_bound).all() is False:
+                    return False
+
+    return True
+
+
+def are_deterministic_counters_equal(test_dfs, baseline_df):
+    """
+    Compares multiple test dataframes against a baseline dataframe to check
+    if the deterministic counter values are equal. Returns True if all test dataframes
+    have equal counter values to the baseline, otherwise returns False.
+    """
+    group_labels = [
+        "Kernel_Name",
+        "Grid_Size",
+        "Workgroup_Size",
+        "LDS_Per_Workgroup",
+        "Counter_Name",
+    ]
+
+    baseline_grouped = baseline_df.groupby(group_labels)
+    tests_grouped = [df.groupby(group_labels) for df in test_dfs]
+
+    baseline_group_keys = set(baseline_grouped.groups.keys())
+    tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
+
+    # Check if all test dataframes have the same group keys as the baseline
+    if not all(baseline_group_keys == keys for keys in tests_group_keys):
+        return False
+
+    deterministic_counter_patterns = list(
+        map(
+            re.compile,
+            [
+                "SQ_INSTS_.*",
+                "SPI_CS\\d_NUM_THREADGROUPS",
+                "SPI_CS\\d_WAVE",
+                "SQ_WAVES",
+            ],
+        )
+    )
+
+    for group_key, baseline_group in baseline_grouped:
+        test_groups = [
+            test_grouped.get_group(group_key) for test_grouped in tests_grouped
+        ]
+
+        baseline_counters = baseline_group["Counter_Value"]
+        test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
+
+        counter_name = group_key[4]
+        if any(
+            re.match(pattern, counter_name)
+            for pattern in deterministic_counter_patterns
+        ):
+            if (
+                all([
+                    test_counters.unique().size == 1
+                    for test_counters in test_counters_list
+                ])
+                and baseline_counters.unique().size == 1
+                and all([
+                    test_counters.values[0] == baseline_counters.values[0]
+                    for test_counters in test_counters_list
+                ])
+            ):
+                continue
+
+            return False
+
+    return True
+
+
 # --
 # Start of profiling tests
 # --
@@ -2367,7 +2515,7 @@ class TestSetsIntegration:
        test_utils.clean_output_dir(config["cleanup"], workload_dir)


-@pytest.mark.iteration_multiplexing
+@pytest.mark.iteration_multiplexing_1
 def test_profiler_options(binary_handler_profile_rocprof_compute):
    options = ["--no-native-tool", "--iteration-multiplexing"]
    workload_dir = test_utils.get_output_dir()
@@ -2377,7 +2525,7 @@ def test_profiler_options(binary_handler_profile_rocprof_compute):
    assert code == 1


-@pytest.mark.iteration_multiplexing
+@pytest.mark.iteration_multiplexing_1
 def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
    options = ["--iteration-multiplexing"]
    workload_dir = test_utils.get_output_dir()
@@ -2407,7 +2555,7 @@ def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
    test_utils.clean_output_dir(config["cleanup"], workload_dir)


-@pytest.mark.iteration_multiplexing
+@pytest.mark.iteration_multiplexing_1
 def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
    options = ["--iteration-multiplexing", "kernel"]
    workload_dir = test_utils.get_output_dir()
@@ -2437,7 +2585,7 @@ def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
    test_utils.clean_output_dir(config["cleanup"], workload_dir)


-@pytest.mark.iteration_multiplexing
+@pytest.mark.iteration_multiplexing_1
 def test_iteration_multiplexing_kernel_launch_params(
    binary_handler_profile_rocprof_compute,
 ):
@@ -2467,3 +2615,97 @@ def test_iteration_multiplexing_kernel_launch_params(
    )

    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
+@pytest.mark.iteration_multiplexing_2
+def test_iteration_multiplexing_deterministic_counter_accuracy(
+    binary_handler_profile_rocprof_compute,
+):
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
+    )
+    counters_no_multiplexing = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    options = ["--iteration-multiplexing", "kernel"]
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config,
+        workload_dir,
+        options,
+        check_success=True,
+        roof=False,
+        app_name="app_laplace_eqn_iter",
+    )
+    counters_kernel = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    options = ["--iteration-multiplexing", "kernel_launch_params"]
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config,
+        workload_dir,
+        options,
+        check_success=True,
+        roof=False,
+        app_name="app_laplace_eqn_iter",
+    )
+    counters_kernel_launch_params = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    assert are_deterministic_counters_equal(
+        [counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
+    )
+
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)
+
+
+@pytest.mark.iteration_multiplexing_stochastic
+def test_iteration_multiplexing_stochastic_counter_accuracy(
+    binary_handler_profile_rocprof_compute,
+):
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
+    )
+    counters_no_multiplexing = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    options = ["--iteration-multiplexing", "kernel"]
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config,
+        workload_dir,
+        options,
+        check_success=True,
+        roof=False,
+        app_name="app_laplace_eqn_iter",
+    )
+    counters_kernel = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    options = ["--iteration-multiplexing", "kernel_launch_params"]
+    workload_dir = test_utils.get_output_dir()
+    _ = binary_handler_profile_rocprof_compute(
+        config,
+        workload_dir,
+        options,
+        check_success=True,
+        roof=False,
+        app_name="app_laplace_eqn_iter",
+    )
+    counters_kernel_launch_params = test_utils.check_csv_files(
+        workload_dir, num_devices, num_kernels
+    )["pmc_perf.csv"]
+
+    assert are_stochastic_counters_similar(
+        [counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
+    )
+
+    test_utils.clean_output_dir(config["cleanup"], workload_dir)