diff --git a/projects/rocprofiler-compute/CMakeLists.txt b/projects/rocprofiler-compute/CMakeLists.txt index a8e3fe6257..3c00861802 100644 --- a/projects/rocprofiler-compute/CMakeLists.txt +++ b/projects/rocprofiler-compute/CMakeLists.txt @@ -361,10 +361,18 @@ add_test( ) add_test( - NAME test_profile_iteration_multiplexing + NAME test_profile_iteration_multiplexing_1 COMMAND - ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing - --junitxml=tests/test_profile_iteration_multiplexing.xml ${COV_OPTION} + ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_1 + --junitxml=tests/test_profile_iteration_multiplexing_1.xml ${COV_OPTION} + tests/test_profile_general.py ${WORKING_DIR_OPTION} +) + +add_test( + NAME test_profile_iteration_multiplexing_2 + COMMAND + ${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_2 + --junitxml=tests/test_profile_iteration_multiplexing_2.xml ${COV_OPTION} tests/test_profile_general.py ${WORKING_DIR_OPTION} ) @@ -382,7 +390,8 @@ set_tests_properties( test_profile_pc_sampling test_profile_sets_func test_profile_live_attach_detach - test_profile_iteration_multiplexing + test_profile_iteration_multiplexing_1 + test_profile_iteration_multiplexing_2 PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800 ) @@ -723,6 +732,7 @@ if(INSTALL_TESTS) tests/vsequential_access tests/occupancy tests/hip_dynamic_shared + tests/laplace_eqn tests/mat_mul_max DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests COMPONENT tests diff --git a/projects/rocprofiler-compute/pyproject.toml b/projects/rocprofiler-compute/pyproject.toml index 5544f61c85..1c86207a5b 100644 --- a/projects/rocprofiler-compute/pyproject.toml +++ b/projects/rocprofiler-compute/pyproject.toml @@ -107,5 +107,7 @@ markers = [ "roofline_2", "path", "sci_notion", - "iteration_multiplexing", + "iteration_multiplexing_1", + "iteration_multiplexing_2", + "iteration_multiplexing_stochastic", ] diff --git a/projects/rocprofiler-compute/requirements-test.txt b/projects/rocprofiler-compute/requirements-test.txt index 80f89998ab..bef5a34594 100644 --- a/projects/rocprofiler-compute/requirements-test.txt +++ b/projects/rocprofiler-compute/requirements-test.txt @@ -2,3 +2,4 @@ mock pytest pytest-cov pytest-xdist +scipy diff --git a/projects/rocprofiler-compute/sample/laplace_eqn.hip b/projects/rocprofiler-compute/sample/laplace_eqn.hip new file mode 100644 index 0000000000..60bd5bc889 --- /dev/null +++ b/projects/rocprofiler-compute/sample/laplace_eqn.hip @@ -0,0 +1,217 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// Reference: Finite Difference Methods for Ordinary and Partial Differential +// Equations by Randall J. LeVeque, Page 69-71 + +#include +#include +#include +#include + +// Helper macro for HIP error checking +#define HIP_CHECK(call) \ + do { \ + hipError_t err = call; \ + if (err != hipSuccess) { \ + std::cerr << "HIP error: " << hipGetErrorString(err) << " at " \ + << __FILE__ << ":" << __LINE__ << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ + } while (0) + +void initialize(float *U, int N) { + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + float x = 2.0f * (float)i / (float)N - 1.0f; + float y = 2.0f * (float)j / (float)N - 1.0f; + U[i * N + j] = 1.0f / (1.0f + x * x + y * y); // Initial guess + } + } +} + +__global__ void jacobi_iteration(float *__restrict__ U_new, + const float *__restrict__ U_old, int N) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int j = blockIdx.y * blockDim.y + threadIdx.y; + + int idx = j + i * N; // Flattened index for 2D grid + + extern __shared__ float U[]; + int s_i = threadIdx.x + 1; + int s_j = threadIdx.y + 1; + + int tile_height = blockDim.y + 2; + int s_idx = s_j + s_i * tile_height; + + // Load data into shared memory with halo regions + if (i < N && j < N) { + U[s_idx] = U_old[idx]; + + // Load halo regions + if (threadIdx.x == 0) { + U[s_j] = (i > 0) ? U_old[(i - 1) * N + j] : U_old[(N - 1) * N + j]; + } + + if (threadIdx.x == blockDim.x - 1) { + U[s_j + (blockDim.x + 1) * tile_height] = + (i < N - 1) ? U_old[(i + 1) * N + j] : U_old[j]; + } + + if (threadIdx.y == 0) { + U[s_i * tile_height] = + (j > 0) ? U_old[i * N + (j - 1)] : U_old[i * N + (N - 1)]; + } + + if (threadIdx.y == blockDim.y - 1) { + U[s_i * tile_height + blockDim.y + 1] = + (j < N - 1) ? U_old[i * N + (j + 1)] : U_old[i * N]; + } + } + + __syncthreads(); + + if (i < N && j < N) { + int east = s_idx - tile_height; + int west = s_idx + tile_height; + int north = s_idx + 1; + int south = s_idx - 1; + U_new[idx] = 0.25f * (U[east] + U[west] + U[north] + U[south]); + } +} + +int main(int argc, char *argv[]) { + int iter = 1000; + std::vector launch_order; + + for (int i = 0; i < argc; i++) { + if (std::string(argv[i]) == "-i" && i + 1 < argc) { + iter = std::atoi(argv[i + 1]); + } else if (std::string(argv[i]) == "-o" && i + 1 < argc) { + int n_order = std::atoi(argv[i + 1]); + if ((i + n_order + 1) >= argc) { + std::cerr << "Insufficient arguments for -o option" << std::endl; + return EXIT_FAILURE; + } else { + for (int j = 0; j < n_order; j++) { + int kernel_id = std::atoi(argv[i + 2 + j]); + if (kernel_id < 0 || kernel_id > 2) { + std::cerr << "Invalid kernel ID: " << kernel_id << std::endl; + return EXIT_FAILURE; + } + launch_order.push_back(kernel_id); + } + i += n_order; + } + } + } + + if (launch_order.empty()) { + launch_order = {0, 1, 2}; + } + + // Problem size + const int n_small = 1024; + const int n_mid = 2048; + const int n_large = 4096; + + // Define block and grid sizes + dim3 blockSize_small(16, 16); + dim3 gridSize_small((n_small + blockSize_small.x - 1) / blockSize_small.x, + (n_small + blockSize_small.y - 1) / blockSize_small.y); + dim3 blockSize_mid(24, 24); + dim3 gridSize_mid((n_mid + blockSize_mid.x - 1) / blockSize_mid.x, + (n_mid + blockSize_mid.y - 1) / blockSize_mid.y); + dim3 blockSize_large(32, 32); + dim3 gridSize_large((n_large + blockSize_large.x - 1) / blockSize_large.x, + (n_large + blockSize_large.y - 1) / blockSize_large.y); + + // Host memory pointers + float *h_U_small = new float[n_small * n_small]; + float *h_U_mid = new float[n_mid * n_mid]; + float *h_U_large = new float[n_large * n_large]; + + // Initialize host arrays + initialize(h_U_small, n_small); + initialize(h_U_mid, n_mid); + initialize(h_U_large, n_large); + + // Device memory pointers + float *d_U_small_new, *d_U_small_old; + float *d_U_mid_new, *d_U_mid_old; + float *d_U_large_new, *d_U_large_old; + + // Allocate device memory + HIP_CHECK(hipMalloc(&d_U_small_new, n_small * n_small * sizeof(float))); + HIP_CHECK(hipMalloc(&d_U_small_old, n_small * n_small * sizeof(float))); + + HIP_CHECK(hipMalloc(&d_U_mid_new, n_mid * n_mid * sizeof(float))); + HIP_CHECK(hipMalloc(&d_U_mid_old, n_mid * n_mid * sizeof(float))); + + HIP_CHECK(hipMalloc(&d_U_large_new, n_large * n_large * sizeof(float))); + HIP_CHECK(hipMalloc(&d_U_large_old, n_large * n_large * sizeof(float))); + + // Copy host arrays to device + HIP_CHECK(hipMemcpy(d_U_small_old, h_U_small, + n_small * n_small * sizeof(float), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_U_mid_old, h_U_mid, n_mid * n_mid * sizeof(float), + hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(d_U_large_old, h_U_large, + n_large * n_large * sizeof(float), + hipMemcpyHostToDevice)); + + std::vector grid_sizes = {gridSize_small, gridSize_mid, gridSize_large}; + std::vector block_sizes = {blockSize_small, blockSize_mid, + blockSize_large}; + std::vector d_U_news = {d_U_small_new, d_U_mid_new, d_U_large_new}; + std::vector d_U_olds = {d_U_small_old, d_U_mid_old, d_U_large_old}; + std::vector n_sizes = {n_small, n_mid, n_large}; + + // Perform Jacobi iterations + for (int it = 0; it < iter; ++it) { + int i = launch_order[it % launch_order.size()]; + jacobi_iteration<<>>(d_U_news[i], d_U_olds[i], + n_sizes[i]); + + HIP_CHECK(hipDeviceSynchronize()); + + // Swap pointers + std::swap(d_U_news[i], d_U_olds[i]); + } + + // Free device memory + HIP_CHECK(hipFree(d_U_small_new)); + HIP_CHECK(hipFree(d_U_small_old)); + HIP_CHECK(hipFree(d_U_mid_new)); + HIP_CHECK(hipFree(d_U_mid_old)); + HIP_CHECK(hipFree(d_U_large_new)); + HIP_CHECK(hipFree(d_U_large_old)); + + delete[] h_U_small; + delete[] h_U_mid; + delete[] h_U_large; + + return 0; +} \ No newline at end of file diff --git a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py index 34fde73ce2..66d0def36e 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_analyze/analysis_base.py @@ -430,6 +430,23 @@ class OmniAnalyze_Base: "with profiling data collected with iteration multiplexing.", ) + # Check if any kernel's counters are missing due to iteration multiplexing + if ( + self._profiling_config.get("iteration_multiplexing") is not None + and self._profiling_config.get("kernels_with_missing_counters") is not None + ): + missing_kernels = self._profiling_config.get( + "kernels_with_missing_counters" + ) + console_warning( + "analysis", + ( + "The following kernels have missing counter data " + "due to iteration multiplexing and should be filtered out: " + f"{', '.join(missing_kernels)}" + ), + ) + # initalize runs self._runs = self.initalize_runs() diff --git a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py index 52ff39065b..ab1915d837 100644 --- a/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py +++ b/projects/rocprofiler-compute/src/rocprof_compute_profile/profiler_base.py @@ -122,6 +122,37 @@ class RocProfCompute_Base: "./vcopy -n 1048576 -b 256" ) + def detect_missing_counters(self, df: pd.DataFrame) -> None: + """Detect missing counter values in joined dataframe""" + args = self.get_args() + group_labels = ["Kernel_Name"] + if args.join_type == "grid": + group_labels.append("Grid_Size") + + num_files = len(list(Path(args.path).glob("perfmon/*.txt"))) + kernels_with_missing_counters = [] + for _, groups in df.groupby(group_labels): + if groups["Dispatch_ID"].nunique() < num_files: + kernel_name = groups.iloc[0]["Kernel_Name"] + kernels_with_missing_counters.append(kernel_name) + + if kernels_with_missing_counters: + kernels_with_missing_counters = list(set(kernels_with_missing_counters)) + console_warning( + "join_prof", + ( + f"Insufficient number of kernel calls for kernels: " + f"{', '.join(kernels_with_missing_counters)} " + f"to collect all counters using iteration multiplexing. " + f"Please use kernel filtering and exclude the above kernels " + f"or turn off iteration multiplexing." + ), + ) + with open(f"{args.path}/profiling_config.yaml", "a") as f: + yaml.dump( + {"kernels_with_missing_counters": kernels_with_missing_counters}, f + ) + @demarcate def join_prof(self, out: Optional[str] = None) -> Optional[pd.DataFrame]: """Manually join separated rocprof runs""" @@ -148,6 +179,10 @@ class RocProfCompute_Base: console_debug(f"Created file: {output_file}") + if args.iteration_multiplexing is not None: + df = pd.read_csv(output_file) + self.detect_missing_counters(df) + # Delete results_*.csv files for file in result_files: Path(file).unlink() @@ -340,6 +375,11 @@ class RocProfCompute_Base: if "key" in df.columns: df = df.drop(columns=["key"]) + console_debug("join_prof", "Checking for missing counter values...") + + if args.iteration_multiplexing is not None: + self.detect_missing_counters(df) + # save to file and delete old file(s) # skip if we're being called outside of rocprof-compute if isinstance(args.path, str): @@ -391,6 +431,62 @@ class RocProfCompute_Base: soc=self._soc, ) + def profile( + self, + fnames: Union[list[Path], Path], + options: Union[list[str], dict[str, Any]], + total_runs: int = 1, + ) -> float: + args = self.get_args() + + if isinstance(fnames, list): + console_log( + "profiling", f"Current input files: {', '.join(map(str, fnames))}" + ) + str_fnames = [str(fname) for fname in fnames] + else: + console_log("profiling", f"Current input file: {fnames}") + str_fnames = str(fnames) + + start_time = time.time() + + if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk": + # Only 1-run case is permitted for attach/detach + if (isinstance(options, list) and "--pid" in options) or ( + isinstance(options, dict) + and (options.get("ROCPROF_ATTACH_PID") is not None) + ): + if total_runs > 1: + console_error( + f"Cannot attach process for profiling as the requested " + f"performance counters exceed the collection capacity of " + f"single pass counter collection. The current setup of " + f"requested counter blocks needs {total_runs} number of " + f'passes. Please use "--block" or "--set" ' + f"to adjust or reduce the requested performance metrics!" + ) + run_prof( + fnames=str_fnames, + profiler_options=options, + workload_dir=args.path, + mspec=self._soc._mspec, + loglevel=args.loglevel, + format_rocprof_output=args.format_rocprof_output, + retain_rocpd_output=args.retain_rocpd_output, + ) + + end_time = time.time() + duration = end_time - start_time + + console_debug( + f"The time of run_prof of {str_fnames} is {int(duration / 60)} min" + f" {duration % 60} sec" + ) + return duration + else: + console_error("Profiler not supported") + return 0.0 + @abstractmethod def run_profiling(self, version: str, prog: str) -> None: """Run profiling.""" @@ -516,57 +612,6 @@ class RocProfCompute_Base: else: console_debug(output) - def profile( - fnames: Union[list[Path], Path], options: Union[list[str], dict[str, Any]] - ) -> float: - if isinstance(fnames, list): - console_log( - "profiling", f"Current input files: {', '.join(map(str, fnames))}" - ) - str_fnames = [str(fname) for fname in fnames] - else: - console_log("profiling", f"Current input file: {fnames}") - str_fnames = str(fnames) - - start_time = time.time() - - if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk": - # Only 1-run case is permitted for attach/detach - if (isinstance(options, list) and "--pid" in options) or ( - isinstance(options, dict) - and (options.get("ROCPROF_ATTACH_PID") is not None) - ): - if total_runs > 1: - console_error( - f"Cannot attach process for profiling as the requested " - f"performance counters exceed the collection capacity of " - f"single pass counter collection. The current setup of " - f"requested counter blocks needs {total_runs} number of " - f'passes. Please use "--block" or "--set" ' - f"to adjust or reduce the requested performance metrics!" - ) - run_prof( - fnames=str_fnames, - profiler_options=options, - workload_dir=args.path, - mspec=self._soc._mspec, - loglevel=args.loglevel, - format_rocprof_output=args.format_rocprof_output, - retain_rocpd_output=args.retain_rocpd_output, - ) - - end_time = time.time() - duration = end_time - start_time - - console_debug( - f"The time of run_prof of {fname} is {int(duration / 60)} min" - f" {duration % 60} sec" - ) - return duration - else: - console_error("Profiler not supported") - return 0.0 - if args.iteration_multiplexing is not None: console_log( "profiling", f"Iteration multiplexing: {args.iteration_multiplexing}" @@ -589,7 +634,7 @@ class RocProfCompute_Base: ), ) - profile(input_files, options) + self.profile(input_files, options) else: console_log("profiling", "Iteration multiplexing: Disabled") @@ -615,7 +660,7 @@ class RocProfCompute_Base: "pending first measurement...]" ) - duration = profile(fname, options) + duration = self.profile(fname, options, total_runs) total_profiling_time += duration # Delete temporary native tool if created diff --git a/projects/rocprofiler-compute/tests/CMakeLists.txt b/projects/rocprofiler-compute/tests/CMakeLists.txt index 40f712dc42..60bb856f14 100644 --- a/projects/rocprofiler-compute/tests/CMakeLists.txt +++ b/projects/rocprofiler-compute/tests/CMakeLists.txt @@ -59,3 +59,11 @@ set_target_properties( hip_dynamic_shared PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests ) + +set(LAPLACE_EQN_SOURCES ../sample/laplace_eqn.hip) +set_source_files_properties(${LAPLACE_EQN_SOURCES} PROPERTIES LANGUAGE HIP) +add_executable(laplace_eqn ${LAPLACE_EQN_SOURCES}) +set_target_properties( + laplace_eqn + PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests +) diff --git a/projects/rocprofiler-compute/tests/test_profile_general.py b/projects/rocprofiler-compute/tests/test_profile_general.py index 7c46ac50de..eaa4cfbfbf 100644 --- a/projects/rocprofiler-compute/tests/test_profile_general.py +++ b/projects/rocprofiler-compute/tests/test_profile_general.py @@ -31,9 +31,11 @@ import subprocess import sys from pathlib import Path +import numpy as np import pandas as pd import pytest import test_utils +from scipy.stats import zscore # Globals @@ -64,6 +66,8 @@ config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"] config["app_occupancy"] = ["./tests/occupancy"] config["app_mat_mul_max"] = ["./tests/mat_mul_max"] config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"] +config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"] +config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"] config["cleanup"] = True config["COUNTER_LOGGING"] = False config["METRIC_COMPARE"] = False @@ -460,6 +464,150 @@ def validate(test_name, workload_dir, file_dict, args=[]): baseline_compare_metric(test_name, workload_dir, args) +def are_stochastic_counters_similar(test_dfs, baseline_df): + """ + Compares multiple test dataframes against a baseline dataframe to check + if the stochastic counter values are similar. Returns True if all test dataframes + have similar counter values to the baseline, otherwise returns False. + """ + group_labels = [ + "Kernel_Name", + "Grid_Size", + "Workgroup_Size", + "LDS_Per_Workgroup", + "Counter_Name", + ] + + baseline_grouped = baseline_df.groupby(group_labels) + tests_grouped = [df.groupby(group_labels) for df in test_dfs] + + baseline_group_keys = set(baseline_grouped.groups.keys()) + tests_group_keys = [set(group.groups.keys()) for group in tests_grouped] + + # Check if all test dataframes have the same group keys as the baseline + if not all(baseline_group_keys == keys for keys in tests_group_keys): + return False + + stochastic_counter_patterns = list( + map( + re.compile, + [ + ".*REQ_sum$", + ".*REQ_.*_sum$", + ".*READ_sum$", + ".*WRITE_sum$", + ], + ) + ) + + for group_key, baseline_group in baseline_grouped: + test_groups = [ + test_grouped.get_group(group_key) for test_grouped in tests_grouped + ] + + baseline_counters = baseline_group["Counter_Value"] + test_counters_list = [test_group["Counter_Value"] for test_group in test_groups] + + counter_name = group_key[4] + + # Warmup values aren't ignored as they do not significantly impact + # the analysis for stochastic counters and leaves too few data points + # for baseline. + if any( + re.match(pattern, counter_name) for pattern in stochastic_counter_patterns + ): + # Remove outliers using Z-score method + z_score_threshold = 2.0 + + test_z_scores_list = [ + np.abs(zscore(test_counters)) for test_counters in test_counters_list + ] + test_counters_list_trimmed = [ + test_counters[test_z_scores < z_score_threshold] + for test_counters, test_z_scores in zip( + test_counters_list, test_z_scores_list + ) + ] + + baseline_mean = baseline_counters.mean() + baseline_std = baseline_counters.std() + upper_bound = baseline_mean + 3 * baseline_std + lower_bound = baseline_mean - 3 * baseline_std + + for test_counters in test_counters_list_trimmed: + if test_counters.between(lower_bound, upper_bound).all() is False: + return False + + return True + + +def are_deterministic_counters_equal(test_dfs, baseline_df): + """ + Compares multiple test dataframes against a baseline dataframe to check + if the deterministic counter values are equal. Returns True if all test dataframes + have equal counter values to the baseline, otherwise returns False. + """ + group_labels = [ + "Kernel_Name", + "Grid_Size", + "Workgroup_Size", + "LDS_Per_Workgroup", + "Counter_Name", + ] + + baseline_grouped = baseline_df.groupby(group_labels) + tests_grouped = [df.groupby(group_labels) for df in test_dfs] + + baseline_group_keys = set(baseline_grouped.groups.keys()) + tests_group_keys = [set(group.groups.keys()) for group in tests_grouped] + + # Check if all test dataframes have the same group keys as the baseline + if not all(baseline_group_keys == keys for keys in tests_group_keys): + return False + + deterministic_counter_patterns = list( + map( + re.compile, + [ + "SQ_INSTS_.*", + "SPI_CS\\d_NUM_THREADGROUPS", + "SPI_CS\\d_WAVE", + "SQ_WAVES", + ], + ) + ) + + for group_key, baseline_group in baseline_grouped: + test_groups = [ + test_grouped.get_group(group_key) for test_grouped in tests_grouped + ] + + baseline_counters = baseline_group["Counter_Value"] + test_counters_list = [test_group["Counter_Value"] for test_group in test_groups] + + counter_name = group_key[4] + if any( + re.match(pattern, counter_name) + for pattern in deterministic_counter_patterns + ): + if ( + all([ + test_counters.unique().size == 1 + for test_counters in test_counters_list + ]) + and baseline_counters.unique().size == 1 + and all([ + test_counters.values[0] == baseline_counters.values[0] + for test_counters in test_counters_list + ]) + ): + continue + + return False + + return True + + # -- # Start of profiling tests # -- @@ -2367,7 +2515,7 @@ class TestSetsIntegration: test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.iteration_multiplexing +@pytest.mark.iteration_multiplexing_1 def test_profiler_options(binary_handler_profile_rocprof_compute): options = ["--no-native-tool", "--iteration-multiplexing"] workload_dir = test_utils.get_output_dir() @@ -2377,7 +2525,7 @@ def test_profiler_options(binary_handler_profile_rocprof_compute): assert code == 1 -@pytest.mark.iteration_multiplexing +@pytest.mark.iteration_multiplexing_1 def test_iteration_multiplexing(binary_handler_profile_rocprof_compute): options = ["--iteration-multiplexing"] workload_dir = test_utils.get_output_dir() @@ -2407,7 +2555,7 @@ def test_iteration_multiplexing(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.iteration_multiplexing +@pytest.mark.iteration_multiplexing_1 def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute): options = ["--iteration-multiplexing", "kernel"] workload_dir = test_utils.get_output_dir() @@ -2437,7 +2585,7 @@ def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute): test_utils.clean_output_dir(config["cleanup"], workload_dir) -@pytest.mark.iteration_multiplexing +@pytest.mark.iteration_multiplexing_1 def test_iteration_multiplexing_kernel_launch_params( binary_handler_profile_rocprof_compute, ): @@ -2467,3 +2615,97 @@ def test_iteration_multiplexing_kernel_launch_params( ) test_utils.clean_output_dir(config["cleanup"], workload_dir) + + +@pytest.mark.iteration_multiplexing_2 +def test_iteration_multiplexing_deterministic_counter_accuracy( + binary_handler_profile_rocprof_compute, +): + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn" + ) + counters_no_multiplexing = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + options = ["--iteration-multiplexing", "kernel"] + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + roof=False, + app_name="app_laplace_eqn_iter", + ) + counters_kernel = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + options = ["--iteration-multiplexing", "kernel_launch_params"] + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + roof=False, + app_name="app_laplace_eqn_iter", + ) + counters_kernel_launch_params = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + assert are_deterministic_counters_equal( + [counters_kernel, counters_kernel_launch_params], counters_no_multiplexing + ) + + test_utils.clean_output_dir(config["cleanup"], workload_dir) + + +@pytest.mark.iteration_multiplexing_stochastic +def test_iteration_multiplexing_stochastic_counter_accuracy( + binary_handler_profile_rocprof_compute, +): + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn" + ) + counters_no_multiplexing = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + options = ["--iteration-multiplexing", "kernel"] + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + roof=False, + app_name="app_laplace_eqn_iter", + ) + counters_kernel = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + options = ["--iteration-multiplexing", "kernel_launch_params"] + workload_dir = test_utils.get_output_dir() + _ = binary_handler_profile_rocprof_compute( + config, + workload_dir, + options, + check_success=True, + roof=False, + app_name="app_laplace_eqn_iter", + ) + counters_kernel_launch_params = test_utils.check_csv_files( + workload_dir, num_devices, num_kernels + )["pmc_perf.csv"] + + assert are_stochastic_counters_similar( + [counters_kernel, counters_kernel_launch_params], counters_no_multiplexing + ) + + test_utils.clean_output_dir(config["cleanup"], workload_dir)