[rocprofiler-compute] Counter accuracy tests and improvements for iteration multiplexing (#2011)

* Added laplace solver in samples

* Add laplace eqn in CMake

* Added counter accuracy test

* Add iteration CLI arg for laplace eq

* Unnest profile method

* Missing counter warning

* Updated insufficient kernel warning

* Added reference for laplace equation

* variable name change

* Added comments for data comparison

* Included scipy as test requirement

* Added line number for ref

* split stochastic and deterministic tests

* Added order cli option for laplace_eqn

* Install laplace eqn

* Missing counter warning

* Warn about missing kernels during analysis

* Update tests

* Split iteration multiplexing ctests

* Updated warning

* Incorporated copilot's suggestions
此提交包含在:
abchoudh-amd
2025-12-17 18:26:39 +05:30
提交者 GitHub
父節點 c738e73d99
當前提交 6d9d880d31
共有 8 個檔案被更改,包括 604 行新增62 行删除
+14 -4
查看文件
@@ -361,10 +361,18 @@ add_test(
)
add_test(
NAME test_profile_iteration_multiplexing
NAME test_profile_iteration_multiplexing_1
COMMAND
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing
--junitxml=tests/test_profile_iteration_multiplexing.xml ${COV_OPTION}
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_1
--junitxml=tests/test_profile_iteration_multiplexing_1.xml ${COV_OPTION}
tests/test_profile_general.py ${WORKING_DIR_OPTION}
)
add_test(
NAME test_profile_iteration_multiplexing_2
COMMAND
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_2
--junitxml=tests/test_profile_iteration_multiplexing_2.xml ${COV_OPTION}
tests/test_profile_general.py ${WORKING_DIR_OPTION}
)
@@ -382,7 +390,8 @@ set_tests_properties(
test_profile_pc_sampling
test_profile_sets_func
test_profile_live_attach_detach
test_profile_iteration_multiplexing
test_profile_iteration_multiplexing_1
test_profile_iteration_multiplexing_2
PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800
)
@@ -723,6 +732,7 @@ if(INSTALL_TESTS)
tests/vsequential_access
tests/occupancy
tests/hip_dynamic_shared
tests/laplace_eqn
tests/mat_mul_max
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
COMPONENT tests
+3 -1
查看文件
@@ -107,5 +107,7 @@ markers = [
"roofline_2",
"path",
"sci_notion",
"iteration_multiplexing",
"iteration_multiplexing_1",
"iteration_multiplexing_2",
"iteration_multiplexing_stochastic",
]
+1
查看文件
@@ -2,3 +2,4 @@ mock
pytest
pytest-cov
pytest-xdist
scipy
+217
查看文件
@@ -0,0 +1,217 @@
// MIT License
//
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// Reference: Finite Difference Methods for Ordinary and Partial Differential
// Equations by Randall J. LeVeque, Page 69-71
#include <algorithm>
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
// Helper macro for HIP error checking
#define HIP_CHECK(call) \
do { \
hipError_t err = call; \
if (err != hipSuccess) { \
std::cerr << "HIP error: " << hipGetErrorString(err) << " at " \
<< __FILE__ << ":" << __LINE__ << std::endl; \
std::exit(EXIT_FAILURE); \
} \
} while (0)
void initialize(float *U, int N) {
for (int i = 0; i < N; ++i) {
for (int j = 0; j < N; ++j) {
float x = 2.0f * (float)i / (float)N - 1.0f;
float y = 2.0f * (float)j / (float)N - 1.0f;
U[i * N + j] = 1.0f / (1.0f + x * x + y * y); // Initial guess
}
}
}
__global__ void jacobi_iteration(float *__restrict__ U_new,
const float *__restrict__ U_old, int N) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
int idx = j + i * N; // Flattened index for 2D grid
extern __shared__ float U[];
int s_i = threadIdx.x + 1;
int s_j = threadIdx.y + 1;
int tile_height = blockDim.y + 2;
int s_idx = s_j + s_i * tile_height;
// Load data into shared memory with halo regions
if (i < N && j < N) {
U[s_idx] = U_old[idx];
// Load halo regions
if (threadIdx.x == 0) {
U[s_j] = (i > 0) ? U_old[(i - 1) * N + j] : U_old[(N - 1) * N + j];
}
if (threadIdx.x == blockDim.x - 1) {
U[s_j + (blockDim.x + 1) * tile_height] =
(i < N - 1) ? U_old[(i + 1) * N + j] : U_old[j];
}
if (threadIdx.y == 0) {
U[s_i * tile_height] =
(j > 0) ? U_old[i * N + (j - 1)] : U_old[i * N + (N - 1)];
}
if (threadIdx.y == blockDim.y - 1) {
U[s_i * tile_height + blockDim.y + 1] =
(j < N - 1) ? U_old[i * N + (j + 1)] : U_old[i * N];
}
}
__syncthreads();
if (i < N && j < N) {
int east = s_idx - tile_height;
int west = s_idx + tile_height;
int north = s_idx + 1;
int south = s_idx - 1;
U_new[idx] = 0.25f * (U[east] + U[west] + U[north] + U[south]);
}
}
int main(int argc, char *argv[]) {
int iter = 1000;
std::vector<int> launch_order;
for (int i = 0; i < argc; i++) {
if (std::string(argv[i]) == "-i" && i + 1 < argc) {
iter = std::atoi(argv[i + 1]);
} else if (std::string(argv[i]) == "-o" && i + 1 < argc) {
int n_order = std::atoi(argv[i + 1]);
if ((i + n_order + 1) >= argc) {
std::cerr << "Insufficient arguments for -o option" << std::endl;
return EXIT_FAILURE;
} else {
for (int j = 0; j < n_order; j++) {
int kernel_id = std::atoi(argv[i + 2 + j]);
if (kernel_id < 0 || kernel_id > 2) {
std::cerr << "Invalid kernel ID: " << kernel_id << std::endl;
return EXIT_FAILURE;
}
launch_order.push_back(kernel_id);
}
i += n_order;
}
}
}
if (launch_order.empty()) {
launch_order = {0, 1, 2};
}
// Problem size
const int n_small = 1024;
const int n_mid = 2048;
const int n_large = 4096;
// Define block and grid sizes
dim3 blockSize_small(16, 16);
dim3 gridSize_small((n_small + blockSize_small.x - 1) / blockSize_small.x,
(n_small + blockSize_small.y - 1) / blockSize_small.y);
dim3 blockSize_mid(24, 24);
dim3 gridSize_mid((n_mid + blockSize_mid.x - 1) / blockSize_mid.x,
(n_mid + blockSize_mid.y - 1) / blockSize_mid.y);
dim3 blockSize_large(32, 32);
dim3 gridSize_large((n_large + blockSize_large.x - 1) / blockSize_large.x,
(n_large + blockSize_large.y - 1) / blockSize_large.y);
// Host memory pointers
float *h_U_small = new float[n_small * n_small];
float *h_U_mid = new float[n_mid * n_mid];
float *h_U_large = new float[n_large * n_large];
// Initialize host arrays
initialize(h_U_small, n_small);
initialize(h_U_mid, n_mid);
initialize(h_U_large, n_large);
// Device memory pointers
float *d_U_small_new, *d_U_small_old;
float *d_U_mid_new, *d_U_mid_old;
float *d_U_large_new, *d_U_large_old;
// Allocate device memory
HIP_CHECK(hipMalloc(&d_U_small_new, n_small * n_small * sizeof(float)));
HIP_CHECK(hipMalloc(&d_U_small_old, n_small * n_small * sizeof(float)));
HIP_CHECK(hipMalloc(&d_U_mid_new, n_mid * n_mid * sizeof(float)));
HIP_CHECK(hipMalloc(&d_U_mid_old, n_mid * n_mid * sizeof(float)));
HIP_CHECK(hipMalloc(&d_U_large_new, n_large * n_large * sizeof(float)));
HIP_CHECK(hipMalloc(&d_U_large_old, n_large * n_large * sizeof(float)));
// Copy host arrays to device
HIP_CHECK(hipMemcpy(d_U_small_old, h_U_small,
n_small * n_small * sizeof(float),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_U_mid_old, h_U_mid, n_mid * n_mid * sizeof(float),
hipMemcpyHostToDevice));
HIP_CHECK(hipMemcpy(d_U_large_old, h_U_large,
n_large * n_large * sizeof(float),
hipMemcpyHostToDevice));
std::vector<dim3> grid_sizes = {gridSize_small, gridSize_mid, gridSize_large};
std::vector<dim3> block_sizes = {blockSize_small, blockSize_mid,
blockSize_large};
std::vector<float *> d_U_news = {d_U_small_new, d_U_mid_new, d_U_large_new};
std::vector<float *> d_U_olds = {d_U_small_old, d_U_mid_old, d_U_large_old};
std::vector<int> n_sizes = {n_small, n_mid, n_large};
// Perform Jacobi iterations
for (int it = 0; it < iter; ++it) {
int i = launch_order[it % launch_order.size()];
jacobi_iteration<<<grid_sizes[i], block_sizes[i],
(block_sizes[i].x + 2) * (block_sizes[i].y + 2) *
sizeof(float)>>>(d_U_news[i], d_U_olds[i],
n_sizes[i]);
HIP_CHECK(hipDeviceSynchronize());
// Swap pointers
std::swap(d_U_news[i], d_U_olds[i]);
}
// Free device memory
HIP_CHECK(hipFree(d_U_small_new));
HIP_CHECK(hipFree(d_U_small_old));
HIP_CHECK(hipFree(d_U_mid_new));
HIP_CHECK(hipFree(d_U_mid_old));
HIP_CHECK(hipFree(d_U_large_new));
HIP_CHECK(hipFree(d_U_large_old));
delete[] h_U_small;
delete[] h_U_mid;
delete[] h_U_large;
return 0;
}
+17
查看文件
@@ -430,6 +430,23 @@ class OmniAnalyze_Base:
"with profiling data collected with iteration multiplexing.",
)
# Check if any kernel's counters are missing due to iteration multiplexing
if (
self._profiling_config.get("iteration_multiplexing") is not None
and self._profiling_config.get("kernels_with_missing_counters") is not None
):
missing_kernels = self._profiling_config.get(
"kernels_with_missing_counters"
)
console_warning(
"analysis",
(
"The following kernels have missing counter data "
"due to iteration multiplexing and should be filtered out: "
f"{', '.join(missing_kernels)}"
),
)
# initalize runs
self._runs = self.initalize_runs()
+98 -53
查看文件
@@ -122,6 +122,37 @@ class RocProfCompute_Base:
"./vcopy -n 1048576 -b 256"
)
def detect_missing_counters(self, df: pd.DataFrame) -> None:
"""Detect missing counter values in joined dataframe"""
args = self.get_args()
group_labels = ["Kernel_Name"]
if args.join_type == "grid":
group_labels.append("Grid_Size")
num_files = len(list(Path(args.path).glob("perfmon/*.txt")))
kernels_with_missing_counters = []
for _, groups in df.groupby(group_labels):
if groups["Dispatch_ID"].nunique() < num_files:
kernel_name = groups.iloc[0]["Kernel_Name"]
kernels_with_missing_counters.append(kernel_name)
if kernels_with_missing_counters:
kernels_with_missing_counters = list(set(kernels_with_missing_counters))
console_warning(
"join_prof",
(
f"Insufficient number of kernel calls for kernels: "
f"{', '.join(kernels_with_missing_counters)} "
f"to collect all counters using iteration multiplexing. "
f"Please use kernel filtering and exclude the above kernels "
f"or turn off iteration multiplexing."
),
)
with open(f"{args.path}/profiling_config.yaml", "a") as f:
yaml.dump(
{"kernels_with_missing_counters": kernels_with_missing_counters}, f
)
@demarcate
def join_prof(self, out: Optional[str] = None) -> Optional[pd.DataFrame]:
"""Manually join separated rocprof runs"""
@@ -148,6 +179,10 @@ class RocProfCompute_Base:
console_debug(f"Created file: {output_file}")
if args.iteration_multiplexing is not None:
df = pd.read_csv(output_file)
self.detect_missing_counters(df)
# Delete results_*.csv files
for file in result_files:
Path(file).unlink()
@@ -340,6 +375,11 @@ class RocProfCompute_Base:
if "key" in df.columns:
df = df.drop(columns=["key"])
console_debug("join_prof", "Checking for missing counter values...")
if args.iteration_multiplexing is not None:
self.detect_missing_counters(df)
# save to file and delete old file(s)
# skip if we're being called outside of rocprof-compute
if isinstance(args.path, str):
@@ -391,6 +431,62 @@ class RocProfCompute_Base:
soc=self._soc,
)
def profile(
self,
fnames: Union[list[Path], Path],
options: Union[list[str], dict[str, Any]],
total_runs: int = 1,
) -> float:
args = self.get_args()
if isinstance(fnames, list):
console_log(
"profiling", f"Current input files: {', '.join(map(str, fnames))}"
)
str_fnames = [str(fname) for fname in fnames]
else:
console_log("profiling", f"Current input file: {fnames}")
str_fnames = str(fnames)
start_time = time.time()
if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
# Only 1-run case is permitted for attach/detach
if (isinstance(options, list) and "--pid" in options) or (
isinstance(options, dict)
and (options.get("ROCPROF_ATTACH_PID") is not None)
):
if total_runs > 1:
console_error(
f"Cannot attach process for profiling as the requested "
f"performance counters exceed the collection capacity of "
f"single pass counter collection. The current setup of "
f"requested counter blocks needs {total_runs} number of "
f'passes. Please use "--block" or "--set" '
f"to adjust or reduce the requested performance metrics!"
)
run_prof(
fnames=str_fnames,
profiler_options=options,
workload_dir=args.path,
mspec=self._soc._mspec,
loglevel=args.loglevel,
format_rocprof_output=args.format_rocprof_output,
retain_rocpd_output=args.retain_rocpd_output,
)
end_time = time.time()
duration = end_time - start_time
console_debug(
f"The time of run_prof of {str_fnames} is {int(duration / 60)} min"
f" {duration % 60} sec"
)
return duration
else:
console_error("Profiler not supported")
return 0.0
@abstractmethod
def run_profiling(self, version: str, prog: str) -> None:
"""Run profiling."""
@@ -516,57 +612,6 @@ class RocProfCompute_Base:
else:
console_debug(output)
def profile(
fnames: Union[list[Path], Path], options: Union[list[str], dict[str, Any]]
) -> float:
if isinstance(fnames, list):
console_log(
"profiling", f"Current input files: {', '.join(map(str, fnames))}"
)
str_fnames = [str(fname) for fname in fnames]
else:
console_log("profiling", f"Current input file: {fnames}")
str_fnames = str(fnames)
start_time = time.time()
if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
# Only 1-run case is permitted for attach/detach
if (isinstance(options, list) and "--pid" in options) or (
isinstance(options, dict)
and (options.get("ROCPROF_ATTACH_PID") is not None)
):
if total_runs > 1:
console_error(
f"Cannot attach process for profiling as the requested "
f"performance counters exceed the collection capacity of "
f"single pass counter collection. The current setup of "
f"requested counter blocks needs {total_runs} number of "
f'passes. Please use "--block" or "--set" '
f"to adjust or reduce the requested performance metrics!"
)
run_prof(
fnames=str_fnames,
profiler_options=options,
workload_dir=args.path,
mspec=self._soc._mspec,
loglevel=args.loglevel,
format_rocprof_output=args.format_rocprof_output,
retain_rocpd_output=args.retain_rocpd_output,
)
end_time = time.time()
duration = end_time - start_time
console_debug(
f"The time of run_prof of {fname} is {int(duration / 60)} min"
f" {duration % 60} sec"
)
return duration
else:
console_error("Profiler not supported")
return 0.0
if args.iteration_multiplexing is not None:
console_log(
"profiling", f"Iteration multiplexing: {args.iteration_multiplexing}"
@@ -589,7 +634,7 @@ class RocProfCompute_Base:
),
)
profile(input_files, options)
self.profile(input_files, options)
else:
console_log("profiling", "Iteration multiplexing: Disabled")
@@ -615,7 +660,7 @@ class RocProfCompute_Base:
"pending first measurement...]"
)
duration = profile(fname, options)
duration = self.profile(fname, options, total_runs)
total_profiling_time += duration
# Delete temporary native tool if created
+8
查看文件
@@ -59,3 +59,11 @@ set_target_properties(
hip_dynamic_shared
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
set(LAPLACE_EQN_SOURCES ../sample/laplace_eqn.hip)
set_source_files_properties(${LAPLACE_EQN_SOURCES} PROPERTIES LANGUAGE HIP)
add_executable(laplace_eqn ${LAPLACE_EQN_SOURCES})
set_target_properties(
laplace_eqn
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
)
+246 -4
查看文件
@@ -31,9 +31,11 @@ import subprocess
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import pytest
import test_utils
from scipy.stats import zscore
# Globals
@@ -64,6 +66,8 @@ config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"]
config["app_occupancy"] = ["./tests/occupancy"]
config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
config["cleanup"] = True
config["COUNTER_LOGGING"] = False
config["METRIC_COMPARE"] = False
@@ -460,6 +464,150 @@ def validate(test_name, workload_dir, file_dict, args=[]):
baseline_compare_metric(test_name, workload_dir, args)
def are_stochastic_counters_similar(test_dfs, baseline_df):
"""
Compares multiple test dataframes against a baseline dataframe to check
if the stochastic counter values are similar. Returns True if all test dataframes
have similar counter values to the baseline, otherwise returns False.
"""
group_labels = [
"Kernel_Name",
"Grid_Size",
"Workgroup_Size",
"LDS_Per_Workgroup",
"Counter_Name",
]
baseline_grouped = baseline_df.groupby(group_labels)
tests_grouped = [df.groupby(group_labels) for df in test_dfs]
baseline_group_keys = set(baseline_grouped.groups.keys())
tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
# Check if all test dataframes have the same group keys as the baseline
if not all(baseline_group_keys == keys for keys in tests_group_keys):
return False
stochastic_counter_patterns = list(
map(
re.compile,
[
".*REQ_sum$",
".*REQ_.*_sum$",
".*READ_sum$",
".*WRITE_sum$",
],
)
)
for group_key, baseline_group in baseline_grouped:
test_groups = [
test_grouped.get_group(group_key) for test_grouped in tests_grouped
]
baseline_counters = baseline_group["Counter_Value"]
test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
counter_name = group_key[4]
# Warmup values aren't ignored as they do not significantly impact
# the analysis for stochastic counters and leaves too few data points
# for baseline.
if any(
re.match(pattern, counter_name) for pattern in stochastic_counter_patterns
):
# Remove outliers using Z-score method
z_score_threshold = 2.0
test_z_scores_list = [
np.abs(zscore(test_counters)) for test_counters in test_counters_list
]
test_counters_list_trimmed = [
test_counters[test_z_scores < z_score_threshold]
for test_counters, test_z_scores in zip(
test_counters_list, test_z_scores_list
)
]
baseline_mean = baseline_counters.mean()
baseline_std = baseline_counters.std()
upper_bound = baseline_mean + 3 * baseline_std
lower_bound = baseline_mean - 3 * baseline_std
for test_counters in test_counters_list_trimmed:
if test_counters.between(lower_bound, upper_bound).all() is False:
return False
return True
def are_deterministic_counters_equal(test_dfs, baseline_df):
"""
Compares multiple test dataframes against a baseline dataframe to check
if the deterministic counter values are equal. Returns True if all test dataframes
have equal counter values to the baseline, otherwise returns False.
"""
group_labels = [
"Kernel_Name",
"Grid_Size",
"Workgroup_Size",
"LDS_Per_Workgroup",
"Counter_Name",
]
baseline_grouped = baseline_df.groupby(group_labels)
tests_grouped = [df.groupby(group_labels) for df in test_dfs]
baseline_group_keys = set(baseline_grouped.groups.keys())
tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
# Check if all test dataframes have the same group keys as the baseline
if not all(baseline_group_keys == keys for keys in tests_group_keys):
return False
deterministic_counter_patterns = list(
map(
re.compile,
[
"SQ_INSTS_.*",
"SPI_CS\\d_NUM_THREADGROUPS",
"SPI_CS\\d_WAVE",
"SQ_WAVES",
],
)
)
for group_key, baseline_group in baseline_grouped:
test_groups = [
test_grouped.get_group(group_key) for test_grouped in tests_grouped
]
baseline_counters = baseline_group["Counter_Value"]
test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
counter_name = group_key[4]
if any(
re.match(pattern, counter_name)
for pattern in deterministic_counter_patterns
):
if (
all([
test_counters.unique().size == 1
for test_counters in test_counters_list
])
and baseline_counters.unique().size == 1
and all([
test_counters.values[0] == baseline_counters.values[0]
for test_counters in test_counters_list
])
):
continue
return False
return True
# --
# Start of profiling tests
# --
@@ -2367,7 +2515,7 @@ class TestSetsIntegration:
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.iteration_multiplexing
@pytest.mark.iteration_multiplexing_1
def test_profiler_options(binary_handler_profile_rocprof_compute):
options = ["--no-native-tool", "--iteration-multiplexing"]
workload_dir = test_utils.get_output_dir()
@@ -2377,7 +2525,7 @@ def test_profiler_options(binary_handler_profile_rocprof_compute):
assert code == 1
@pytest.mark.iteration_multiplexing
@pytest.mark.iteration_multiplexing_1
def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
options = ["--iteration-multiplexing"]
workload_dir = test_utils.get_output_dir()
@@ -2407,7 +2555,7 @@ def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.iteration_multiplexing
@pytest.mark.iteration_multiplexing_1
def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
options = ["--iteration-multiplexing", "kernel"]
workload_dir = test_utils.get_output_dir()
@@ -2437,7 +2585,7 @@ def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.iteration_multiplexing
@pytest.mark.iteration_multiplexing_1
def test_iteration_multiplexing_kernel_launch_params(
binary_handler_profile_rocprof_compute,
):
@@ -2467,3 +2615,97 @@ def test_iteration_multiplexing_kernel_launch_params(
)
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.iteration_multiplexing_2
def test_iteration_multiplexing_deterministic_counter_accuracy(
binary_handler_profile_rocprof_compute,
):
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
)
counters_no_multiplexing = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
options = ["--iteration-multiplexing", "kernel"]
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=True,
roof=False,
app_name="app_laplace_eqn_iter",
)
counters_kernel = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
options = ["--iteration-multiplexing", "kernel_launch_params"]
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=True,
roof=False,
app_name="app_laplace_eqn_iter",
)
counters_kernel_launch_params = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
assert are_deterministic_counters_equal(
[counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
)
test_utils.clean_output_dir(config["cleanup"], workload_dir)
@pytest.mark.iteration_multiplexing_stochastic
def test_iteration_multiplexing_stochastic_counter_accuracy(
binary_handler_profile_rocprof_compute,
):
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
)
counters_no_multiplexing = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
options = ["--iteration-multiplexing", "kernel"]
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=True,
roof=False,
app_name="app_laplace_eqn_iter",
)
counters_kernel = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
options = ["--iteration-multiplexing", "kernel_launch_params"]
workload_dir = test_utils.get_output_dir()
_ = binary_handler_profile_rocprof_compute(
config,
workload_dir,
options,
check_success=True,
roof=False,
app_name="app_laplace_eqn_iter",
)
counters_kernel_launch_params = test_utils.check_csv_files(
workload_dir, num_devices, num_kernels
)["pmc_perf.csv"]
assert are_stochastic_counters_similar(
[counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
)
test_utils.clean_output_dir(config["cleanup"], workload_dir)