[rocprofiler-compute] Counter accuracy tests and improvements for iteration multiplexing (#2011)
* Added laplace solver in samples * Add laplace eqn in CMake * Added counter accuracy test * Add iteration CLI arg for laplace eq * Unnest profile method * Missing counter warning * Updated insufficient kernel warning * Added reference for laplace equation * variable name change * Added comments for data comparison * Included scipy as test requirement * Added line number for ref * split stochastic and deterministic tests * Added order cli option for laplace_eqn * Install laplace eqn * Missing counter warning * Warn about missing kernels during analysis * Update tests * Split iteration multiplexing ctests * Updated warning * Incorporated copilot's suggestions
此提交包含在:
@@ -361,10 +361,18 @@ add_test(
|
||||
)
|
||||
|
||||
add_test(
|
||||
NAME test_profile_iteration_multiplexing
|
||||
NAME test_profile_iteration_multiplexing_1
|
||||
COMMAND
|
||||
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing
|
||||
--junitxml=tests/test_profile_iteration_multiplexing.xml ${COV_OPTION}
|
||||
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_1
|
||||
--junitxml=tests/test_profile_iteration_multiplexing_1.xml ${COV_OPTION}
|
||||
tests/test_profile_general.py ${WORKING_DIR_OPTION}
|
||||
)
|
||||
|
||||
add_test(
|
||||
NAME test_profile_iteration_multiplexing_2
|
||||
COMMAND
|
||||
${PYTHON_TEST_COMMAND} -m pytest -s -m iteration_multiplexing_2
|
||||
--junitxml=tests/test_profile_iteration_multiplexing_2.xml ${COV_OPTION}
|
||||
tests/test_profile_general.py ${WORKING_DIR_OPTION}
|
||||
)
|
||||
|
||||
@@ -382,7 +390,8 @@ set_tests_properties(
|
||||
test_profile_pc_sampling
|
||||
test_profile_sets_func
|
||||
test_profile_live_attach_detach
|
||||
test_profile_iteration_multiplexing
|
||||
test_profile_iteration_multiplexing_1
|
||||
test_profile_iteration_multiplexing_2
|
||||
PROPERTIES LABELS "profile" RESOURCE_GROUPS gpus:1 TIMEOUT 1800
|
||||
)
|
||||
|
||||
@@ -723,6 +732,7 @@ if(INSTALL_TESTS)
|
||||
tests/vsequential_access
|
||||
tests/occupancy
|
||||
tests/hip_dynamic_shared
|
||||
tests/laplace_eqn
|
||||
tests/mat_mul_max
|
||||
DESTINATION ${CMAKE_INSTALL_LIBEXECDIR}/${PROJECT_NAME}/tests
|
||||
COMPONENT tests
|
||||
|
||||
@@ -107,5 +107,7 @@ markers = [
|
||||
"roofline_2",
|
||||
"path",
|
||||
"sci_notion",
|
||||
"iteration_multiplexing",
|
||||
"iteration_multiplexing_1",
|
||||
"iteration_multiplexing_2",
|
||||
"iteration_multiplexing_stochastic",
|
||||
]
|
||||
|
||||
@@ -2,3 +2,4 @@ mock
|
||||
pytest
|
||||
pytest-cov
|
||||
pytest-xdist
|
||||
scipy
|
||||
|
||||
@@ -0,0 +1,217 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
// Reference: Finite Difference Methods for Ordinary and Partial Differential
|
||||
// Equations by Randall J. LeVeque, Page 69-71
|
||||
|
||||
#include <algorithm>
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
|
||||
// Helper macro for HIP error checking
|
||||
#define HIP_CHECK(call) \
|
||||
do { \
|
||||
hipError_t err = call; \
|
||||
if (err != hipSuccess) { \
|
||||
std::cerr << "HIP error: " << hipGetErrorString(err) << " at " \
|
||||
<< __FILE__ << ":" << __LINE__ << std::endl; \
|
||||
std::exit(EXIT_FAILURE); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
void initialize(float *U, int N) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
for (int j = 0; j < N; ++j) {
|
||||
float x = 2.0f * (float)i / (float)N - 1.0f;
|
||||
float y = 2.0f * (float)j / (float)N - 1.0f;
|
||||
U[i * N + j] = 1.0f / (1.0f + x * x + y * y); // Initial guess
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void jacobi_iteration(float *__restrict__ U_new,
|
||||
const float *__restrict__ U_old, int N) {
|
||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
int j = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
|
||||
int idx = j + i * N; // Flattened index for 2D grid
|
||||
|
||||
extern __shared__ float U[];
|
||||
int s_i = threadIdx.x + 1;
|
||||
int s_j = threadIdx.y + 1;
|
||||
|
||||
int tile_height = blockDim.y + 2;
|
||||
int s_idx = s_j + s_i * tile_height;
|
||||
|
||||
// Load data into shared memory with halo regions
|
||||
if (i < N && j < N) {
|
||||
U[s_idx] = U_old[idx];
|
||||
|
||||
// Load halo regions
|
||||
if (threadIdx.x == 0) {
|
||||
U[s_j] = (i > 0) ? U_old[(i - 1) * N + j] : U_old[(N - 1) * N + j];
|
||||
}
|
||||
|
||||
if (threadIdx.x == blockDim.x - 1) {
|
||||
U[s_j + (blockDim.x + 1) * tile_height] =
|
||||
(i < N - 1) ? U_old[(i + 1) * N + j] : U_old[j];
|
||||
}
|
||||
|
||||
if (threadIdx.y == 0) {
|
||||
U[s_i * tile_height] =
|
||||
(j > 0) ? U_old[i * N + (j - 1)] : U_old[i * N + (N - 1)];
|
||||
}
|
||||
|
||||
if (threadIdx.y == blockDim.y - 1) {
|
||||
U[s_i * tile_height + blockDim.y + 1] =
|
||||
(j < N - 1) ? U_old[i * N + (j + 1)] : U_old[i * N];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (i < N && j < N) {
|
||||
int east = s_idx - tile_height;
|
||||
int west = s_idx + tile_height;
|
||||
int north = s_idx + 1;
|
||||
int south = s_idx - 1;
|
||||
U_new[idx] = 0.25f * (U[east] + U[west] + U[north] + U[south]);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int iter = 1000;
|
||||
std::vector<int> launch_order;
|
||||
|
||||
for (int i = 0; i < argc; i++) {
|
||||
if (std::string(argv[i]) == "-i" && i + 1 < argc) {
|
||||
iter = std::atoi(argv[i + 1]);
|
||||
} else if (std::string(argv[i]) == "-o" && i + 1 < argc) {
|
||||
int n_order = std::atoi(argv[i + 1]);
|
||||
if ((i + n_order + 1) >= argc) {
|
||||
std::cerr << "Insufficient arguments for -o option" << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
} else {
|
||||
for (int j = 0; j < n_order; j++) {
|
||||
int kernel_id = std::atoi(argv[i + 2 + j]);
|
||||
if (kernel_id < 0 || kernel_id > 2) {
|
||||
std::cerr << "Invalid kernel ID: " << kernel_id << std::endl;
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
launch_order.push_back(kernel_id);
|
||||
}
|
||||
i += n_order;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (launch_order.empty()) {
|
||||
launch_order = {0, 1, 2};
|
||||
}
|
||||
|
||||
// Problem size
|
||||
const int n_small = 1024;
|
||||
const int n_mid = 2048;
|
||||
const int n_large = 4096;
|
||||
|
||||
// Define block and grid sizes
|
||||
dim3 blockSize_small(16, 16);
|
||||
dim3 gridSize_small((n_small + blockSize_small.x - 1) / blockSize_small.x,
|
||||
(n_small + blockSize_small.y - 1) / blockSize_small.y);
|
||||
dim3 blockSize_mid(24, 24);
|
||||
dim3 gridSize_mid((n_mid + blockSize_mid.x - 1) / blockSize_mid.x,
|
||||
(n_mid + blockSize_mid.y - 1) / blockSize_mid.y);
|
||||
dim3 blockSize_large(32, 32);
|
||||
dim3 gridSize_large((n_large + blockSize_large.x - 1) / blockSize_large.x,
|
||||
(n_large + blockSize_large.y - 1) / blockSize_large.y);
|
||||
|
||||
// Host memory pointers
|
||||
float *h_U_small = new float[n_small * n_small];
|
||||
float *h_U_mid = new float[n_mid * n_mid];
|
||||
float *h_U_large = new float[n_large * n_large];
|
||||
|
||||
// Initialize host arrays
|
||||
initialize(h_U_small, n_small);
|
||||
initialize(h_U_mid, n_mid);
|
||||
initialize(h_U_large, n_large);
|
||||
|
||||
// Device memory pointers
|
||||
float *d_U_small_new, *d_U_small_old;
|
||||
float *d_U_mid_new, *d_U_mid_old;
|
||||
float *d_U_large_new, *d_U_large_old;
|
||||
|
||||
// Allocate device memory
|
||||
HIP_CHECK(hipMalloc(&d_U_small_new, n_small * n_small * sizeof(float)));
|
||||
HIP_CHECK(hipMalloc(&d_U_small_old, n_small * n_small * sizeof(float)));
|
||||
|
||||
HIP_CHECK(hipMalloc(&d_U_mid_new, n_mid * n_mid * sizeof(float)));
|
||||
HIP_CHECK(hipMalloc(&d_U_mid_old, n_mid * n_mid * sizeof(float)));
|
||||
|
||||
HIP_CHECK(hipMalloc(&d_U_large_new, n_large * n_large * sizeof(float)));
|
||||
HIP_CHECK(hipMalloc(&d_U_large_old, n_large * n_large * sizeof(float)));
|
||||
|
||||
// Copy host arrays to device
|
||||
HIP_CHECK(hipMemcpy(d_U_small_old, h_U_small,
|
||||
n_small * n_small * sizeof(float),
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(d_U_mid_old, h_U_mid, n_mid * n_mid * sizeof(float),
|
||||
hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(d_U_large_old, h_U_large,
|
||||
n_large * n_large * sizeof(float),
|
||||
hipMemcpyHostToDevice));
|
||||
|
||||
std::vector<dim3> grid_sizes = {gridSize_small, gridSize_mid, gridSize_large};
|
||||
std::vector<dim3> block_sizes = {blockSize_small, blockSize_mid,
|
||||
blockSize_large};
|
||||
std::vector<float *> d_U_news = {d_U_small_new, d_U_mid_new, d_U_large_new};
|
||||
std::vector<float *> d_U_olds = {d_U_small_old, d_U_mid_old, d_U_large_old};
|
||||
std::vector<int> n_sizes = {n_small, n_mid, n_large};
|
||||
|
||||
// Perform Jacobi iterations
|
||||
for (int it = 0; it < iter; ++it) {
|
||||
int i = launch_order[it % launch_order.size()];
|
||||
jacobi_iteration<<<grid_sizes[i], block_sizes[i],
|
||||
(block_sizes[i].x + 2) * (block_sizes[i].y + 2) *
|
||||
sizeof(float)>>>(d_U_news[i], d_U_olds[i],
|
||||
n_sizes[i]);
|
||||
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
// Swap pointers
|
||||
std::swap(d_U_news[i], d_U_olds[i]);
|
||||
}
|
||||
|
||||
// Free device memory
|
||||
HIP_CHECK(hipFree(d_U_small_new));
|
||||
HIP_CHECK(hipFree(d_U_small_old));
|
||||
HIP_CHECK(hipFree(d_U_mid_new));
|
||||
HIP_CHECK(hipFree(d_U_mid_old));
|
||||
HIP_CHECK(hipFree(d_U_large_new));
|
||||
HIP_CHECK(hipFree(d_U_large_old));
|
||||
|
||||
delete[] h_U_small;
|
||||
delete[] h_U_mid;
|
||||
delete[] h_U_large;
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -430,6 +430,23 @@ class OmniAnalyze_Base:
|
||||
"with profiling data collected with iteration multiplexing.",
|
||||
)
|
||||
|
||||
# Check if any kernel's counters are missing due to iteration multiplexing
|
||||
if (
|
||||
self._profiling_config.get("iteration_multiplexing") is not None
|
||||
and self._profiling_config.get("kernels_with_missing_counters") is not None
|
||||
):
|
||||
missing_kernels = self._profiling_config.get(
|
||||
"kernels_with_missing_counters"
|
||||
)
|
||||
console_warning(
|
||||
"analysis",
|
||||
(
|
||||
"The following kernels have missing counter data "
|
||||
"due to iteration multiplexing and should be filtered out: "
|
||||
f"{', '.join(missing_kernels)}"
|
||||
),
|
||||
)
|
||||
|
||||
# initalize runs
|
||||
self._runs = self.initalize_runs()
|
||||
|
||||
|
||||
@@ -122,6 +122,37 @@ class RocProfCompute_Base:
|
||||
"./vcopy -n 1048576 -b 256"
|
||||
)
|
||||
|
||||
def detect_missing_counters(self, df: pd.DataFrame) -> None:
|
||||
"""Detect missing counter values in joined dataframe"""
|
||||
args = self.get_args()
|
||||
group_labels = ["Kernel_Name"]
|
||||
if args.join_type == "grid":
|
||||
group_labels.append("Grid_Size")
|
||||
|
||||
num_files = len(list(Path(args.path).glob("perfmon/*.txt")))
|
||||
kernels_with_missing_counters = []
|
||||
for _, groups in df.groupby(group_labels):
|
||||
if groups["Dispatch_ID"].nunique() < num_files:
|
||||
kernel_name = groups.iloc[0]["Kernel_Name"]
|
||||
kernels_with_missing_counters.append(kernel_name)
|
||||
|
||||
if kernels_with_missing_counters:
|
||||
kernels_with_missing_counters = list(set(kernels_with_missing_counters))
|
||||
console_warning(
|
||||
"join_prof",
|
||||
(
|
||||
f"Insufficient number of kernel calls for kernels: "
|
||||
f"{', '.join(kernels_with_missing_counters)} "
|
||||
f"to collect all counters using iteration multiplexing. "
|
||||
f"Please use kernel filtering and exclude the above kernels "
|
||||
f"or turn off iteration multiplexing."
|
||||
),
|
||||
)
|
||||
with open(f"{args.path}/profiling_config.yaml", "a") as f:
|
||||
yaml.dump(
|
||||
{"kernels_with_missing_counters": kernels_with_missing_counters}, f
|
||||
)
|
||||
|
||||
@demarcate
|
||||
def join_prof(self, out: Optional[str] = None) -> Optional[pd.DataFrame]:
|
||||
"""Manually join separated rocprof runs"""
|
||||
@@ -148,6 +179,10 @@ class RocProfCompute_Base:
|
||||
|
||||
console_debug(f"Created file: {output_file}")
|
||||
|
||||
if args.iteration_multiplexing is not None:
|
||||
df = pd.read_csv(output_file)
|
||||
self.detect_missing_counters(df)
|
||||
|
||||
# Delete results_*.csv files
|
||||
for file in result_files:
|
||||
Path(file).unlink()
|
||||
@@ -340,6 +375,11 @@ class RocProfCompute_Base:
|
||||
if "key" in df.columns:
|
||||
df = df.drop(columns=["key"])
|
||||
|
||||
console_debug("join_prof", "Checking for missing counter values...")
|
||||
|
||||
if args.iteration_multiplexing is not None:
|
||||
self.detect_missing_counters(df)
|
||||
|
||||
# save to file and delete old file(s)
|
||||
# skip if we're being called outside of rocprof-compute
|
||||
if isinstance(args.path, str):
|
||||
@@ -391,6 +431,62 @@ class RocProfCompute_Base:
|
||||
soc=self._soc,
|
||||
)
|
||||
|
||||
def profile(
|
||||
self,
|
||||
fnames: Union[list[Path], Path],
|
||||
options: Union[list[str], dict[str, Any]],
|
||||
total_runs: int = 1,
|
||||
) -> float:
|
||||
args = self.get_args()
|
||||
|
||||
if isinstance(fnames, list):
|
||||
console_log(
|
||||
"profiling", f"Current input files: {', '.join(map(str, fnames))}"
|
||||
)
|
||||
str_fnames = [str(fname) for fname in fnames]
|
||||
else:
|
||||
console_log("profiling", f"Current input file: {fnames}")
|
||||
str_fnames = str(fnames)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
|
||||
# Only 1-run case is permitted for attach/detach
|
||||
if (isinstance(options, list) and "--pid" in options) or (
|
||||
isinstance(options, dict)
|
||||
and (options.get("ROCPROF_ATTACH_PID") is not None)
|
||||
):
|
||||
if total_runs > 1:
|
||||
console_error(
|
||||
f"Cannot attach process for profiling as the requested "
|
||||
f"performance counters exceed the collection capacity of "
|
||||
f"single pass counter collection. The current setup of "
|
||||
f"requested counter blocks needs {total_runs} number of "
|
||||
f'passes. Please use "--block" or "--set" '
|
||||
f"to adjust or reduce the requested performance metrics!"
|
||||
)
|
||||
run_prof(
|
||||
fnames=str_fnames,
|
||||
profiler_options=options,
|
||||
workload_dir=args.path,
|
||||
mspec=self._soc._mspec,
|
||||
loglevel=args.loglevel,
|
||||
format_rocprof_output=args.format_rocprof_output,
|
||||
retain_rocpd_output=args.retain_rocpd_output,
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
duration = end_time - start_time
|
||||
|
||||
console_debug(
|
||||
f"The time of run_prof of {str_fnames} is {int(duration / 60)} min"
|
||||
f" {duration % 60} sec"
|
||||
)
|
||||
return duration
|
||||
else:
|
||||
console_error("Profiler not supported")
|
||||
return 0.0
|
||||
|
||||
@abstractmethod
|
||||
def run_profiling(self, version: str, prog: str) -> None:
|
||||
"""Run profiling."""
|
||||
@@ -516,57 +612,6 @@ class RocProfCompute_Base:
|
||||
else:
|
||||
console_debug(output)
|
||||
|
||||
def profile(
|
||||
fnames: Union[list[Path], Path], options: Union[list[str], dict[str, Any]]
|
||||
) -> float:
|
||||
if isinstance(fnames, list):
|
||||
console_log(
|
||||
"profiling", f"Current input files: {', '.join(map(str, fnames))}"
|
||||
)
|
||||
str_fnames = [str(fname) for fname in fnames]
|
||||
else:
|
||||
console_log("profiling", f"Current input file: {fnames}")
|
||||
str_fnames = str(fnames)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
if self.__profiler == "rocprofv3" or self.__profiler == "rocprofiler-sdk":
|
||||
# Only 1-run case is permitted for attach/detach
|
||||
if (isinstance(options, list) and "--pid" in options) or (
|
||||
isinstance(options, dict)
|
||||
and (options.get("ROCPROF_ATTACH_PID") is not None)
|
||||
):
|
||||
if total_runs > 1:
|
||||
console_error(
|
||||
f"Cannot attach process for profiling as the requested "
|
||||
f"performance counters exceed the collection capacity of "
|
||||
f"single pass counter collection. The current setup of "
|
||||
f"requested counter blocks needs {total_runs} number of "
|
||||
f'passes. Please use "--block" or "--set" '
|
||||
f"to adjust or reduce the requested performance metrics!"
|
||||
)
|
||||
run_prof(
|
||||
fnames=str_fnames,
|
||||
profiler_options=options,
|
||||
workload_dir=args.path,
|
||||
mspec=self._soc._mspec,
|
||||
loglevel=args.loglevel,
|
||||
format_rocprof_output=args.format_rocprof_output,
|
||||
retain_rocpd_output=args.retain_rocpd_output,
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
duration = end_time - start_time
|
||||
|
||||
console_debug(
|
||||
f"The time of run_prof of {fname} is {int(duration / 60)} min"
|
||||
f" {duration % 60} sec"
|
||||
)
|
||||
return duration
|
||||
else:
|
||||
console_error("Profiler not supported")
|
||||
return 0.0
|
||||
|
||||
if args.iteration_multiplexing is not None:
|
||||
console_log(
|
||||
"profiling", f"Iteration multiplexing: {args.iteration_multiplexing}"
|
||||
@@ -589,7 +634,7 @@ class RocProfCompute_Base:
|
||||
),
|
||||
)
|
||||
|
||||
profile(input_files, options)
|
||||
self.profile(input_files, options)
|
||||
else:
|
||||
console_log("profiling", "Iteration multiplexing: Disabled")
|
||||
|
||||
@@ -615,7 +660,7 @@ class RocProfCompute_Base:
|
||||
"pending first measurement...]"
|
||||
)
|
||||
|
||||
duration = profile(fname, options)
|
||||
duration = self.profile(fname, options, total_runs)
|
||||
total_profiling_time += duration
|
||||
|
||||
# Delete temporary native tool if created
|
||||
|
||||
@@ -59,3 +59,11 @@ set_target_properties(
|
||||
hip_dynamic_shared
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
set(LAPLACE_EQN_SOURCES ../sample/laplace_eqn.hip)
|
||||
set_source_files_properties(${LAPLACE_EQN_SOURCES} PROPERTIES LANGUAGE HIP)
|
||||
add_executable(laplace_eqn ${LAPLACE_EQN_SOURCES})
|
||||
set_target_properties(
|
||||
laplace_eqn
|
||||
PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_SOURCE_DIR}/tests
|
||||
)
|
||||
|
||||
@@ -31,9 +31,11 @@ import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import pytest
|
||||
import test_utils
|
||||
from scipy.stats import zscore
|
||||
|
||||
# Globals
|
||||
|
||||
@@ -64,6 +66,8 @@ config["app_1"] = ["./tests/vcopy", "-n", "1048576", "-b", "256", "-i", "3"]
|
||||
config["app_occupancy"] = ["./tests/occupancy"]
|
||||
config["app_mat_mul_max"] = ["./tests/mat_mul_max"]
|
||||
config["app_hip_dynamic_shared"] = ["./tests/hip_dynamic_shared"]
|
||||
config["app_laplace_eqn"] = ["./tests/laplace_eqn", "-i", "5000"]
|
||||
config["app_laplace_eqn_iter"] = ["./tests/laplace_eqn", "-i", "15000"]
|
||||
config["cleanup"] = True
|
||||
config["COUNTER_LOGGING"] = False
|
||||
config["METRIC_COMPARE"] = False
|
||||
@@ -460,6 +464,150 @@ def validate(test_name, workload_dir, file_dict, args=[]):
|
||||
baseline_compare_metric(test_name, workload_dir, args)
|
||||
|
||||
|
||||
def are_stochastic_counters_similar(test_dfs, baseline_df):
|
||||
"""
|
||||
Compares multiple test dataframes against a baseline dataframe to check
|
||||
if the stochastic counter values are similar. Returns True if all test dataframes
|
||||
have similar counter values to the baseline, otherwise returns False.
|
||||
"""
|
||||
group_labels = [
|
||||
"Kernel_Name",
|
||||
"Grid_Size",
|
||||
"Workgroup_Size",
|
||||
"LDS_Per_Workgroup",
|
||||
"Counter_Name",
|
||||
]
|
||||
|
||||
baseline_grouped = baseline_df.groupby(group_labels)
|
||||
tests_grouped = [df.groupby(group_labels) for df in test_dfs]
|
||||
|
||||
baseline_group_keys = set(baseline_grouped.groups.keys())
|
||||
tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
|
||||
|
||||
# Check if all test dataframes have the same group keys as the baseline
|
||||
if not all(baseline_group_keys == keys for keys in tests_group_keys):
|
||||
return False
|
||||
|
||||
stochastic_counter_patterns = list(
|
||||
map(
|
||||
re.compile,
|
||||
[
|
||||
".*REQ_sum$",
|
||||
".*REQ_.*_sum$",
|
||||
".*READ_sum$",
|
||||
".*WRITE_sum$",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
for group_key, baseline_group in baseline_grouped:
|
||||
test_groups = [
|
||||
test_grouped.get_group(group_key) for test_grouped in tests_grouped
|
||||
]
|
||||
|
||||
baseline_counters = baseline_group["Counter_Value"]
|
||||
test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
|
||||
|
||||
counter_name = group_key[4]
|
||||
|
||||
# Warmup values aren't ignored as they do not significantly impact
|
||||
# the analysis for stochastic counters and leaves too few data points
|
||||
# for baseline.
|
||||
if any(
|
||||
re.match(pattern, counter_name) for pattern in stochastic_counter_patterns
|
||||
):
|
||||
# Remove outliers using Z-score method
|
||||
z_score_threshold = 2.0
|
||||
|
||||
test_z_scores_list = [
|
||||
np.abs(zscore(test_counters)) for test_counters in test_counters_list
|
||||
]
|
||||
test_counters_list_trimmed = [
|
||||
test_counters[test_z_scores < z_score_threshold]
|
||||
for test_counters, test_z_scores in zip(
|
||||
test_counters_list, test_z_scores_list
|
||||
)
|
||||
]
|
||||
|
||||
baseline_mean = baseline_counters.mean()
|
||||
baseline_std = baseline_counters.std()
|
||||
upper_bound = baseline_mean + 3 * baseline_std
|
||||
lower_bound = baseline_mean - 3 * baseline_std
|
||||
|
||||
for test_counters in test_counters_list_trimmed:
|
||||
if test_counters.between(lower_bound, upper_bound).all() is False:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
def are_deterministic_counters_equal(test_dfs, baseline_df):
|
||||
"""
|
||||
Compares multiple test dataframes against a baseline dataframe to check
|
||||
if the deterministic counter values are equal. Returns True if all test dataframes
|
||||
have equal counter values to the baseline, otherwise returns False.
|
||||
"""
|
||||
group_labels = [
|
||||
"Kernel_Name",
|
||||
"Grid_Size",
|
||||
"Workgroup_Size",
|
||||
"LDS_Per_Workgroup",
|
||||
"Counter_Name",
|
||||
]
|
||||
|
||||
baseline_grouped = baseline_df.groupby(group_labels)
|
||||
tests_grouped = [df.groupby(group_labels) for df in test_dfs]
|
||||
|
||||
baseline_group_keys = set(baseline_grouped.groups.keys())
|
||||
tests_group_keys = [set(group.groups.keys()) for group in tests_grouped]
|
||||
|
||||
# Check if all test dataframes have the same group keys as the baseline
|
||||
if not all(baseline_group_keys == keys for keys in tests_group_keys):
|
||||
return False
|
||||
|
||||
deterministic_counter_patterns = list(
|
||||
map(
|
||||
re.compile,
|
||||
[
|
||||
"SQ_INSTS_.*",
|
||||
"SPI_CS\\d_NUM_THREADGROUPS",
|
||||
"SPI_CS\\d_WAVE",
|
||||
"SQ_WAVES",
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
for group_key, baseline_group in baseline_grouped:
|
||||
test_groups = [
|
||||
test_grouped.get_group(group_key) for test_grouped in tests_grouped
|
||||
]
|
||||
|
||||
baseline_counters = baseline_group["Counter_Value"]
|
||||
test_counters_list = [test_group["Counter_Value"] for test_group in test_groups]
|
||||
|
||||
counter_name = group_key[4]
|
||||
if any(
|
||||
re.match(pattern, counter_name)
|
||||
for pattern in deterministic_counter_patterns
|
||||
):
|
||||
if (
|
||||
all([
|
||||
test_counters.unique().size == 1
|
||||
for test_counters in test_counters_list
|
||||
])
|
||||
and baseline_counters.unique().size == 1
|
||||
and all([
|
||||
test_counters.values[0] == baseline_counters.values[0]
|
||||
for test_counters in test_counters_list
|
||||
])
|
||||
):
|
||||
continue
|
||||
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
# --
|
||||
# Start of profiling tests
|
||||
# --
|
||||
@@ -2367,7 +2515,7 @@ class TestSetsIntegration:
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing
|
||||
@pytest.mark.iteration_multiplexing_1
|
||||
def test_profiler_options(binary_handler_profile_rocprof_compute):
|
||||
options = ["--no-native-tool", "--iteration-multiplexing"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
@@ -2377,7 +2525,7 @@ def test_profiler_options(binary_handler_profile_rocprof_compute):
|
||||
assert code == 1
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing
|
||||
@pytest.mark.iteration_multiplexing_1
|
||||
def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
|
||||
options = ["--iteration-multiplexing"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
@@ -2407,7 +2555,7 @@ def test_iteration_multiplexing(binary_handler_profile_rocprof_compute):
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing
|
||||
@pytest.mark.iteration_multiplexing_1
|
||||
def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
|
||||
options = ["--iteration-multiplexing", "kernel"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
@@ -2437,7 +2585,7 @@ def test_iteration_multiplexing_kernel(binary_handler_profile_rocprof_compute):
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing
|
||||
@pytest.mark.iteration_multiplexing_1
|
||||
def test_iteration_multiplexing_kernel_launch_params(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
@@ -2467,3 +2615,97 @@ def test_iteration_multiplexing_kernel_launch_params(
|
||||
)
|
||||
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing_2
|
||||
def test_iteration_multiplexing_deterministic_counter_accuracy(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
|
||||
)
|
||||
counters_no_multiplexing = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
options = ["--iteration-multiplexing", "kernel"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_laplace_eqn_iter",
|
||||
)
|
||||
counters_kernel = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
options = ["--iteration-multiplexing", "kernel_launch_params"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_laplace_eqn_iter",
|
||||
)
|
||||
counters_kernel_launch_params = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
assert are_deterministic_counters_equal(
|
||||
[counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
|
||||
)
|
||||
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
|
||||
@pytest.mark.iteration_multiplexing_stochastic
|
||||
def test_iteration_multiplexing_stochastic_counter_accuracy(
|
||||
binary_handler_profile_rocprof_compute,
|
||||
):
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config, workload_dir, check_success=True, roof=False, app_name="app_laplace_eqn"
|
||||
)
|
||||
counters_no_multiplexing = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
options = ["--iteration-multiplexing", "kernel"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_laplace_eqn_iter",
|
||||
)
|
||||
counters_kernel = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
options = ["--iteration-multiplexing", "kernel_launch_params"]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
config,
|
||||
workload_dir,
|
||||
options,
|
||||
check_success=True,
|
||||
roof=False,
|
||||
app_name="app_laplace_eqn_iter",
|
||||
)
|
||||
counters_kernel_launch_params = test_utils.check_csv_files(
|
||||
workload_dir, num_devices, num_kernels
|
||||
)["pmc_perf.csv"]
|
||||
|
||||
assert are_stochastic_counters_similar(
|
||||
[counters_kernel, counters_kernel_launch_params], counters_no_multiplexing
|
||||
)
|
||||
|
||||
test_utils.clean_output_dir(config["cleanup"], workload_dir)
|
||||
|
||||
新增問題並參考
封鎖使用者