change interval for host_trap in unit test to adapt to single kernel (#1064)
Este commit está contenido en:
@@ -601,7 +601,8 @@ add_custom_target(
|
||||
# Create VERSION.sha file
|
||||
COMMAND git -C ${PROJECT_SOURCE_DIR} rev-parse HEAD > VERSION.sha
|
||||
# Build standalone binary
|
||||
# NOTE: --no-deployment-flag=self-execution is used to avoid self-execution and fork
|
||||
# NOTE: --no-deployment-flag=self-execution is used to avoid self-execution
|
||||
# and fork
|
||||
# bombs as explained in
|
||||
# https://nuitka.net/user-documentation/common-issue-solutions.html#fork-bombs-self-execution
|
||||
COMMAND
|
||||
|
||||
@@ -6,6 +6,18 @@
|
||||
#define TILE_SIZE 32 // Maximum block size: 32 x 32 = 1024 threads/block
|
||||
#define N 4096 // Matrix size: 4096 x 4096 (~67M elements)
|
||||
|
||||
// Helper macro for HIP error checking.
// Evaluates `call` exactly once. If the result is not hipSuccess, prints the
// HIP error string together with the file and line of the call site to
// std::cerr and terminates the process via std::exit(EXIT_FAILURE).
// The argument is parenthesized and the status is stored under a
// macro-private name so that an expression which itself mentions a variable
// called `err` is not silently shadowed by the macro's local.
#define HIP_CHECK(call)                                                     \
    do {                                                                    \
        const hipError_t hip_check_err_ = (call);                           \
        if (hip_check_err_ != hipSuccess) {                                 \
            std::cerr << "HIP error: " << hipGetErrorString(hip_check_err_) \
                      << " at " << __FILE__ << ":" << __LINE__              \
                      << std::endl;                                         \
            std::exit(EXIT_FAILURE);                                        \
        }                                                                   \
    } while (0)
|
||||
|
||||
__global__ void matMulKernel(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C, int width) {
|
||||
__shared__ float tileA[TILE_SIZE][TILE_SIZE];
|
||||
__shared__ float tileB[TILE_SIZE][TILE_SIZE];
|
||||
@@ -15,6 +27,7 @@ __global__ void matMulKernel(const float* __restrict__ A, const float* __restric
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
// Loop over tiles of input matrices
|
||||
for (int t = 0; t < width / TILE_SIZE; ++t) {
|
||||
tileA[threadIdx.y][threadIdx.x] = A[row * width + t * TILE_SIZE + threadIdx.x];
|
||||
tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_SIZE + threadIdx.y) * width + col];
|
||||
@@ -43,12 +56,12 @@ int main() {
|
||||
h_B[i] = static_cast<float>((i + 1) % 100) * 0.01f;
|
||||
}
|
||||
|
||||
hipMalloc(&d_A, size);
|
||||
hipMalloc(&d_B, size);
|
||||
hipMalloc(&d_C, size);
|
||||
HIP_CHECK(hipMalloc(&d_A, size));
|
||||
HIP_CHECK(hipMalloc(&d_B, size));
|
||||
HIP_CHECK(hipMalloc(&d_C, size));
|
||||
|
||||
hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice);
|
||||
hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice);
|
||||
HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice));
|
||||
|
||||
dim3 blockDim(TILE_SIZE, TILE_SIZE); // 32 x 32 = 1024 threads
|
||||
dim3 gridDim(N / TILE_SIZE, N / TILE_SIZE); // 128 x 128 = 16,384 thread blocks
|
||||
@@ -59,15 +72,16 @@ int main() {
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
matMulKernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, N);
|
||||
hipDeviceSynchronize();
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipDeviceSynchronize());
|
||||
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double> elapsed = end - start;
|
||||
std::cout << "Execution time: " << elapsed.count() << " seconds\n";
|
||||
|
||||
hipFree(d_A);
|
||||
hipFree(d_B);
|
||||
hipFree(d_C);
|
||||
HIP_CHECK(hipFree(d_A));
|
||||
HIP_CHECK(hipFree(d_B));
|
||||
HIP_CHECK(hipFree(d_C));
|
||||
delete[] h_A;
|
||||
delete[] h_B;
|
||||
|
||||
|
||||
@@ -1174,7 +1174,10 @@ def apply_dispatch_filter(df: pd.DataFrame, workload: schema.Workload) -> pd.Dat
|
||||
if int(dispatch_id) >= len(df): # subtract 2 bc of the two header rows
|
||||
console_error("analysis", f"{dispatch_id} is an invalid dispatch id.")
|
||||
|
||||
if isinstance(workload.filter_dispatch_ids[0], str) and ">" in workload.filter_dispatch_ids[0]:
|
||||
if (
|
||||
isinstance(workload.filter_dispatch_ids[0], str)
|
||||
and ">" in workload.filter_dispatch_ids[0]
|
||||
):
|
||||
dispatch_match = re.match(r"\> (\d+)", workload.filter_dispatch_ids[0])
|
||||
df = df[
|
||||
df[schema.PMC_PERF_FILE_PREFIX]["Dispatch_ID"]
|
||||
|
||||
@@ -1817,7 +1817,7 @@ def test_pc_sampling_host_trap(binary_handler_profile_rocprof_compute):
|
||||
"--pc-sampling-method",
|
||||
"host_trap",
|
||||
"--pc-sampling-interval",
|
||||
"1048576",
|
||||
"256",
|
||||
]
|
||||
workload_dir = test_utils.get_output_dir()
|
||||
_ = binary_handler_profile_rocprof_compute(
|
||||
|
||||
Referencia en una nueva incidencia
Block a user