254 lines
8.9 KiB
C++
254 lines
8.9 KiB
C++
// MIT License
|
|
//
|
|
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
// of this software and associated documentation files (the "Software"), to deal
|
|
// in the Software without restriction, including without limitation the rights
|
|
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
// copies of the Software, and to permit persons to whom the Software is
|
|
// furnished to do so, subject to the following conditions:
|
|
//
|
|
// The above copyright notice and this permission notice shall be included in all
|
|
// copies or substantial portions of the Software.
|
|
//
|
|
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
// SOFTWARE.
|
|
|
|
/**
|
|
* @file tests/bin/code-object-multi-threaded/main.cpp
|
|
*
|
|
* @brief Multi-threaded code object loading stress test for rocprofiler-sdk
|
|
*
|
|
* This test verifies thread-safety when multiple threads concurrently load
|
|
* HIP modules on different GPUs, which triggers concurrent calls to
|
|
* executable_freeze_internal and code object tracing callbacks.
|
|
*
|
|
* Data races tested:
|
|
* - user_data map access (now protected with Synchronized)
|
|
* - contexts vector assignment (now protected with wlock)
|
|
* - is_shutdown flag (now atomic)
|
|
* - end_notified/beg_notified flags (now atomic)
|
|
* - executable_destroy serialization (now protected with mutex)
|
|
*/
|
|
|
|
#include <hip/hip_runtime.h>
|
|
#include <hip/hip_runtime_api.h>
|
|
#include <hip/hiprtc.h>
|
|
|
|
#include <chrono>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
#include <iostream>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
#define HIP_CHECK(expr) \
|
|
do \
|
|
{ \
|
|
hipError_t _err = (expr); \
|
|
if(_err != hipSuccess) \
|
|
{ \
|
|
std::cerr << "HIP error " << hipGetErrorString(_err) << " at " << __FILE__ << ":" \
|
|
<< __LINE__ << "\n"; \
|
|
std::abort(); \
|
|
} \
|
|
} while(0)
|
|
|
|
// Simple kernel code as string for runtime compilation/loading
|
|
static const char* kernel_code = R"(
|
|
extern "C" __global__ void dynamic_kernel_a(int* data, int n) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx < n) data[idx] = idx * 2;
|
|
}
|
|
|
|
extern "C" __global__ void dynamic_kernel_b(float* data, int n) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx < n) data[idx] = idx * 3.14f;
|
|
}
|
|
|
|
extern "C" __global__ void dynamic_kernel_c(int* data, int n) {
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx < n) data[idx] = idx * idx;
|
|
}
|
|
)";
|
|
|
|
// Static kernels as fallback
|
|
__global__ void
|
|
test_kernel_a()
|
|
{
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx == 0) printf("Kernel A\n");
|
|
}
|
|
|
|
__global__ void
|
|
test_kernel_b(int* data)
|
|
{
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
data[idx] = idx * 2;
|
|
}
|
|
|
|
__global__ void
|
|
test_kernel_c(float* data, int n)
|
|
{
|
|
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
|
if(idx < n) data[idx] = idx * 3.14f;
|
|
}
|
|
|
|
void
|
|
gpu_worker(int gpu_id, int iterations)
|
|
{
|
|
// Set device for this thread
|
|
HIP_CHECK(hipSetDevice(gpu_id));
|
|
|
|
const int n = 1024;
|
|
|
|
// Each thread loads modules in parallel - this triggers concurrent
|
|
// calls to executable_freeze_internal and code object callbacks
|
|
for(int i = 0; i < iterations; ++i)
|
|
{
|
|
// Use hiprtc to compile kernel source at runtime, then load the compiled module
|
|
// This triggers parallel module loading across threads
|
|
hiprtcProgram prog{};
|
|
hiprtcResult rtc_result =
|
|
hiprtcCreateProgram(&prog, kernel_code, "dynamic_kernels.cu", 0, nullptr, nullptr);
|
|
|
|
bool use_dynamic_loading = (rtc_result == HIPRTC_SUCCESS);
|
|
|
|
if(use_dynamic_loading)
|
|
{
|
|
// Compile the program
|
|
rtc_result = hiprtcCompileProgram(prog, 0, nullptr);
|
|
|
|
if(rtc_result != HIPRTC_SUCCESS)
|
|
{
|
|
size_t log_size{0};
|
|
hiprtcGetProgramLogSize(prog, &log_size);
|
|
if(log_size > 1)
|
|
{
|
|
auto log = std::vector<char>(log_size);
|
|
hiprtcGetProgramLog(prog, log.data());
|
|
std::cerr << "Compilation failed:\n" << log.data() << "\n";
|
|
}
|
|
use_dynamic_loading = false;
|
|
}
|
|
}
|
|
|
|
if(use_dynamic_loading)
|
|
{
|
|
// Get the compiled code
|
|
size_t code_size{0};
|
|
hiprtcGetCodeSize(prog, &code_size);
|
|
|
|
std::vector<char> code(code_size);
|
|
hiprtcGetCode(prog, code.data());
|
|
hiprtcDestroyProgram(&prog);
|
|
|
|
// Load the compiled module - THIS is what triggers the parallel code object loading
|
|
hipModule_t module;
|
|
hipError_t err = hipModuleLoadData(&module, code.data());
|
|
|
|
if(err == hipSuccess)
|
|
{
|
|
// Get functions from the module
|
|
hipFunction_t func_a, func_b, func_c;
|
|
HIP_CHECK(hipModuleGetFunction(&func_a, module, "dynamic_kernel_a"));
|
|
HIP_CHECK(hipModuleGetFunction(&func_b, module, "dynamic_kernel_b"));
|
|
HIP_CHECK(hipModuleGetFunction(&func_c, module, "dynamic_kernel_c"));
|
|
|
|
// Allocate device memory
|
|
int* d_int_data{nullptr};
|
|
float* d_float_data{nullptr};
|
|
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
|
|
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
|
|
|
|
// Launch kernels using module functions
|
|
int n_arg = n;
|
|
void* args_a[] = {&d_int_data, &n_arg};
|
|
HIP_CHECK(hipModuleLaunchKernel(
|
|
func_a, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_a, nullptr));
|
|
|
|
void* args_b[] = {&d_float_data, &n_arg};
|
|
HIP_CHECK(hipModuleLaunchKernel(
|
|
func_b, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_b, nullptr));
|
|
|
|
void* args_c[] = {&d_int_data, &n_arg};
|
|
HIP_CHECK(hipModuleLaunchKernel(
|
|
func_c, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_c, nullptr));
|
|
}
|
|
else
|
|
{
|
|
use_dynamic_loading = false;
|
|
}
|
|
}
|
|
|
|
if(!use_dynamic_loading)
|
|
{
|
|
// Fallback to regular kernel launches if module loading fails
|
|
int* d_int_data{nullptr};
|
|
float* d_float_data{nullptr};
|
|
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
|
|
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
|
|
|
|
test_kernel_a<<<1, 64>>>();
|
|
HIP_CHECK(hipGetLastError());
|
|
|
|
test_kernel_b<<<(n + 255) / 256, 256>>>(d_int_data);
|
|
HIP_CHECK(hipGetLastError());
|
|
|
|
test_kernel_c<<<(n + 255) / 256, 256>>>(d_float_data, n);
|
|
HIP_CHECK(hipGetLastError());
|
|
}
|
|
}
|
|
}
|
|
|
|
int
|
|
main()
|
|
{
|
|
std::cout << "Multi-Threaded Code Object Loading Test\n";
|
|
|
|
int num_gpus = 0;
|
|
HIP_CHECK(hipGetDeviceCount(&num_gpus));
|
|
|
|
if(num_gpus == 0)
|
|
{
|
|
std::cerr << "No GPUs found. Test requires at least one GPU.\n";
|
|
return 1;
|
|
}
|
|
|
|
std::cout << "Found " << num_gpus << " GPU(s)\n";
|
|
|
|
int num_threads =
|
|
std::thread::hardware_concurrency(); // More threads than GPUs to increase contention
|
|
int threads_per_gpu = num_threads / num_gpus;
|
|
std::cout << "Launching " << num_threads << " threads\n";
|
|
|
|
constexpr int iterations = 3;
|
|
|
|
// Create worker threads
|
|
auto threads = std::vector<std::thread>{};
|
|
threads.reserve(num_threads);
|
|
|
|
for(int gpu_id = 0; gpu_id < num_gpus; ++gpu_id)
|
|
{
|
|
for(int thread_id = 0; thread_id < threads_per_gpu; ++thread_id)
|
|
{
|
|
threads.emplace_back(gpu_worker, gpu_id, iterations);
|
|
}
|
|
}
|
|
|
|
// Wait for all threads to complete
|
|
for(auto& t : threads)
|
|
{
|
|
t.join();
|
|
}
|
|
|
|
std::cout << "Test completed successfully\n";
|
|
return 0;
|
|
}
|