Fixing Code Object Data Race and Thread Safety & Adding validation test (#2014)

This commit is contained in:
Ammar ELWazir
2025-11-26 20:28:52 -06:00
zatwierdzone przez GitHub
rodzic 5b75ec6a09
commit ed42157c31
11 zmienionych plików z 679 dodań i 92 usunięć
@@ -378,7 +378,14 @@ using code_object_unload_array_t = std::vector<hsa::code_object_unload>;
std::vector<hsa::code_object_unload>
shutdown(hsa_executable_t executable);
bool is_shutdown = false;
std::atomic<bool> is_shutdown{false};
auto&
get_destroy_mutex()
{
static auto _v = std::mutex{};
return _v;
}
auto*
get_executables()
@@ -733,7 +740,8 @@ get_unloaded_code_objects(hsa_executable_t executable)
{
auto _unloaded = std::vector<hsa::code_object_unload>{};
if(!is_shutdown && get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects)
if(!is_shutdown.load(std::memory_order_acquire) &&
get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects)
get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
executable, code_object_unload_callback, &_unloaded);
@@ -837,7 +845,7 @@ executable_freeze_internal(hsa_executable_t executable)
if(!ctxs.empty())
{
code_obj_vec->rlock([](const code_object_array_t& data) {
code_obj_vec->wlock([](code_object_array_t& data) {
auto tidx = common::get_tid();
// set the contexts for each code object
for(const auto& ditr : data)
@@ -864,8 +872,10 @@ executable_freeze_internal(hsa_executable_t executable)
// invoke callback
auto& cb_data =
citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
auto& user_data = ditr->user_data[citr];
cb_data.callback(record, &user_data, cb_data.data);
ditr->user_data.wlock([&](auto& user_data_map) {
auto& user_data = user_data_map[citr];
cb_data.callback(record, &user_data, cb_data.data);
});
}
}
@@ -889,52 +899,57 @@ executable_freeze_internal(hsa_executable_t executable)
// invoke callback
auto& cb_data =
citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
auto& user_data = sitr->user_data[citr];
cb_data.callback(record, &user_data, cb_data.data);
sitr->user_data.wlock([&](auto& user_data_map) {
auto& user_data = user_data_map[citr];
cb_data.callback(record, &user_data, cb_data.data);
std::string device_name =
CHECK_NOTNULL(get_hip_register_data())
->rlock([sym_data](
std::string device_name =
CHECK_NOTNULL(get_hip_register_data())
->rlock(
[sym_data](
const hip::hip_register_data& register_data) {
const auto& sym_map =
register_data.kernel_symbol_device_map;
const auto it = sym_map.find(*CHECK_NOTNULL(
common::get_string_entry(sym_data.kernel_name)));
if(it != sym_map.end()) return it->second;
return std::string();
});
// Does not have a host function, skip
if(device_name.empty()) continue;
auto host_data =
CHECK_NOTNULL(get_hip_register_data())
->rlock([device_name](
const hip::hip_register_data& register_data) {
// Add check for out of range here
const auto it =
register_data.host_function_map.find(device_name);
if(it == register_data.host_function_map.end())
{
return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{};
}
return it->second;
});
// when kernel_symbol_device_map kernels are not present in
// host_function_map, skip.
if(host_data.device_function == nullptr) continue;
host_data.code_object_id = sym_data.code_object_id;
host_data.kernel_id = sym_data.kernel_id;
host_data.host_function_id = ++get_host_function_id();
auto hip_record = rocprofiler_callback_tracing_record_t{
.context_id = rocprofiler_context_id_t{citr->context_idx},
.thread_id = tidx,
.correlation_id = rocprofiler_correlation_id_t{},
.kind = CODE_OBJECT_KIND,
.operation = CODE_OBJECT_HOST_SYMBOL,
.phase = ROCPROFILER_CALLBACK_PHASE_LOAD,
.payload = static_cast<void*>(&host_data)};
const auto& sym_map =
register_data.kernel_symbol_device_map;
const auto it = sym_map.find(
*CHECK_NOTNULL(common::get_string_entry(
sym_data.kernel_name)));
if(it != sym_map.end()) return it->second;
return std::string();
});
// Does not have a host function, skip
if(device_name.empty()) return;
auto host_data =
CHECK_NOTNULL(get_hip_register_data())
->rlock([device_name](const hip::hip_register_data&
register_data) {
// Add check for out of range here
const auto it =
register_data.host_function_map.find(
device_name);
if(it == register_data.host_function_map.end())
{
return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{};
}
return it->second;
});
// when kernel_symbol_device_map kernels are not present in
// host_function_map, skip.
if(host_data.device_function == nullptr) return;
host_data.code_object_id = sym_data.code_object_id;
host_data.kernel_id = sym_data.kernel_id;
host_data.host_function_id = ++get_host_function_id();
auto hip_record = rocprofiler_callback_tracing_record_t{
.context_id = rocprofiler_context_id_t{citr->context_idx},
.thread_id = tidx,
.correlation_id = rocprofiler_correlation_id_t{},
.kind = CODE_OBJECT_KIND,
.operation = CODE_OBJECT_HOST_SYMBOL,
.phase = ROCPROFILER_CALLBACK_PHASE_LOAD,
.payload = static_cast<void*>(&host_data)};
// invoke callback
cb_data.callback(hip_record, &user_data, cb_data.data);
// invoke callback
cb_data.callback(hip_record, &user_data, cb_data.data);
});
}
}
}
@@ -964,7 +979,13 @@ executable_freeze(hsa_executable_t executable, const char* options)
hsa_status_t
executable_destroy(hsa_executable_t executable)
{
if(is_shutdown) return HSA_STATUS_SUCCESS;
// Serialize all executable_destroy calls to prevent:
// 1. Concurrent access to code objects in shutdown()
// 2. Use-after-free when multiple threads destroy same executable
// 3. Race on end_notified flags (now atomic, but still need serialization for callbacks)
auto _lk = std::unique_lock{get_destroy_mutex()};
if(is_shutdown.load(std::memory_order_acquire)) return HSA_STATUS_SUCCESS;
auto _unloaded = shutdown(executable);
@@ -1098,9 +1119,11 @@ shutdown(hsa_executable_t executable)
.payload = static_cast<void*>(&itr.object->rocp_data)};
// invoke callback
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
auto& user_data = itr.object->user_data.at(citr);
cb_data.callback(record, &user_data, cb_data.data);
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
itr.object->user_data.wlock([&](auto& user_data_map) {
auto& user_data = user_data_map.at(citr);
cb_data.callback(record, &user_data, cb_data.data);
});
}
}
@@ -1123,9 +1146,11 @@ shutdown(hsa_executable_t executable)
.payload = static_cast<void*>(&sitr->rocp_data)};
// invoke callback
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
auto& user_data = sitr->user_data.at(citr);
cb_data.callback(record, &user_data, cb_data.data);
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
sitr->user_data.wlock([&](auto& user_data_map) {
auto& user_data = user_data_map.at(citr);
cb_data.callback(record, &user_data, cb_data.data);
});
}
}
}
@@ -1226,7 +1251,8 @@ get_kernel_id(uint64_t kernel_object)
void
finalize()
{
if(is_shutdown || !get_executables() || !get_code_objects()) return;
if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects())
return;
CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) {
auto tmp = edata;
@@ -1237,13 +1263,14 @@ finalize()
CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); });
is_shutdown = true;
is_shutdown.store(true, std::memory_order_release);
}
void
iterate_loaded_code_objects(code_object_iterator_t&& func)
{
if(is_shutdown || !get_executables() || !get_code_objects()) return;
if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects())
return;
CHECK_NOTNULL(get_code_objects())
->rlock(
[](const code_object_array_t& data, code_object_iterator_t&& func_v) {
@@ -49,15 +49,18 @@ code_object::operator=(code_object&& rhs) noexcept
{
if(this != &rhs)
{
beg_notified = rhs.beg_notified;
end_notified = rhs.end_notified;
beg_notified.store(rhs.beg_notified.load());
end_notified.store(rhs.end_notified.load());
uri = rhs.uri;
hsa_executable = rhs.hsa_executable;
hsa_code_object = rhs.hsa_code_object;
rocp_data = rhs.rocp_data;
user_data = std::move(rhs.user_data);
rocp_data.uri = (uri) ? uri->c_str() : nullptr;
symbols = std::move(rhs.symbols);
// Manually move user_data by extracting and inserting under locks
rhs.user_data.wlock([this](auto& rhs_map) {
this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); });
});
rocp_data.uri = (uri) ? uri->c_str() : nullptr;
symbols = std::move(rhs.symbols);
}
return *this;
@@ -22,12 +22,14 @@
#pragma once
#include "lib/common/synchronized.hpp"
#include "lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <atomic>
#include <cstdint>
#include <string>
#include <unordered_map>
@@ -39,11 +41,11 @@ namespace code_object
{
namespace hsa
{
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using context_array_t = context::context_array_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using synchronized_user_data_t = common::Synchronized<context_user_data_map_t>;
using context_array_t = context::context_array_t;
struct code_object
{
@@ -59,15 +61,15 @@ struct code_object
code_object& operator=(const code_object&) = delete;
code_object& operator =(code_object&&) noexcept;
bool beg_notified = false;
bool end_notified = false;
std::atomic<bool> beg_notified = false;
std::atomic<bool> end_notified = false;
const std::string* uri = nullptr;
hsa_executable_t hsa_executable = {};
hsa_loaded_code_object_t hsa_code_object = {};
code_object_data_t rocp_data = common::init_public_api_struct(code_object_data_t{});
symbol_array_t symbols = {};
context_array_t contexts = {};
context_user_data_map_t user_data = {};
synchronized_user_data_t user_data = {};
};
struct code_object_unload
@@ -49,14 +49,17 @@ kernel_symbol::operator=(kernel_symbol&& rhs) noexcept
{
if(this != &rhs)
{
beg_notified = rhs.beg_notified;
end_notified = rhs.end_notified;
name = rhs.name;
hsa_executable = rhs.hsa_executable;
hsa_agent = rhs.hsa_agent;
hsa_symbol = rhs.hsa_symbol;
rocp_data = rhs.rocp_data;
user_data = std::move(rhs.user_data);
beg_notified.store(rhs.beg_notified.load());
end_notified.store(rhs.end_notified.load());
name = rhs.name;
hsa_executable = rhs.hsa_executable;
hsa_agent = rhs.hsa_agent;
hsa_symbol = rhs.hsa_symbol;
rocp_data = rhs.rocp_data;
// Manually move user_data by extracting and inserting under locks
rhs.user_data.wlock([this](auto& rhs_map) {
this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); });
});
rocp_data.kernel_name = (name) ? name->c_str() : nullptr;
}
@@ -22,11 +22,13 @@
#pragma once
#include "lib/common/synchronized.hpp"
#include "lib/rocprofiler-sdk/context/context.hpp"
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/hsa.h>
#include <atomic>
#include <cstdint>
#include <string>
#include <unordered_map>
@@ -38,11 +40,11 @@ namespace code_object
{
namespace hsa
{
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using context_array_t = context::context_array_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using context_t = context::context;
using user_data_t = rocprofiler_user_data_t;
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
using synchronized_user_data_t = common::Synchronized<context_user_data_map_t>;
using context_array_t = context::context_array_t;
struct kernel_symbol
{
@@ -58,14 +60,14 @@ struct kernel_symbol
kernel_symbol& operator=(const kernel_symbol&) = delete;
kernel_symbol& operator =(kernel_symbol&&) noexcept;
bool beg_notified = false;
bool end_notified = false;
const std::string* name = nullptr;
hsa_executable_t hsa_executable = {};
hsa_agent_t hsa_agent = {};
hsa_executable_symbol_t hsa_symbol = {};
kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{});
context_user_data_map_t user_data = {};
std::atomic<bool> beg_notified = false;
std::atomic<bool> end_notified = false;
const std::string* name = nullptr;
hsa_executable_t hsa_executable = {};
hsa_agent_t hsa_agent = {};
hsa_executable_symbol_t hsa_symbol = {};
kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{});
synchronized_user_data_t user_data = {};
};
bool
@@ -87,6 +87,7 @@ add_subdirectory(openmp-tools)
add_subdirectory(rocdecode)
add_subdirectory(rocjpeg)
add_subdirectory(hip-host-tracing)
add_subdirectory(code-object-multi-threaded)
# rocpd validation tests
add_subdirectory(rocpd)
@@ -42,3 +42,4 @@ add_subdirectory(hip-streams)
add_subdirectory(hip-streams-per-thread)
add_subdirectory(attachment-test)
add_subdirectory(hip-host)
add_subdirectory(module-loading-test)
@@ -0,0 +1,55 @@
#
# Multi-threaded code object loading test
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
if(NOT CMAKE_HIP_COMPILER)
find_program(
amdclangpp_EXECUTABLE
NAMES amdclang++
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
PATH_SUFFIXES bin llvm/bin NO_CACHE)
mark_as_advanced(amdclangpp_EXECUTABLE)
if(amdclangpp_EXECUTABLE)
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
endif()
endif()
project(rocprofiler-sdk-tests-bin-code-object-multi-threaded LANGUAGES CXX HIP)
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
endif()
endforeach()
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_EXTENSIONS OFF)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
find_package(rocprofiler-sdk REQUIRED)
# Find hiprtc for runtime compilation
find_library(
HIPRTC_LIBRARY
NAMES hiprtc
HINTS ${ROCM_PATH}/lib ENV ROCM_PATH /opt/rocm/lib)
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
add_executable(code-object-multithread-testapp)
target_sources(code-object-multithread-testapp PRIVATE main.cpp)
target_compile_options(code-object-multithread-testapp PRIVATE -W -Wall -Wextra)
find_package(Threads REQUIRED)
target_link_libraries(code-object-multithread-testapp PRIVATE Threads::Threads)
# Link hiprtc if found, otherwise test will fall back to static kernels
if(HIPRTC_LIBRARY)
target_link_libraries(code-object-multithread-testapp PRIVATE ${HIPRTC_LIBRARY})
endif()
@@ -0,0 +1,253 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/**
* @file tests/bin/code-object-multi-threaded/main.cpp
*
* @brief Multi-threaded code object loading stress test for rocprofiler-sdk
*
* This test verifies thread-safety when multiple threads concurrently load
* HIP modules on different GPUs, which triggers concurrent calls to
* executable_freeze_internal and code object tracing callbacks.
*
* Data races tested:
* - user_data map access (now protected with Synchronized)
* - contexts vector assignment (now protected with wlock)
* - is_shutdown flag (now atomic)
* - end_notified/beg_notified flags (now atomic)
* - executable_destroy serialization (now protected with mutex)
*/
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#include <hip/hiprtc.h>
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <thread>
#include <vector>
#define HIP_CHECK(expr) \
do \
{ \
hipError_t _err = (expr); \
if(_err != hipSuccess) \
{ \
std::cerr << "HIP error " << hipGetErrorString(_err) << " at " << __FILE__ << ":" \
<< __LINE__ << "\n"; \
std::abort(); \
} \
} while(0)
// Simple kernel code as string for runtime compilation/loading
static const char* kernel_code = R"(
extern "C" __global__ void dynamic_kernel_a(int* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < n) data[idx] = idx * 2;
}
extern "C" __global__ void dynamic_kernel_b(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < n) data[idx] = idx * 3.14f;
}
extern "C" __global__ void dynamic_kernel_c(int* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < n) data[idx] = idx * idx;
}
)";
// Static kernels as fallback
__global__ void
test_kernel_a()
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx == 0) printf("Kernel A\n");
}
__global__ void
test_kernel_b(int* data)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
data[idx] = idx * 2;
}
__global__ void
test_kernel_c(float* data, int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx < n) data[idx] = idx * 3.14f;
}
void
gpu_worker(int gpu_id, int iterations)
{
// Set device for this thread
HIP_CHECK(hipSetDevice(gpu_id));
const int n = 1024;
// Each thread loads modules in parallel - this triggers concurrent
// calls to executable_freeze_internal and code object callbacks
for(int i = 0; i < iterations; ++i)
{
// Use hiprtc to compile kernel source at runtime, then load the compiled module
// This triggers parallel module loading across threads
hiprtcProgram prog{};
hiprtcResult rtc_result =
hiprtcCreateProgram(&prog, kernel_code, "dynamic_kernels.cu", 0, nullptr, nullptr);
bool use_dynamic_loading = (rtc_result == HIPRTC_SUCCESS);
if(use_dynamic_loading)
{
// Compile the program
rtc_result = hiprtcCompileProgram(prog, 0, nullptr);
if(rtc_result != HIPRTC_SUCCESS)
{
size_t log_size{0};
hiprtcGetProgramLogSize(prog, &log_size);
if(log_size > 1)
{
auto log = std::vector<char>(log_size);
hiprtcGetProgramLog(prog, log.data());
std::cerr << "Compilation failed:\n" << log.data() << "\n";
}
use_dynamic_loading = false;
}
}
if(use_dynamic_loading)
{
// Get the compiled code
size_t code_size{0};
hiprtcGetCodeSize(prog, &code_size);
std::vector<char> code(code_size);
hiprtcGetCode(prog, code.data());
hiprtcDestroyProgram(&prog);
// Load the compiled module - THIS is what triggers the parallel code object loading
hipModule_t module;
hipError_t err = hipModuleLoadData(&module, code.data());
if(err == hipSuccess)
{
// Get functions from the module
hipFunction_t func_a, func_b, func_c;
HIP_CHECK(hipModuleGetFunction(&func_a, module, "dynamic_kernel_a"));
HIP_CHECK(hipModuleGetFunction(&func_b, module, "dynamic_kernel_b"));
HIP_CHECK(hipModuleGetFunction(&func_c, module, "dynamic_kernel_c"));
// Allocate device memory
int* d_int_data{nullptr};
float* d_float_data{nullptr};
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
// Launch kernels using module functions
int n_arg = n;
void* args_a[] = {&d_int_data, &n_arg};
HIP_CHECK(hipModuleLaunchKernel(
func_a, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_a, nullptr));
void* args_b[] = {&d_float_data, &n_arg};
HIP_CHECK(hipModuleLaunchKernel(
func_b, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_b, nullptr));
void* args_c[] = {&d_int_data, &n_arg};
HIP_CHECK(hipModuleLaunchKernel(
func_c, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_c, nullptr));
}
else
{
use_dynamic_loading = false;
}
}
if(!use_dynamic_loading)
{
// Fallback to regular kernel launches if module loading fails
int* d_int_data{nullptr};
float* d_float_data{nullptr};
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
test_kernel_a<<<1, 64>>>();
HIP_CHECK(hipGetLastError());
test_kernel_b<<<(n + 255) / 256, 256>>>(d_int_data);
HIP_CHECK(hipGetLastError());
test_kernel_c<<<(n + 255) / 256, 256>>>(d_float_data, n);
HIP_CHECK(hipGetLastError());
}
}
}
int
main()
{
std::cout << "Multi-Threaded Code Object Loading Test\n";
int num_gpus = 0;
HIP_CHECK(hipGetDeviceCount(&num_gpus));
if(num_gpus == 0)
{
std::cerr << "No GPUs found. Test requires at least one GPU.\n";
return 1;
}
std::cout << "Found " << num_gpus << " GPU(s)\n";
int num_threads =
std::thread::hardware_concurrency(); // More threads than GPUs to increase contention
int threads_per_gpu = num_threads / num_gpus;
std::cout << "Launching " << num_threads << " threads\n";
constexpr int iterations = 3;
// Create worker threads
auto threads = std::vector<std::thread>{};
threads.reserve(num_threads);
for(int gpu_id = 0; gpu_id < num_gpus; ++gpu_id)
{
for(int thread_id = 0; thread_id < threads_per_gpu; ++thread_id)
{
threads.emplace_back(gpu_worker, gpu_id, iterations);
}
}
// Wait for all threads to complete
for(auto& t : threads)
{
t.join();
}
std::cout << "Test completed successfully\n";
return 0;
}
@@ -0,0 +1,38 @@
#
# Multi-threaded code object loading test
#
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
project(rocprofiler-sdk-tests-code-object-multi-threaded LANGUAGES CXX)
find_package(rocprofiler-sdk REQUIRED)
# Build the client tool library
add_library(code-object-multithread-client SHARED)
target_sources(code-object-multithread-client PRIVATE client.cpp)
target_link_libraries(code-object-multithread-client
PRIVATE rocprofiler-sdk::rocprofiler-sdk)
if(ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE)
set(PRELOAD_ENV
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE}:$<TARGET_FILE:code-object-multithread-client>"
)
else()
set(PRELOAD_ENV "$<TARGET_FILE:code-object-multithread-client>")
endif()
# Register the test with CTest
rocprofiler_add_integration_execute_test(
test-code-object-multithread
COMMAND $<TARGET_FILE:code-object-multithread-testapp>
DEPENDS code-object-multithread-client
TIMEOUT 120
LABELS "integration-tests"
PRELOAD "${PRELOAD_ENV}"
ENVIRONMENT
"LD_LIBRARY_PATH=$<TARGET_FILE_DIR:rocprofiler-sdk::rocprofiler-sdk-shared-library>:$ENV{LD_LIBRARY_PATH}"
PASS_REGULAR_EXPRESSION
"Test PASSED: Successfully traced multi-threaded code object loading!"
FAIL_REGULAR_EXPRESSION
"${ROCPROFILER_DEFAULT_FAIL_REGEX}|SEGFAULT|Failed to \.*|Context is not valid|ERROR: \.*"
)
@@ -0,0 +1,202 @@
// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
/**
* @file tests/bin/code-object-multi-threaded/client.cpp
*
* @brief ROCProfiler tool that tracks code object operations
*
* This tool uses rocprofiler-sdk callback tracing to monitor code object
* load/unload operations during multi-threaded execution, verifying that
* the thread-safety fixes work correctly.
*/
#include <rocprofiler-sdk/callback_tracing.h>
#include <rocprofiler-sdk/fwd.h>
#include <rocprofiler-sdk/registration.h>
#include <rocprofiler-sdk/rocprofiler.h>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <mutex>
#include <unordered_map>
namespace
{
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
// Thread-safe counters
std::atomic<uint64_t> g_code_object_load_count{0};
std::atomic<uint64_t> g_code_object_unload_count{0};
std::atomic<uint64_t> g_kernel_symbol_load_count{0};
std::atomic<uint64_t> g_kernel_symbol_unload_count{0};
// Track which threads invoke callbacks
std::mutex g_thread_mutex;
std::unordered_map<uint64_t, uint64_t> g_thread_callback_counts;
rocprofiler_client_id_t* client_id = nullptr;
rocprofiler_client_finalize_t client_fini_func = nullptr;
rocprofiler_context_id_t client_ctx = {};
void
codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
rocprofiler_user_data_t* /* user_data */,
void* /* callback_data */)
{
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
// Track thread activity
{
std::lock_guard<std::mutex> lock(g_thread_mutex);
g_thread_callback_counts[record.thread_id]++;
}
if(record.operation == ROCPROFILER_CODE_OBJECT_LOAD)
{
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
g_code_object_load_count.fetch_add(1, std::memory_order_relaxed);
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
g_code_object_unload_count.fetch_add(1, std::memory_order_relaxed);
}
}
else if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
{
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
{
g_kernel_symbol_load_count.fetch_add(1, std::memory_order_relaxed);
}
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
{
g_kernel_symbol_unload_count.fetch_add(1, std::memory_order_relaxed);
}
}
}
void
tool_fini(void* /* tool_data */)
{
std::cout << "\n=== ROCProfiler Multi-Threaded Code Object Test Results ===\n";
std::cout << "Code objects loaded: " << g_code_object_load_count.load() << "\n";
std::cout << "Code objects unloaded: " << g_code_object_unload_count.load() << "\n";
std::cout << "Kernel symbols loaded: " << g_kernel_symbol_load_count.load() << "\n";
std::cout << "Kernel symbols unloaded: " << g_kernel_symbol_unload_count.load() << "\n";
size_t num_threads_with_callbacks = 0;
{
std::lock_guard<std::mutex> lock(g_thread_mutex);
num_threads_with_callbacks = g_thread_callback_counts.size();
std::cout << "Threads that invoked callbacks: " << num_threads_with_callbacks << "\n";
}
std::cout << "===========================================================\n";
// Verify we actually traced something
if(g_code_object_load_count.load() == 0)
{
std::cerr << "ERROR: No code objects were traced!\n";
std::abort();
}
if(g_kernel_symbol_load_count.load() == 0)
{
std::cerr << "ERROR: No kernel symbols were traced!\n";
std::abort();
}
// Verify multi-threaded execution
if(num_threads_with_callbacks < 2)
{
std::cerr << "ERROR: Expected callbacks from multiple threads, got "
<< num_threads_with_callbacks << "\n";
std::abort();
}
std::cout << "Test PASSED: Successfully traced multi-threaded code object loading!\n";
}
int
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
{
client_fini_func = fini_func;
auto status = rocprofiler_create_context(&client_ctx);
if(status != ROCPROFILER_STATUS_SUCCESS)
{
std::cerr << "Failed to create context\n";
return -1;
}
status =
rocprofiler_configure_callback_tracing_service(client_ctx,
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
nullptr,
0,
codeobj_tracing_callback,
tool_data);
if(status != ROCPROFILER_STATUS_SUCCESS)
{
std::cerr << "Failed to configure code object tracing\n";
return -1;
}
int valid_ctx = 0;
status = rocprofiler_context_is_valid(client_ctx, &valid_ctx);
if(status != ROCPROFILER_STATUS_SUCCESS || valid_ctx == 0)
{
std::cerr << "Context is not valid\n";
return -1;
}
status = rocprofiler_start_context(client_ctx);
if(status != ROCPROFILER_STATUS_SUCCESS)
{
std::cerr << "Failed to start context\n";
return -1;
}
std::cout << "ROCProfiler multi-threaded code object tool initialized\n";
return 0;
}
} // namespace
extern "C" rocprofiler_tool_configure_result_t*
rocprofiler_configure(uint32_t /* version */,
const char* /* runtime_version */,
uint32_t /* priority */,
rocprofiler_client_id_t* id)
{
id->name = "CodeObjectMultiThreadedClient";
client_id = id;
static auto cfg = rocprofiler_tool_configure_result_t{
sizeof(rocprofiler_tool_configure_result_t), &tool_init, &tool_fini, nullptr};
return &cfg;
}