diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp index 94eea074a9..475508379f 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/code_object.cpp @@ -378,7 +378,14 @@ using code_object_unload_array_t = std::vector; std::vector shutdown(hsa_executable_t executable); -bool is_shutdown = false; +std::atomic is_shutdown{false}; + +auto& +get_destroy_mutex() +{ + static auto _v = std::mutex{}; + return _v; +} auto* get_executables() @@ -733,7 +740,8 @@ get_unloaded_code_objects(hsa_executable_t executable) { auto _unloaded = std::vector{}; - if(!is_shutdown && get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects) + if(!is_shutdown.load(std::memory_order_acquire) && + get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects) get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects( executable, code_object_unload_callback, &_unloaded); @@ -837,7 +845,7 @@ executable_freeze_internal(hsa_executable_t executable) if(!ctxs.empty()) { - code_obj_vec->rlock([](const code_object_array_t& data) { + code_obj_vec->wlock([](code_object_array_t& data) { auto tidx = common::get_tid(); // set the contexts for each code object for(const auto& ditr : data) @@ -864,8 +872,10 @@ executable_freeze_internal(hsa_executable_t executable) // invoke callback auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); - auto& user_data = ditr->user_data[citr]; - cb_data.callback(record, &user_data, cb_data.data); + ditr->user_data.wlock([&](auto& user_data_map) { + auto& user_data = user_data_map[citr]; + cb_data.callback(record, &user_data, cb_data.data); + }); } } @@ -889,52 +899,57 @@ executable_freeze_internal(hsa_executable_t executable) // invoke callback auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); - auto& user_data = sitr->user_data[citr]; - cb_data.callback(record, &user_data, cb_data.data); + sitr->user_data.wlock([&](auto& user_data_map) { + auto& user_data = user_data_map[citr]; + cb_data.callback(record, &user_data, cb_data.data); - std::string device_name = - CHECK_NOTNULL(get_hip_register_data()) - ->rlock([sym_data]( + std::string device_name = + CHECK_NOTNULL(get_hip_register_data()) + ->rlock( + [sym_data]( const hip::hip_register_data& register_data) { - const auto& sym_map = - register_data.kernel_symbol_device_map; - const auto it = sym_map.find(*CHECK_NOTNULL( - common::get_string_entry(sym_data.kernel_name))); - if(it != sym_map.end()) return it->second; - return std::string(); - }); - // Does not have a host function, skip - if(device_name.empty()) continue; - auto host_data = - CHECK_NOTNULL(get_hip_register_data()) - ->rlock([device_name]( - const hip::hip_register_data& register_data) { - // Add check for out of range here - const auto it = - register_data.host_function_map.find(device_name); - if(it == register_data.host_function_map.end()) - { - return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{}; - } - return it->second; - }); - // when kernel_symbol_device_map kernels are not present in - // host_function_map, skip. - if(host_data.device_function == nullptr) continue; - host_data.code_object_id = sym_data.code_object_id; - host_data.kernel_id = sym_data.kernel_id; - host_data.host_function_id = ++get_host_function_id(); - auto hip_record = rocprofiler_callback_tracing_record_t{ - .context_id = rocprofiler_context_id_t{citr->context_idx}, - .thread_id = tidx, - .correlation_id = rocprofiler_correlation_id_t{}, - .kind = CODE_OBJECT_KIND, - .operation = CODE_OBJECT_HOST_SYMBOL, - .phase = ROCPROFILER_CALLBACK_PHASE_LOAD, - .payload = static_cast(&host_data)}; + const auto& sym_map = + register_data.kernel_symbol_device_map; + const auto it = sym_map.find( + *CHECK_NOTNULL(common::get_string_entry( + sym_data.kernel_name))); + if(it != sym_map.end()) return it->second; + return std::string(); + }); + // Does not have a host function, skip + if(device_name.empty()) return; + auto host_data = + CHECK_NOTNULL(get_hip_register_data()) + ->rlock([device_name](const hip::hip_register_data& + register_data) { + // Add check for out of range here + const auto it = + register_data.host_function_map.find( + device_name); + if(it == register_data.host_function_map.end()) + { + return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{}; + } + return it->second; + }); + // when kernel_symbol_device_map kernels are not present in + // host_function_map, skip. + if(host_data.device_function == nullptr) return; + host_data.code_object_id = sym_data.code_object_id; + host_data.kernel_id = sym_data.kernel_id; + host_data.host_function_id = ++get_host_function_id(); + auto hip_record = rocprofiler_callback_tracing_record_t{ + .context_id = rocprofiler_context_id_t{citr->context_idx}, + .thread_id = tidx, + .correlation_id = rocprofiler_correlation_id_t{}, + .kind = CODE_OBJECT_KIND, + .operation = CODE_OBJECT_HOST_SYMBOL, + .phase = ROCPROFILER_CALLBACK_PHASE_LOAD, + .payload = static_cast(&host_data)}; - // invoke callback - cb_data.callback(hip_record, &user_data, cb_data.data); + // invoke callback + cb_data.callback(hip_record, &user_data, cb_data.data); + }); } } } @@ -964,7 +979,13 @@ executable_freeze(hsa_executable_t executable, const char* options) hsa_status_t executable_destroy(hsa_executable_t executable) { - if(is_shutdown) return HSA_STATUS_SUCCESS; + // Serialize all executable_destroy calls to prevent: + // 1. Concurrent access to code objects in shutdown() + // 2. Use-after-free when multiple threads destroy same executable + // 3. Race on end_notified flags (now atomic, but still need serialization for callbacks) + auto _lk = std::unique_lock{get_destroy_mutex()}; + + if(is_shutdown.load(std::memory_order_acquire)) return HSA_STATUS_SUCCESS; auto _unloaded = shutdown(executable); @@ -1098,9 +1119,11 @@ shutdown(hsa_executable_t executable) .payload = static_cast(&itr.object->rocp_data)}; // invoke callback - auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); - auto& user_data = itr.object->user_data.at(citr); - cb_data.callback(record, &user_data, cb_data.data); + auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + itr.object->user_data.wlock([&](auto& user_data_map) { + auto& user_data = user_data_map.at(citr); + cb_data.callback(record, &user_data, cb_data.data); + }); } } @@ -1123,9 +1146,11 @@ shutdown(hsa_executable_t executable) .payload = static_cast(&sitr->rocp_data)}; // invoke callback - auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); - auto& user_data = sitr->user_data.at(citr); - cb_data.callback(record, &user_data, cb_data.data); + auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND); + sitr->user_data.wlock([&](auto& user_data_map) { + auto& user_data = user_data_map.at(citr); + cb_data.callback(record, &user_data, cb_data.data); + }); } } } @@ -1226,7 +1251,8 @@ get_kernel_id(uint64_t kernel_object) void finalize() { - if(is_shutdown || !get_executables() || !get_code_objects()) return; + if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects()) + return; CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) { auto tmp = edata; @@ -1237,13 +1263,14 @@ finalize() CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); }); - is_shutdown = true; + is_shutdown.store(true, std::memory_order_release); } void iterate_loaded_code_objects(code_object_iterator_t&& func) { - if(is_shutdown || !get_executables() || !get_code_objects()) return; + if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects()) + return; CHECK_NOTNULL(get_code_objects()) ->rlock( [](const code_object_array_t& data, code_object_iterator_t&& func_v) { diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.cpp index 2c32e5d0c6..2f6bf89601 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.cpp @@ -49,15 +49,18 @@ code_object::operator=(code_object&& rhs) noexcept { if(this != &rhs) { - beg_notified = rhs.beg_notified; - end_notified = rhs.end_notified; + beg_notified.store(rhs.beg_notified.load()); + end_notified.store(rhs.end_notified.load()); uri = rhs.uri; hsa_executable = rhs.hsa_executable; hsa_code_object = rhs.hsa_code_object; rocp_data = rhs.rocp_data; - user_data = std::move(rhs.user_data); - rocp_data.uri = (uri) ? uri->c_str() : nullptr; - symbols = std::move(rhs.symbols); + // Manually move user_data by extracting and inserting under locks + rhs.user_data.wlock([this](auto& rhs_map) { + this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); }); + }); + rocp_data.uri = (uri) ? uri->c_str() : nullptr; + symbols = std::move(rhs.symbols); } return *this; diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.hpp index f41c6573f9..7e7193e015 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/code_object.hpp @@ -22,12 +22,14 @@ #pragma once +#include "lib/common/synchronized.hpp" #include "lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include #include +#include #include #include #include @@ -39,11 +41,11 @@ namespace code_object { namespace hsa { -using context_t = context::context; -using user_data_t = rocprofiler_user_data_t; -using context_user_data_map_t = std::unordered_map; -using context_array_t = context::context_array_t; -using context_user_data_map_t = std::unordered_map; +using context_t = context::context; +using user_data_t = rocprofiler_user_data_t; +using context_user_data_map_t = std::unordered_map; +using synchronized_user_data_t = common::Synchronized; +using context_array_t = context::context_array_t; struct code_object { @@ -59,15 +61,15 @@ struct code_object code_object& operator=(const code_object&) = delete; code_object& operator =(code_object&&) noexcept; - bool beg_notified = false; - bool end_notified = false; + std::atomic beg_notified = false; + std::atomic end_notified = false; const std::string* uri = nullptr; hsa_executable_t hsa_executable = {}; hsa_loaded_code_object_t hsa_code_object = {}; code_object_data_t rocp_data = common::init_public_api_struct(code_object_data_t{}); symbol_array_t symbols = {}; context_array_t contexts = {}; - context_user_data_map_t user_data = {}; + synchronized_user_data_t user_data = {}; }; struct code_object_unload diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.cpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.cpp index 6d28ed6419..fc9d78a8e0 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.cpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.cpp @@ -49,14 +49,17 @@ kernel_symbol::operator=(kernel_symbol&& rhs) noexcept { if(this != &rhs) { - beg_notified = rhs.beg_notified; - end_notified = rhs.end_notified; - name = rhs.name; - hsa_executable = rhs.hsa_executable; - hsa_agent = rhs.hsa_agent; - hsa_symbol = rhs.hsa_symbol; - rocp_data = rhs.rocp_data; - user_data = std::move(rhs.user_data); + beg_notified.store(rhs.beg_notified.load()); + end_notified.store(rhs.end_notified.load()); + name = rhs.name; + hsa_executable = rhs.hsa_executable; + hsa_agent = rhs.hsa_agent; + hsa_symbol = rhs.hsa_symbol; + rocp_data = rhs.rocp_data; + // Manually move user_data by extracting and inserting under locks + rhs.user_data.wlock([this](auto& rhs_map) { + this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); }); + }); rocp_data.kernel_name = (name) ? name->c_str() : nullptr; } diff --git a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp index d18d980962..3afdb724fb 100644 --- a/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp +++ b/projects/rocprofiler-sdk/source/lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp @@ -22,11 +22,13 @@ #pragma once +#include "lib/common/synchronized.hpp" #include "lib/rocprofiler-sdk/context/context.hpp" #include #include +#include #include #include #include @@ -38,11 +40,11 @@ namespace code_object { namespace hsa { -using context_t = context::context; -using user_data_t = rocprofiler_user_data_t; -using context_user_data_map_t = std::unordered_map; -using context_array_t = context::context_array_t; -using context_user_data_map_t = std::unordered_map; +using context_t = context::context; +using user_data_t = rocprofiler_user_data_t; +using context_user_data_map_t = std::unordered_map; +using synchronized_user_data_t = common::Synchronized; +using context_array_t = context::context_array_t; struct kernel_symbol { @@ -58,14 +60,14 @@ struct kernel_symbol kernel_symbol& operator=(const kernel_symbol&) = delete; kernel_symbol& operator =(kernel_symbol&&) noexcept; - bool beg_notified = false; - bool end_notified = false; - const std::string* name = nullptr; - hsa_executable_t hsa_executable = {}; - hsa_agent_t hsa_agent = {}; - hsa_executable_symbol_t hsa_symbol = {}; - kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{}); - context_user_data_map_t user_data = {}; + std::atomic beg_notified = false; + std::atomic end_notified = false; + const std::string* name = nullptr; + hsa_executable_t hsa_executable = {}; + hsa_agent_t hsa_agent = {}; + hsa_executable_symbol_t hsa_symbol = {}; + kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{}); + synchronized_user_data_t user_data = {}; }; bool diff --git a/projects/rocprofiler-sdk/tests/CMakeLists.txt b/projects/rocprofiler-sdk/tests/CMakeLists.txt index 77403292e4..571ca7cfe5 100644 --- a/projects/rocprofiler-sdk/tests/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/CMakeLists.txt @@ -87,6 +87,7 @@ add_subdirectory(openmp-tools) add_subdirectory(rocdecode) add_subdirectory(rocjpeg) add_subdirectory(hip-host-tracing) +add_subdirectory(code-object-multi-threaded) # rocpd validation tests add_subdirectory(rocpd) diff --git a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt index d817778bbe..9bc9fa7d05 100644 --- a/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt +++ b/projects/rocprofiler-sdk/tests/bin/CMakeLists.txt @@ -42,3 +42,4 @@ add_subdirectory(hip-streams) add_subdirectory(hip-streams-per-thread) add_subdirectory(attachment-test) add_subdirectory(hip-host) +add_subdirectory(module-loading-test) diff --git a/projects/rocprofiler-sdk/tests/bin/module-loading-test/CMakeLists.txt b/projects/rocprofiler-sdk/tests/bin/module-loading-test/CMakeLists.txt new file mode 100644 index 0000000000..ec119740c7 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/module-loading-test/CMakeLists.txt @@ -0,0 +1,55 @@ +# +# Multi-threaded code object loading test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +if(NOT CMAKE_HIP_COMPILER) + find_program( + amdclangpp_EXECUTABLE + NAMES amdclang++ + HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm + PATH_SUFFIXES bin llvm/bin NO_CACHE) + mark_as_advanced(amdclangpp_EXECUTABLE) + + if(amdclangpp_EXECUTABLE) + set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}") + endif() +endif() + +project(rocprofiler-sdk-tests-bin-code-object-multi-threaded LANGUAGES CXX HIP) + +foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO) + if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "") + set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}") + endif() +endforeach() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_HIP_STANDARD 17) +set(CMAKE_HIP_EXTENSIONS OFF) +set(CMAKE_HIP_STANDARD_REQUIRED ON) + +find_package(rocprofiler-sdk REQUIRED) + +# Find hiprtc for runtime compilation +find_library( + HIPRTC_LIBRARY + NAMES hiprtc + HINTS ${ROCM_PATH}/lib ENV ROCM_PATH /opt/rocm/lib) + +set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP) + +add_executable(code-object-multithread-testapp) +target_sources(code-object-multithread-testapp PRIVATE main.cpp) +target_compile_options(code-object-multithread-testapp PRIVATE -W -Wall -Wextra) + +find_package(Threads REQUIRED) +target_link_libraries(code-object-multithread-testapp PRIVATE Threads::Threads) + +# Link hiprtc if found, otherwise test will fall back to static kernels +if(HIPRTC_LIBRARY) + target_link_libraries(code-object-multithread-testapp PRIVATE ${HIPRTC_LIBRARY}) +endif() diff --git a/projects/rocprofiler-sdk/tests/bin/module-loading-test/main.cpp b/projects/rocprofiler-sdk/tests/bin/module-loading-test/main.cpp new file mode 100644 index 0000000000..6bb650fa01 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/bin/module-loading-test/main.cpp @@ -0,0 +1,253 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/** + * @file tests/bin/code-object-multi-threaded/main.cpp + * + * @brief Multi-threaded code object loading stress test for rocprofiler-sdk + * + * This test verifies thread-safety when multiple threads concurrently load + * HIP modules on different GPUs, which triggers concurrent calls to + * executable_freeze_internal and code object tracing callbacks. + * + * Data races tested: + * - user_data map access (now protected with Synchronized) + * - contexts vector assignment (now protected with wlock) + * - is_shutdown flag (now atomic) + * - end_notified/beg_notified flags (now atomic) + * - executable_destroy serialization (now protected with mutex) + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define HIP_CHECK(expr) \ + do \ + { \ + hipError_t _err = (expr); \ + if(_err != hipSuccess) \ + { \ + std::cerr << "HIP error " << hipGetErrorString(_err) << " at " << __FILE__ << ":" \ + << __LINE__ << "\n"; \ + std::abort(); \ + } \ + } while(0) + +// Simple kernel code as string for runtime compilation/loading +static const char* kernel_code = R"( +extern "C" __global__ void dynamic_kernel_a(int* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx < n) data[idx] = idx * 2; +} + +extern "C" __global__ void dynamic_kernel_b(float* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx < n) data[idx] = idx * 3.14f; +} + +extern "C" __global__ void dynamic_kernel_c(int* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx < n) data[idx] = idx * idx; +} +)"; + +// Static kernels as fallback +__global__ void +test_kernel_a() +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx == 0) printf("Kernel A\n"); +} + +__global__ void +test_kernel_b(int* data) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + data[idx] = idx * 2; +} + +__global__ void +test_kernel_c(float* data, int n) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx < n) data[idx] = idx * 3.14f; +} + +void +gpu_worker(int gpu_id, int iterations) +{ + // Set device for this thread + HIP_CHECK(hipSetDevice(gpu_id)); + + const int n = 1024; + + // Each thread loads modules in parallel - this triggers concurrent + // calls to executable_freeze_internal and code object callbacks + for(int i = 0; i < iterations; ++i) + { + // Use hiprtc to compile kernel source at runtime, then load the compiled module + // This triggers parallel module loading across threads + hiprtcProgram prog{}; + hiprtcResult rtc_result = + hiprtcCreateProgram(&prog, kernel_code, "dynamic_kernels.cu", 0, nullptr, nullptr); + + bool use_dynamic_loading = (rtc_result == HIPRTC_SUCCESS); + + if(use_dynamic_loading) + { + // Compile the program + rtc_result = hiprtcCompileProgram(prog, 0, nullptr); + + if(rtc_result != HIPRTC_SUCCESS) + { + size_t log_size{0}; + hiprtcGetProgramLogSize(prog, &log_size); + if(log_size > 1) + { + auto log = std::vector(log_size); + hiprtcGetProgramLog(prog, log.data()); + std::cerr << "Compilation failed:\n" << log.data() << "\n"; + } + use_dynamic_loading = false; + } + } + + if(use_dynamic_loading) + { + // Get the compiled code + size_t code_size{0}; + hiprtcGetCodeSize(prog, &code_size); + + std::vector code(code_size); + hiprtcGetCode(prog, code.data()); + hiprtcDestroyProgram(&prog); + + // Load the compiled module - THIS is what triggers the parallel code object loading + hipModule_t module; + hipError_t err = hipModuleLoadData(&module, code.data()); + + if(err == hipSuccess) + { + // Get functions from the module + hipFunction_t func_a, func_b, func_c; + HIP_CHECK(hipModuleGetFunction(&func_a, module, "dynamic_kernel_a")); + HIP_CHECK(hipModuleGetFunction(&func_b, module, "dynamic_kernel_b")); + HIP_CHECK(hipModuleGetFunction(&func_c, module, "dynamic_kernel_c")); + + // Allocate device memory + int* d_int_data{nullptr}; + float* d_float_data{nullptr}; + HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int))); + HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float))); + + // Launch kernels using module functions + int n_arg = n; + void* args_a[] = {&d_int_data, &n_arg}; + HIP_CHECK(hipModuleLaunchKernel( + func_a, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_a, nullptr)); + + void* args_b[] = {&d_float_data, &n_arg}; + HIP_CHECK(hipModuleLaunchKernel( + func_b, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_b, nullptr)); + + void* args_c[] = {&d_int_data, &n_arg}; + HIP_CHECK(hipModuleLaunchKernel( + func_c, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_c, nullptr)); + } + else + { + use_dynamic_loading = false; + } + } + + if(!use_dynamic_loading) + { + // Fallback to regular kernel launches if module loading fails + int* d_int_data{nullptr}; + float* d_float_data{nullptr}; + HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int))); + HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float))); + + test_kernel_a<<<1, 64>>>(); + HIP_CHECK(hipGetLastError()); + + test_kernel_b<<<(n + 255) / 256, 256>>>(d_int_data); + HIP_CHECK(hipGetLastError()); + + test_kernel_c<<<(n + 255) / 256, 256>>>(d_float_data, n); + HIP_CHECK(hipGetLastError()); + } + } +} + +int +main() +{ + std::cout << "Multi-Threaded Code Object Loading Test\n"; + + int num_gpus = 0; + HIP_CHECK(hipGetDeviceCount(&num_gpus)); + + if(num_gpus == 0) + { + std::cerr << "No GPUs found. Test requires at least one GPU.\n"; + return 1; + } + + std::cout << "Found " << num_gpus << " GPU(s)\n"; + + int num_threads = + std::thread::hardware_concurrency(); // More threads than GPUs to increase contention + int threads_per_gpu = num_threads / num_gpus; + std::cout << "Launching " << num_threads << " threads\n"; + + constexpr int iterations = 3; + + // Create worker threads + auto threads = std::vector{}; + threads.reserve(num_threads); + + for(int gpu_id = 0; gpu_id < num_gpus; ++gpu_id) + { + for(int thread_id = 0; thread_id < threads_per_gpu; ++thread_id) + { + threads.emplace_back(gpu_worker, gpu_id, iterations); + } + } + + // Wait for all threads to complete + for(auto& t : threads) + { + t.join(); + } + + std::cout << "Test completed successfully\n"; + return 0; +} diff --git a/projects/rocprofiler-sdk/tests/code-object-multi-threaded/CMakeLists.txt b/projects/rocprofiler-sdk/tests/code-object-multi-threaded/CMakeLists.txt new file mode 100644 index 0000000000..ea4f231f96 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/code-object-multi-threaded/CMakeLists.txt @@ -0,0 +1,38 @@ +# +# Multi-threaded code object loading test +# +cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR) + +project(rocprofiler-sdk-tests-code-object-multi-threaded LANGUAGES CXX) + +find_package(rocprofiler-sdk REQUIRED) + +# Build the client tool library +add_library(code-object-multithread-client SHARED) +target_sources(code-object-multithread-client PRIVATE client.cpp) +target_link_libraries(code-object-multithread-client + PRIVATE rocprofiler-sdk::rocprofiler-sdk) + +if(ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE) + set(PRELOAD_ENV + "${ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE}:$" + ) +else() + set(PRELOAD_ENV "$") +endif() + +# Register the test with CTest +rocprofiler_add_integration_execute_test( + test-code-object-multithread + COMMAND $ + DEPENDS code-object-multithread-client + TIMEOUT 120 + LABELS "integration-tests" + PRELOAD "${PRELOAD_ENV}" + ENVIRONMENT + "LD_LIBRARY_PATH=$:$ENV{LD_LIBRARY_PATH}" + PASS_REGULAR_EXPRESSION + "Test PASSED: Successfully traced multi-threaded code object loading!" + FAIL_REGULAR_EXPRESSION + "${ROCPROFILER_DEFAULT_FAIL_REGEX}|SEGFAULT|Failed to \.*|Context is not valid|ERROR: \.*" + ) diff --git a/projects/rocprofiler-sdk/tests/code-object-multi-threaded/client.cpp b/projects/rocprofiler-sdk/tests/code-object-multi-threaded/client.cpp new file mode 100644 index 0000000000..4493d11381 --- /dev/null +++ b/projects/rocprofiler-sdk/tests/code-object-multi-threaded/client.cpp @@ -0,0 +1,202 @@ +// MIT License +// +// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +/** + * @file tests/bin/code-object-multi-threaded/client.cpp + * + * @brief ROCProfiler tool that tracks code object operations + * + * This tool uses rocprofiler-sdk callback tracing to monitor code object + * load/unload operations during multi-threaded execution, verifying that + * the thread-safety fixes work correctly. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace +{ +using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t; +using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + +// Thread-safe counters +std::atomic g_code_object_load_count{0}; +std::atomic g_code_object_unload_count{0}; +std::atomic g_kernel_symbol_load_count{0}; +std::atomic g_kernel_symbol_unload_count{0}; + +// Track which threads invoke callbacks +std::mutex g_thread_mutex; +std::unordered_map g_thread_callback_counts; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_client_finalize_t client_fini_func = nullptr; +rocprofiler_context_id_t client_ctx = {}; + +void +codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* /* user_data */, + void* /* callback_data */) +{ + if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return; + + // Track thread activity + { + std::lock_guard lock(g_thread_mutex); + g_thread_callback_counts[record.thread_id]++; + } + + if(record.operation == ROCPROFILER_CODE_OBJECT_LOAD) + { + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + g_code_object_load_count.fetch_add(1, std::memory_order_relaxed); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + g_code_object_unload_count.fetch_add(1, std::memory_order_relaxed); + } + } + else if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + { + if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) + { + g_kernel_symbol_load_count.fetch_add(1, std::memory_order_relaxed); + } + else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) + { + g_kernel_symbol_unload_count.fetch_add(1, std::memory_order_relaxed); + } + } +} + +void +tool_fini(void* /* tool_data */) +{ + std::cout << "\n=== ROCProfiler Multi-Threaded Code Object Test Results ===\n"; + std::cout << "Code objects loaded: " << g_code_object_load_count.load() << "\n"; + std::cout << "Code objects unloaded: " << g_code_object_unload_count.load() << "\n"; + std::cout << "Kernel symbols loaded: " << g_kernel_symbol_load_count.load() << "\n"; + std::cout << "Kernel symbols unloaded: " << g_kernel_symbol_unload_count.load() << "\n"; + + size_t num_threads_with_callbacks = 0; + { + std::lock_guard lock(g_thread_mutex); + num_threads_with_callbacks = g_thread_callback_counts.size(); + std::cout << "Threads that invoked callbacks: " << num_threads_with_callbacks << "\n"; + } + std::cout << "===========================================================\n"; + + // Verify we actually traced something + if(g_code_object_load_count.load() == 0) + { + std::cerr << "ERROR: No code objects were traced!\n"; + std::abort(); + } + + if(g_kernel_symbol_load_count.load() == 0) + { + std::cerr << "ERROR: No kernel symbols were traced!\n"; + std::abort(); + } + + // Verify multi-threaded execution + if(num_threads_with_callbacks < 2) + { + std::cerr << "ERROR: Expected callbacks from multiple threads, got " + << num_threads_with_callbacks << "\n"; + std::abort(); + } + + std::cout << "Test PASSED: Successfully traced multi-threaded code object loading!\n"; +} + +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +{ + client_fini_func = fini_func; + + auto status = rocprofiler_create_context(&client_ctx); + if(status != ROCPROFILER_STATUS_SUCCESS) + { + std::cerr << "Failed to create context\n"; + return -1; + } + + status = + rocprofiler_configure_callback_tracing_service(client_ctx, + ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + nullptr, + 0, + codeobj_tracing_callback, + tool_data); + + if(status != ROCPROFILER_STATUS_SUCCESS) + { + std::cerr << "Failed to configure code object tracing\n"; + return -1; + } + + int valid_ctx = 0; + status = rocprofiler_context_is_valid(client_ctx, &valid_ctx); + if(status != ROCPROFILER_STATUS_SUCCESS || valid_ctx == 0) + { + std::cerr << "Context is not valid\n"; + return -1; + } + + status = rocprofiler_start_context(client_ctx); + if(status != ROCPROFILER_STATUS_SUCCESS) + { + std::cerr << "Failed to start context\n"; + return -1; + } + + std::cout << "ROCProfiler multi-threaded code object tool initialized\n"; + return 0; +} +} // namespace + +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t /* version */, + const char* /* runtime_version */, + uint32_t /* priority */, + rocprofiler_client_id_t* id) +{ + id->name = "CodeObjectMultiThreadedClient"; + client_id = id; + + static auto cfg = rocprofiler_tool_configure_result_t{ + sizeof(rocprofiler_tool_configure_result_t), &tool_init, &tool_fini, nullptr}; + + return &cfg; +}