Fixing Code Object Data Race and Thread Safety & Adding validation test (#2014)
This commit is contained in:
zatwierdzone przez
GitHub
rodzic
5b75ec6a09
commit
ed42157c31
+85
-58
@@ -378,7 +378,14 @@ using code_object_unload_array_t = std::vector<hsa::code_object_unload>;
|
||||
std::vector<hsa::code_object_unload>
|
||||
shutdown(hsa_executable_t executable);
|
||||
|
||||
bool is_shutdown = false;
|
||||
std::atomic<bool> is_shutdown{false};
|
||||
|
||||
auto&
|
||||
get_destroy_mutex()
|
||||
{
|
||||
static auto _v = std::mutex{};
|
||||
return _v;
|
||||
}
|
||||
|
||||
auto*
|
||||
get_executables()
|
||||
@@ -733,7 +740,8 @@ get_unloaded_code_objects(hsa_executable_t executable)
|
||||
{
|
||||
auto _unloaded = std::vector<hsa::code_object_unload>{};
|
||||
|
||||
if(!is_shutdown && get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects)
|
||||
if(!is_shutdown.load(std::memory_order_acquire) &&
|
||||
get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects)
|
||||
get_loader_table().hsa_ven_amd_loader_executable_iterate_loaded_code_objects(
|
||||
executable, code_object_unload_callback, &_unloaded);
|
||||
|
||||
@@ -837,7 +845,7 @@ executable_freeze_internal(hsa_executable_t executable)
|
||||
|
||||
if(!ctxs.empty())
|
||||
{
|
||||
code_obj_vec->rlock([](const code_object_array_t& data) {
|
||||
code_obj_vec->wlock([](code_object_array_t& data) {
|
||||
auto tidx = common::get_tid();
|
||||
// set the contexts for each code object
|
||||
for(const auto& ditr : data)
|
||||
@@ -864,8 +872,10 @@ executable_freeze_internal(hsa_executable_t executable)
|
||||
// invoke callback
|
||||
auto& cb_data =
|
||||
citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
auto& user_data = ditr->user_data[citr];
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
ditr->user_data.wlock([&](auto& user_data_map) {
|
||||
auto& user_data = user_data_map[citr];
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -889,52 +899,57 @@ executable_freeze_internal(hsa_executable_t executable)
|
||||
// invoke callback
|
||||
auto& cb_data =
|
||||
citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
auto& user_data = sitr->user_data[citr];
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
sitr->user_data.wlock([&](auto& user_data_map) {
|
||||
auto& user_data = user_data_map[citr];
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
|
||||
std::string device_name =
|
||||
CHECK_NOTNULL(get_hip_register_data())
|
||||
->rlock([sym_data](
|
||||
std::string device_name =
|
||||
CHECK_NOTNULL(get_hip_register_data())
|
||||
->rlock(
|
||||
[sym_data](
|
||||
const hip::hip_register_data& register_data) {
|
||||
const auto& sym_map =
|
||||
register_data.kernel_symbol_device_map;
|
||||
const auto it = sym_map.find(*CHECK_NOTNULL(
|
||||
common::get_string_entry(sym_data.kernel_name)));
|
||||
if(it != sym_map.end()) return it->second;
|
||||
return std::string();
|
||||
});
|
||||
// Does not have a host function, skip
|
||||
if(device_name.empty()) continue;
|
||||
auto host_data =
|
||||
CHECK_NOTNULL(get_hip_register_data())
|
||||
->rlock([device_name](
|
||||
const hip::hip_register_data& register_data) {
|
||||
// Add check for out of range here
|
||||
const auto it =
|
||||
register_data.host_function_map.find(device_name);
|
||||
if(it == register_data.host_function_map.end())
|
||||
{
|
||||
return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{};
|
||||
}
|
||||
return it->second;
|
||||
});
|
||||
// when kernel_symbol_device_map kernels are not present in
|
||||
// host_function_map, skip.
|
||||
if(host_data.device_function == nullptr) continue;
|
||||
host_data.code_object_id = sym_data.code_object_id;
|
||||
host_data.kernel_id = sym_data.kernel_id;
|
||||
host_data.host_function_id = ++get_host_function_id();
|
||||
auto hip_record = rocprofiler_callback_tracing_record_t{
|
||||
.context_id = rocprofiler_context_id_t{citr->context_idx},
|
||||
.thread_id = tidx,
|
||||
.correlation_id = rocprofiler_correlation_id_t{},
|
||||
.kind = CODE_OBJECT_KIND,
|
||||
.operation = CODE_OBJECT_HOST_SYMBOL,
|
||||
.phase = ROCPROFILER_CALLBACK_PHASE_LOAD,
|
||||
.payload = static_cast<void*>(&host_data)};
|
||||
const auto& sym_map =
|
||||
register_data.kernel_symbol_device_map;
|
||||
const auto it = sym_map.find(
|
||||
*CHECK_NOTNULL(common::get_string_entry(
|
||||
sym_data.kernel_name)));
|
||||
if(it != sym_map.end()) return it->second;
|
||||
return std::string();
|
||||
});
|
||||
// Does not have a host function, skip
|
||||
if(device_name.empty()) return;
|
||||
auto host_data =
|
||||
CHECK_NOTNULL(get_hip_register_data())
|
||||
->rlock([device_name](const hip::hip_register_data&
|
||||
register_data) {
|
||||
// Add check for out of range here
|
||||
const auto it =
|
||||
register_data.host_function_map.find(
|
||||
device_name);
|
||||
if(it == register_data.host_function_map.end())
|
||||
{
|
||||
return rocprofiler_callback_tracing_code_object_host_kernel_symbol_register_data_t{};
|
||||
}
|
||||
return it->second;
|
||||
});
|
||||
// when kernel_symbol_device_map kernels are not present in
|
||||
// host_function_map, skip.
|
||||
if(host_data.device_function == nullptr) return;
|
||||
host_data.code_object_id = sym_data.code_object_id;
|
||||
host_data.kernel_id = sym_data.kernel_id;
|
||||
host_data.host_function_id = ++get_host_function_id();
|
||||
auto hip_record = rocprofiler_callback_tracing_record_t{
|
||||
.context_id = rocprofiler_context_id_t{citr->context_idx},
|
||||
.thread_id = tidx,
|
||||
.correlation_id = rocprofiler_correlation_id_t{},
|
||||
.kind = CODE_OBJECT_KIND,
|
||||
.operation = CODE_OBJECT_HOST_SYMBOL,
|
||||
.phase = ROCPROFILER_CALLBACK_PHASE_LOAD,
|
||||
.payload = static_cast<void*>(&host_data)};
|
||||
|
||||
// invoke callback
|
||||
cb_data.callback(hip_record, &user_data, cb_data.data);
|
||||
// invoke callback
|
||||
cb_data.callback(hip_record, &user_data, cb_data.data);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -964,7 +979,13 @@ executable_freeze(hsa_executable_t executable, const char* options)
|
||||
hsa_status_t
|
||||
executable_destroy(hsa_executable_t executable)
|
||||
{
|
||||
if(is_shutdown) return HSA_STATUS_SUCCESS;
|
||||
// Serialize all executable_destroy calls to prevent:
|
||||
// 1. Concurrent access to code objects in shutdown()
|
||||
// 2. Use-after-free when multiple threads destroy same executable
|
||||
// 3. Race on end_notified flags (now atomic, but still need serialization for callbacks)
|
||||
auto _lk = std::unique_lock{get_destroy_mutex()};
|
||||
|
||||
if(is_shutdown.load(std::memory_order_acquire)) return HSA_STATUS_SUCCESS;
|
||||
|
||||
auto _unloaded = shutdown(executable);
|
||||
|
||||
@@ -1098,9 +1119,11 @@ shutdown(hsa_executable_t executable)
|
||||
.payload = static_cast<void*>(&itr.object->rocp_data)};
|
||||
|
||||
// invoke callback
|
||||
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
auto& user_data = itr.object->user_data.at(citr);
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
itr.object->user_data.wlock([&](auto& user_data_map) {
|
||||
auto& user_data = user_data_map.at(citr);
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1123,9 +1146,11 @@ shutdown(hsa_executable_t executable)
|
||||
.payload = static_cast<void*>(&sitr->rocp_data)};
|
||||
|
||||
// invoke callback
|
||||
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
auto& user_data = sitr->user_data.at(citr);
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
auto& cb_data = citr->callback_tracer->callback_data.at(CODE_OBJECT_KIND);
|
||||
sitr->user_data.wlock([&](auto& user_data_map) {
|
||||
auto& user_data = user_data_map.at(citr);
|
||||
cb_data.callback(record, &user_data, cb_data.data);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1226,7 +1251,8 @@ get_kernel_id(uint64_t kernel_object)
|
||||
void
|
||||
finalize()
|
||||
{
|
||||
if(is_shutdown || !get_executables() || !get_code_objects()) return;
|
||||
if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects())
|
||||
return;
|
||||
|
||||
CHECK_NOTNULL(get_executables())->rlock([](const executable_array_t& edata) {
|
||||
auto tmp = edata;
|
||||
@@ -1237,13 +1263,14 @@ finalize()
|
||||
|
||||
CHECK_NOTNULL(get_code_objects())->wlock([](code_object_array_t& data) { data.clear(); });
|
||||
|
||||
is_shutdown = true;
|
||||
is_shutdown.store(true, std::memory_order_release);
|
||||
}
|
||||
|
||||
void
|
||||
iterate_loaded_code_objects(code_object_iterator_t&& func)
|
||||
{
|
||||
if(is_shutdown || !get_executables() || !get_code_objects()) return;
|
||||
if(is_shutdown.load(std::memory_order_acquire) || !get_executables() || !get_code_objects())
|
||||
return;
|
||||
CHECK_NOTNULL(get_code_objects())
|
||||
->rlock(
|
||||
[](const code_object_array_t& data, code_object_iterator_t&& func_v) {
|
||||
|
||||
+8
-5
@@ -49,15 +49,18 @@ code_object::operator=(code_object&& rhs) noexcept
|
||||
{
|
||||
if(this != &rhs)
|
||||
{
|
||||
beg_notified = rhs.beg_notified;
|
||||
end_notified = rhs.end_notified;
|
||||
beg_notified.store(rhs.beg_notified.load());
|
||||
end_notified.store(rhs.end_notified.load());
|
||||
uri = rhs.uri;
|
||||
hsa_executable = rhs.hsa_executable;
|
||||
hsa_code_object = rhs.hsa_code_object;
|
||||
rocp_data = rhs.rocp_data;
|
||||
user_data = std::move(rhs.user_data);
|
||||
rocp_data.uri = (uri) ? uri->c_str() : nullptr;
|
||||
symbols = std::move(rhs.symbols);
|
||||
// Manually move user_data by extracting and inserting under locks
|
||||
rhs.user_data.wlock([this](auto& rhs_map) {
|
||||
this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); });
|
||||
});
|
||||
rocp_data.uri = (uri) ? uri->c_str() : nullptr;
|
||||
symbols = std::move(rhs.symbols);
|
||||
}
|
||||
|
||||
return *this;
|
||||
|
||||
+10
-8
@@ -22,12 +22,14 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/rocprofiler-sdk/code_object/hsa/kernel_symbol.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@@ -39,11 +41,11 @@ namespace code_object
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using context_array_t = context::context_array_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using synchronized_user_data_t = common::Synchronized<context_user_data_map_t>;
|
||||
using context_array_t = context::context_array_t;
|
||||
|
||||
struct code_object
|
||||
{
|
||||
@@ -59,15 +61,15 @@ struct code_object
|
||||
code_object& operator=(const code_object&) = delete;
|
||||
code_object& operator =(code_object&&) noexcept;
|
||||
|
||||
bool beg_notified = false;
|
||||
bool end_notified = false;
|
||||
std::atomic<bool> beg_notified = false;
|
||||
std::atomic<bool> end_notified = false;
|
||||
const std::string* uri = nullptr;
|
||||
hsa_executable_t hsa_executable = {};
|
||||
hsa_loaded_code_object_t hsa_code_object = {};
|
||||
code_object_data_t rocp_data = common::init_public_api_struct(code_object_data_t{});
|
||||
symbol_array_t symbols = {};
|
||||
context_array_t contexts = {};
|
||||
context_user_data_map_t user_data = {};
|
||||
synchronized_user_data_t user_data = {};
|
||||
};
|
||||
|
||||
struct code_object_unload
|
||||
|
||||
+11
-8
@@ -49,14 +49,17 @@ kernel_symbol::operator=(kernel_symbol&& rhs) noexcept
|
||||
{
|
||||
if(this != &rhs)
|
||||
{
|
||||
beg_notified = rhs.beg_notified;
|
||||
end_notified = rhs.end_notified;
|
||||
name = rhs.name;
|
||||
hsa_executable = rhs.hsa_executable;
|
||||
hsa_agent = rhs.hsa_agent;
|
||||
hsa_symbol = rhs.hsa_symbol;
|
||||
rocp_data = rhs.rocp_data;
|
||||
user_data = std::move(rhs.user_data);
|
||||
beg_notified.store(rhs.beg_notified.load());
|
||||
end_notified.store(rhs.end_notified.load());
|
||||
name = rhs.name;
|
||||
hsa_executable = rhs.hsa_executable;
|
||||
hsa_agent = rhs.hsa_agent;
|
||||
hsa_symbol = rhs.hsa_symbol;
|
||||
rocp_data = rhs.rocp_data;
|
||||
// Manually move user_data by extracting and inserting under locks
|
||||
rhs.user_data.wlock([this](auto& rhs_map) {
|
||||
this->user_data.wlock([&rhs_map](auto& lhs_map) { lhs_map = std::move(rhs_map); });
|
||||
});
|
||||
rocp_data.kernel_name = (name) ? name->c_str() : nullptr;
|
||||
}
|
||||
|
||||
|
||||
+15
-13
@@ -22,11 +22,13 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "lib/common/synchronized.hpp"
|
||||
#include "lib/rocprofiler-sdk/context/context.hpp"
|
||||
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/hsa.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
@@ -38,11 +40,11 @@ namespace code_object
|
||||
{
|
||||
namespace hsa
|
||||
{
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using context_array_t = context::context_array_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using context_t = context::context;
|
||||
using user_data_t = rocprofiler_user_data_t;
|
||||
using context_user_data_map_t = std::unordered_map<const context_t*, user_data_t>;
|
||||
using synchronized_user_data_t = common::Synchronized<context_user_data_map_t>;
|
||||
using context_array_t = context::context_array_t;
|
||||
|
||||
struct kernel_symbol
|
||||
{
|
||||
@@ -58,14 +60,14 @@ struct kernel_symbol
|
||||
kernel_symbol& operator=(const kernel_symbol&) = delete;
|
||||
kernel_symbol& operator =(kernel_symbol&&) noexcept;
|
||||
|
||||
bool beg_notified = false;
|
||||
bool end_notified = false;
|
||||
const std::string* name = nullptr;
|
||||
hsa_executable_t hsa_executable = {};
|
||||
hsa_agent_t hsa_agent = {};
|
||||
hsa_executable_symbol_t hsa_symbol = {};
|
||||
kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{});
|
||||
context_user_data_map_t user_data = {};
|
||||
std::atomic<bool> beg_notified = false;
|
||||
std::atomic<bool> end_notified = false;
|
||||
const std::string* name = nullptr;
|
||||
hsa_executable_t hsa_executable = {};
|
||||
hsa_agent_t hsa_agent = {};
|
||||
hsa_executable_symbol_t hsa_symbol = {};
|
||||
kernel_symbol_data_t rocp_data = common::init_public_api_struct(kernel_symbol_data_t{});
|
||||
synchronized_user_data_t user_data = {};
|
||||
};
|
||||
|
||||
bool
|
||||
|
||||
@@ -87,6 +87,7 @@ add_subdirectory(openmp-tools)
|
||||
add_subdirectory(rocdecode)
|
||||
add_subdirectory(rocjpeg)
|
||||
add_subdirectory(hip-host-tracing)
|
||||
add_subdirectory(code-object-multi-threaded)
|
||||
|
||||
# rocpd validation tests
|
||||
add_subdirectory(rocpd)
|
||||
|
||||
@@ -42,3 +42,4 @@ add_subdirectory(hip-streams)
|
||||
add_subdirectory(hip-streams-per-thread)
|
||||
add_subdirectory(attachment-test)
|
||||
add_subdirectory(hip-host)
|
||||
add_subdirectory(module-loading-test)
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
#
|
||||
# Multi-threaded code object loading test
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
if(NOT CMAKE_HIP_COMPILER)
|
||||
find_program(
|
||||
amdclangpp_EXECUTABLE
|
||||
NAMES amdclang++
|
||||
HINTS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATHS ${ROCM_PATH} ENV ROCM_PATH /opt/rocm
|
||||
PATH_SUFFIXES bin llvm/bin NO_CACHE)
|
||||
mark_as_advanced(amdclangpp_EXECUTABLE)
|
||||
|
||||
if(amdclangpp_EXECUTABLE)
|
||||
set(CMAKE_HIP_COMPILER "${amdclangpp_EXECUTABLE}")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
project(rocprofiler-sdk-tests-bin-code-object-multi-threaded LANGUAGES CXX HIP)
|
||||
|
||||
foreach(_TYPE DEBUG MINSIZEREL RELEASE RELWITHDEBINFO)
|
||||
if("${CMAKE_HIP_FLAGS_${_TYPE}}" STREQUAL "")
|
||||
set(CMAKE_HIP_FLAGS_${_TYPE} "${CMAKE_CXX_FLAGS_${_TYPE}}")
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_HIP_STANDARD 17)
|
||||
set(CMAKE_HIP_EXTENSIONS OFF)
|
||||
set(CMAKE_HIP_STANDARD_REQUIRED ON)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
# Find hiprtc for runtime compilation
|
||||
find_library(
|
||||
HIPRTC_LIBRARY
|
||||
NAMES hiprtc
|
||||
HINTS ${ROCM_PATH}/lib ENV ROCM_PATH /opt/rocm/lib)
|
||||
|
||||
set_source_files_properties(main.cpp PROPERTIES LANGUAGE HIP)
|
||||
|
||||
add_executable(code-object-multithread-testapp)
|
||||
target_sources(code-object-multithread-testapp PRIVATE main.cpp)
|
||||
target_compile_options(code-object-multithread-testapp PRIVATE -W -Wall -Wextra)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
target_link_libraries(code-object-multithread-testapp PRIVATE Threads::Threads)
|
||||
|
||||
# Link hiprtc if found, otherwise test will fall back to static kernels
|
||||
if(HIPRTC_LIBRARY)
|
||||
target_link_libraries(code-object-multithread-testapp PRIVATE ${HIPRTC_LIBRARY})
|
||||
endif()
|
||||
@@ -0,0 +1,253 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
/**
|
||||
* @file tests/bin/code-object-multi-threaded/main.cpp
|
||||
*
|
||||
* @brief Multi-threaded code object loading stress test for rocprofiler-sdk
|
||||
*
|
||||
* This test verifies thread-safety when multiple threads concurrently load
|
||||
* HIP modules on different GPUs, which triggers concurrent calls to
|
||||
* executable_freeze_internal and code object tracing callbacks.
|
||||
*
|
||||
* Data races tested:
|
||||
* - user_data map access (now protected with Synchronized)
|
||||
* - contexts vector assignment (now protected with wlock)
|
||||
* - is_shutdown flag (now atomic)
|
||||
* - end_notified/beg_notified flags (now atomic)
|
||||
* - executable_destroy serialization (now protected with mutex)
|
||||
*/
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hip/hip_runtime_api.h>
|
||||
#include <hip/hiprtc.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
#define HIP_CHECK(expr) \
|
||||
do \
|
||||
{ \
|
||||
hipError_t _err = (expr); \
|
||||
if(_err != hipSuccess) \
|
||||
{ \
|
||||
std::cerr << "HIP error " << hipGetErrorString(_err) << " at " << __FILE__ << ":" \
|
||||
<< __LINE__ << "\n"; \
|
||||
std::abort(); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
// Simple kernel code as string for runtime compilation/loading
|
||||
static const char* kernel_code = R"(
|
||||
extern "C" __global__ void dynamic_kernel_a(int* data, int n) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx < n) data[idx] = idx * 2;
|
||||
}
|
||||
|
||||
extern "C" __global__ void dynamic_kernel_b(float* data, int n) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx < n) data[idx] = idx * 3.14f;
|
||||
}
|
||||
|
||||
extern "C" __global__ void dynamic_kernel_c(int* data, int n) {
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx < n) data[idx] = idx * idx;
|
||||
}
|
||||
)";
|
||||
|
||||
// Static kernels as fallback
|
||||
__global__ void
|
||||
test_kernel_a()
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx == 0) printf("Kernel A\n");
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kernel_b(int* data)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
data[idx] = idx * 2;
|
||||
}
|
||||
|
||||
__global__ void
|
||||
test_kernel_c(float* data, int n)
|
||||
{
|
||||
int idx = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
if(idx < n) data[idx] = idx * 3.14f;
|
||||
}
|
||||
|
||||
void
|
||||
gpu_worker(int gpu_id, int iterations)
|
||||
{
|
||||
// Set device for this thread
|
||||
HIP_CHECK(hipSetDevice(gpu_id));
|
||||
|
||||
const int n = 1024;
|
||||
|
||||
// Each thread loads modules in parallel - this triggers concurrent
|
||||
// calls to executable_freeze_internal and code object callbacks
|
||||
for(int i = 0; i < iterations; ++i)
|
||||
{
|
||||
// Use hiprtc to compile kernel source at runtime, then load the compiled module
|
||||
// This triggers parallel module loading across threads
|
||||
hiprtcProgram prog{};
|
||||
hiprtcResult rtc_result =
|
||||
hiprtcCreateProgram(&prog, kernel_code, "dynamic_kernels.cu", 0, nullptr, nullptr);
|
||||
|
||||
bool use_dynamic_loading = (rtc_result == HIPRTC_SUCCESS);
|
||||
|
||||
if(use_dynamic_loading)
|
||||
{
|
||||
// Compile the program
|
||||
rtc_result = hiprtcCompileProgram(prog, 0, nullptr);
|
||||
|
||||
if(rtc_result != HIPRTC_SUCCESS)
|
||||
{
|
||||
size_t log_size{0};
|
||||
hiprtcGetProgramLogSize(prog, &log_size);
|
||||
if(log_size > 1)
|
||||
{
|
||||
auto log = std::vector<char>(log_size);
|
||||
hiprtcGetProgramLog(prog, log.data());
|
||||
std::cerr << "Compilation failed:\n" << log.data() << "\n";
|
||||
}
|
||||
use_dynamic_loading = false;
|
||||
}
|
||||
}
|
||||
|
||||
if(use_dynamic_loading)
|
||||
{
|
||||
// Get the compiled code
|
||||
size_t code_size{0};
|
||||
hiprtcGetCodeSize(prog, &code_size);
|
||||
|
||||
std::vector<char> code(code_size);
|
||||
hiprtcGetCode(prog, code.data());
|
||||
hiprtcDestroyProgram(&prog);
|
||||
|
||||
// Load the compiled module - THIS is what triggers the parallel code object loading
|
||||
hipModule_t module;
|
||||
hipError_t err = hipModuleLoadData(&module, code.data());
|
||||
|
||||
if(err == hipSuccess)
|
||||
{
|
||||
// Get functions from the module
|
||||
hipFunction_t func_a, func_b, func_c;
|
||||
HIP_CHECK(hipModuleGetFunction(&func_a, module, "dynamic_kernel_a"));
|
||||
HIP_CHECK(hipModuleGetFunction(&func_b, module, "dynamic_kernel_b"));
|
||||
HIP_CHECK(hipModuleGetFunction(&func_c, module, "dynamic_kernel_c"));
|
||||
|
||||
// Allocate device memory
|
||||
int* d_int_data{nullptr};
|
||||
float* d_float_data{nullptr};
|
||||
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
|
||||
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
|
||||
|
||||
// Launch kernels using module functions
|
||||
int n_arg = n;
|
||||
void* args_a[] = {&d_int_data, &n_arg};
|
||||
HIP_CHECK(hipModuleLaunchKernel(
|
||||
func_a, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_a, nullptr));
|
||||
|
||||
void* args_b[] = {&d_float_data, &n_arg};
|
||||
HIP_CHECK(hipModuleLaunchKernel(
|
||||
func_b, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_b, nullptr));
|
||||
|
||||
void* args_c[] = {&d_int_data, &n_arg};
|
||||
HIP_CHECK(hipModuleLaunchKernel(
|
||||
func_c, (n + 255) / 256, 1, 1, 256, 1, 1, 0, 0, args_c, nullptr));
|
||||
}
|
||||
else
|
||||
{
|
||||
use_dynamic_loading = false;
|
||||
}
|
||||
}
|
||||
|
||||
if(!use_dynamic_loading)
|
||||
{
|
||||
// Fallback to regular kernel launches if module loading fails
|
||||
int* d_int_data{nullptr};
|
||||
float* d_float_data{nullptr};
|
||||
HIP_CHECK(hipMalloc(&d_int_data, n * sizeof(int)));
|
||||
HIP_CHECK(hipMalloc(&d_float_data, n * sizeof(float)));
|
||||
|
||||
test_kernel_a<<<1, 64>>>();
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
test_kernel_b<<<(n + 255) / 256, 256>>>(d_int_data);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
|
||||
test_kernel_c<<<(n + 255) / 256, 256>>>(d_float_data, n);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
std::cout << "Multi-Threaded Code Object Loading Test\n";
|
||||
|
||||
int num_gpus = 0;
|
||||
HIP_CHECK(hipGetDeviceCount(&num_gpus));
|
||||
|
||||
if(num_gpus == 0)
|
||||
{
|
||||
std::cerr << "No GPUs found. Test requires at least one GPU.\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "Found " << num_gpus << " GPU(s)\n";
|
||||
|
||||
int num_threads =
|
||||
std::thread::hardware_concurrency(); // More threads than GPUs to increase contention
|
||||
int threads_per_gpu = num_threads / num_gpus;
|
||||
std::cout << "Launching " << num_threads << " threads\n";
|
||||
|
||||
constexpr int iterations = 3;
|
||||
|
||||
// Create worker threads
|
||||
auto threads = std::vector<std::thread>{};
|
||||
threads.reserve(num_threads);
|
||||
|
||||
for(int gpu_id = 0; gpu_id < num_gpus; ++gpu_id)
|
||||
{
|
||||
for(int thread_id = 0; thread_id < threads_per_gpu; ++thread_id)
|
||||
{
|
||||
threads.emplace_back(gpu_worker, gpu_id, iterations);
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for all threads to complete
|
||||
for(auto& t : threads)
|
||||
{
|
||||
t.join();
|
||||
}
|
||||
|
||||
std::cout << "Test completed successfully\n";
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
#
|
||||
# Multi-threaded code object loading test
|
||||
#
|
||||
cmake_minimum_required(VERSION 3.21.0 FATAL_ERROR)
|
||||
|
||||
project(rocprofiler-sdk-tests-code-object-multi-threaded LANGUAGES CXX)
|
||||
|
||||
find_package(rocprofiler-sdk REQUIRED)
|
||||
|
||||
# Build the client tool library
|
||||
add_library(code-object-multithread-client SHARED)
|
||||
target_sources(code-object-multithread-client PRIVATE client.cpp)
|
||||
target_link_libraries(code-object-multithread-client
|
||||
PRIVATE rocprofiler-sdk::rocprofiler-sdk)
|
||||
|
||||
if(ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE)
|
||||
set(PRELOAD_ENV
|
||||
"${ROCPROFILER_MEMCHECK_PRELOAD_ENV_VALUE}:$<TARGET_FILE:code-object-multithread-client>"
|
||||
)
|
||||
else()
|
||||
set(PRELOAD_ENV "$<TARGET_FILE:code-object-multithread-client>")
|
||||
endif()
|
||||
|
||||
# Register the test with CTest
|
||||
rocprofiler_add_integration_execute_test(
|
||||
test-code-object-multithread
|
||||
COMMAND $<TARGET_FILE:code-object-multithread-testapp>
|
||||
DEPENDS code-object-multithread-client
|
||||
TIMEOUT 120
|
||||
LABELS "integration-tests"
|
||||
PRELOAD "${PRELOAD_ENV}"
|
||||
ENVIRONMENT
|
||||
"LD_LIBRARY_PATH=$<TARGET_FILE_DIR:rocprofiler-sdk::rocprofiler-sdk-shared-library>:$ENV{LD_LIBRARY_PATH}"
|
||||
PASS_REGULAR_EXPRESSION
|
||||
"Test PASSED: Successfully traced multi-threaded code object loading!"
|
||||
FAIL_REGULAR_EXPRESSION
|
||||
"${ROCPROFILER_DEFAULT_FAIL_REGEX}|SEGFAULT|Failed to \.*|Context is not valid|ERROR: \.*"
|
||||
)
|
||||
@@ -0,0 +1,202 @@
|
||||
// MIT License
|
||||
//
|
||||
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in all
|
||||
// copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
// SOFTWARE.
|
||||
|
||||
/**
|
||||
* @file tests/bin/code-object-multi-threaded/client.cpp
|
||||
*
|
||||
* @brief ROCProfiler tool that tracks code object operations
|
||||
*
|
||||
* This tool uses rocprofiler-sdk callback tracing to monitor code object
|
||||
* load/unload operations during multi-threaded execution, verifying that
|
||||
* the thread-safety fixes work correctly.
|
||||
*/
|
||||
|
||||
#include <rocprofiler-sdk/callback_tracing.h>
|
||||
#include <rocprofiler-sdk/fwd.h>
|
||||
#include <rocprofiler-sdk/registration.h>
|
||||
#include <rocprofiler-sdk/rocprofiler.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdint>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace
|
||||
{
|
||||
using code_obj_load_data_t = rocprofiler_callback_tracing_code_object_load_data_t;
|
||||
using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t;
|
||||
|
||||
// Thread-safe counters
|
||||
std::atomic<uint64_t> g_code_object_load_count{0};
|
||||
std::atomic<uint64_t> g_code_object_unload_count{0};
|
||||
std::atomic<uint64_t> g_kernel_symbol_load_count{0};
|
||||
std::atomic<uint64_t> g_kernel_symbol_unload_count{0};
|
||||
|
||||
// Track which threads invoke callbacks
|
||||
std::mutex g_thread_mutex;
|
||||
std::unordered_map<uint64_t, uint64_t> g_thread_callback_counts;
|
||||
|
||||
rocprofiler_client_id_t* client_id = nullptr;
|
||||
rocprofiler_client_finalize_t client_fini_func = nullptr;
|
||||
rocprofiler_context_id_t client_ctx = {};
|
||||
|
||||
void
|
||||
codeobj_tracing_callback(rocprofiler_callback_tracing_record_t record,
|
||||
rocprofiler_user_data_t* /* user_data */,
|
||||
void* /* callback_data */)
|
||||
{
|
||||
if(record.kind != ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT) return;
|
||||
|
||||
// Track thread activity
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_thread_mutex);
|
||||
g_thread_callback_counts[record.thread_id]++;
|
||||
}
|
||||
|
||||
if(record.operation == ROCPROFILER_CODE_OBJECT_LOAD)
|
||||
{
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
g_code_object_load_count.fetch_add(1, std::memory_order_relaxed);
|
||||
}
|
||||
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
{
|
||||
g_code_object_unload_count.fetch_add(1, std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
else if(record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER)
|
||||
{
|
||||
if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD)
|
||||
{
|
||||
g_kernel_symbol_load_count.fetch_add(1, std::memory_order_relaxed);
|
||||
}
|
||||
else if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD)
|
||||
{
|
||||
g_kernel_symbol_unload_count.fetch_add(1, std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
tool_fini(void* /* tool_data */)
|
||||
{
|
||||
std::cout << "\n=== ROCProfiler Multi-Threaded Code Object Test Results ===\n";
|
||||
std::cout << "Code objects loaded: " << g_code_object_load_count.load() << "\n";
|
||||
std::cout << "Code objects unloaded: " << g_code_object_unload_count.load() << "\n";
|
||||
std::cout << "Kernel symbols loaded: " << g_kernel_symbol_load_count.load() << "\n";
|
||||
std::cout << "Kernel symbols unloaded: " << g_kernel_symbol_unload_count.load() << "\n";
|
||||
|
||||
size_t num_threads_with_callbacks = 0;
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(g_thread_mutex);
|
||||
num_threads_with_callbacks = g_thread_callback_counts.size();
|
||||
std::cout << "Threads that invoked callbacks: " << num_threads_with_callbacks << "\n";
|
||||
}
|
||||
std::cout << "===========================================================\n";
|
||||
|
||||
// Verify we actually traced something
|
||||
if(g_code_object_load_count.load() == 0)
|
||||
{
|
||||
std::cerr << "ERROR: No code objects were traced!\n";
|
||||
std::abort();
|
||||
}
|
||||
|
||||
if(g_kernel_symbol_load_count.load() == 0)
|
||||
{
|
||||
std::cerr << "ERROR: No kernel symbols were traced!\n";
|
||||
std::abort();
|
||||
}
|
||||
|
||||
// Verify multi-threaded execution
|
||||
if(num_threads_with_callbacks < 2)
|
||||
{
|
||||
std::cerr << "ERROR: Expected callbacks from multiple threads, got "
|
||||
<< num_threads_with_callbacks << "\n";
|
||||
std::abort();
|
||||
}
|
||||
|
||||
std::cout << "Test PASSED: Successfully traced multi-threaded code object loading!\n";
|
||||
}
|
||||
|
||||
int
|
||||
tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data)
|
||||
{
|
||||
client_fini_func = fini_func;
|
||||
|
||||
auto status = rocprofiler_create_context(&client_ctx);
|
||||
if(status != ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
std::cerr << "Failed to create context\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
status =
|
||||
rocprofiler_configure_callback_tracing_service(client_ctx,
|
||||
ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT,
|
||||
nullptr,
|
||||
0,
|
||||
codeobj_tracing_callback,
|
||||
tool_data);
|
||||
|
||||
if(status != ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
std::cerr << "Failed to configure code object tracing\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
int valid_ctx = 0;
|
||||
status = rocprofiler_context_is_valid(client_ctx, &valid_ctx);
|
||||
if(status != ROCPROFILER_STATUS_SUCCESS || valid_ctx == 0)
|
||||
{
|
||||
std::cerr << "Context is not valid\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
status = rocprofiler_start_context(client_ctx);
|
||||
if(status != ROCPROFILER_STATUS_SUCCESS)
|
||||
{
|
||||
std::cerr << "Failed to start context\n";
|
||||
return -1;
|
||||
}
|
||||
|
||||
std::cout << "ROCProfiler multi-threaded code object tool initialized\n";
|
||||
return 0;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
extern "C" rocprofiler_tool_configure_result_t*
|
||||
rocprofiler_configure(uint32_t /* version */,
|
||||
const char* /* runtime_version */,
|
||||
uint32_t /* priority */,
|
||||
rocprofiler_client_id_t* id)
|
||||
{
|
||||
id->name = "CodeObjectMultiThreadedClient";
|
||||
client_id = id;
|
||||
|
||||
static auto cfg = rocprofiler_tool_configure_result_t{
|
||||
sizeof(rocprofiler_tool_configure_result_t), &tool_init, &tool_fini, nullptr};
|
||||
|
||||
return &cfg;
|
||||
}
|
||||
Reference in New Issue
Block a user