diff --git a/projects/rocprofiler-systems/examples/fork/CMakeLists.txt b/projects/rocprofiler-systems/examples/fork/CMakeLists.txt index c2448a7012..5a54900e61 100644 --- a/projects/rocprofiler-systems/examples/fork/CMakeLists.txt +++ b/projects/rocprofiler-systems/examples/fork/CMakeLists.txt @@ -8,6 +8,8 @@ list(APPEND _FLAGS -fno-inline) find_package(Threads REQUIRED) find_package(rocprofiler-systems REQUIRED COMPONENTS user) + +# Basic fork example add_executable(fork-example fork.cpp) target_link_libraries( fork-example @@ -18,3 +20,83 @@ target_compile_options(fork-example PRIVATE ${_FLAGS}) if(ROCPROFSYS_INSTALL_EXAMPLES) install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples) endif() + +# HIP fork example (multi-process concurrency test) +find_package(hip QUIET HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR}) + +find_program( + HIPCC_EXECUTABLE + NAMES hipcc + HINTS ${ROCmVersion_DIR} ${ROCM_PATH} + ENV ROCM_PATH + /opt/rocm + PATHS ${ROCmVersion_DIR} ${ROCM_PATH} + ENV ROCM_PATH + /opt/rocm + NO_CACHE +) +mark_as_advanced(HIPCC_EXECUTABLE) + +if(HIPCC_EXECUTABLE) + if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE) + if( + CMAKE_CXX_COMPILER STREQUAL HIPCC_EXECUTABLE + OR "${CMAKE_CXX_COMPILER}" MATCHES "hipcc" + ) + set(CMAKE_CXX_COMPILER_IS_HIPCC 1 CACHE BOOL "HIP compiler") + endif() + endif() + + if( + CMAKE_CXX_COMPILER_IS_HIPCC + OR hip_FOUND + OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND HIPCC_EXECUTABLE) + OR COMMAND rocprofiler_systems_custom_compilation + ) + add_executable(hipMallocConcurrencyMproc hipMallocConcurrencyMproc.cpp) + target_link_libraries(hipMallocConcurrencyMproc PRIVATE Threads::Threads) + + if( + CMAKE_CXX_COMPILER_ID MATCHES "Clang" + AND NOT CMAKE_CXX_COMPILER_IS_HIPCC + AND NOT HIPCC_EXECUTABLE + ) + target_link_libraries( + hipMallocConcurrencyMproc + PRIVATE + $ + $ + $ + ) + else() + target_compile_options(hipMallocConcurrencyMproc PRIVATE -W -Wall) + endif() + + if("${CMAKE_BUILD_TYPE}" MATCHES "Release") + target_compile_options(hipMallocConcurrencyMproc PRIVATE -g1) + endif() + + if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE) + # defined in MacroUtilities.cmake + rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc) + endif() + + if(ROCPROFSYS_INSTALL_EXAMPLES) + install( + TARGETS hipMallocConcurrencyMproc + DESTINATION bin + COMPONENT rocprofiler-systems-examples + ) + endif() + else() + message( + AUTHOR_WARNING + "hipMallocConcurrencyMproc target could not be built (missing HIP support)" + ) + endif() +else() + message( + AUTHOR_WARNING + "hipcc could not be found. Cannot build hipMallocConcurrencyMproc target" + ) +endif() diff --git a/projects/rocprofiler-systems/examples/fork/hipMallocConcurrencyMproc.cpp b/projects/rocprofiler-systems/examples/fork/hipMallocConcurrencyMproc.cpp new file mode 100644 index 0000000000..6eb95ad233 --- /dev/null +++ b/projects/rocprofiler-systems/examples/fork/hipMallocConcurrencyMproc.cpp @@ -0,0 +1,432 @@ +/* +Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "hip/hip_runtime.h" +#include +#include +#include +#include +#include + +#define REQUIRE assert +#define HIP_CHECK(cmd) \ + { \ + hipError_t error = cmd; \ + if(error != hipSuccess) \ + { \ + fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \ + error, __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } \ + } +#define TOL 0.001 + +namespace HipTest +{ +// Setters and Memory Management + +template +void +setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h) +{ + // Initialize the host data: + + for(size_t i = 0; i < numElements; i++) + { + if(std::is_same::value || std::is_same::value) + { + if(A_h) A_h[i] = 3; + if(B_h) B_h[i] = 4; + if(C_h) C_h[i] = 5; + } + else if(std::is_same::value || std::is_same::value) + { + if(A_h) A_h[i] = 'a'; + if(B_h) B_h[i] = 'b'; + if(C_h) C_h[i] = 'c'; + } + else + { + if(A_h) A_h[i] = 3.146f + i; + if(B_h) B_h[i] = 1.618f + i; + if(C_h) C_h[i] = 1.4f + i; + } + } +} + +template +bool +initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false) +{ + size_t Nbytes = N * sizeof(T); + + if(usePinnedHost) + { + if(A_h) + { + HIP_CHECK(hipHostMalloc((void**) A_h, Nbytes)); + } + if(B_h) + { + HIP_CHECK(hipHostMalloc((void**) B_h, Nbytes)); + } + if(C_h) + { + HIP_CHECK(hipHostMalloc((void**) C_h, Nbytes)); + } + } + else + { + if(A_h) + { + *A_h = (T*) malloc(Nbytes); + REQUIRE(*A_h != nullptr); + } + + if(B_h) + { + *B_h = (T*) malloc(Nbytes); + REQUIRE(*B_h != nullptr); + } + + if(C_h) + { + *C_h = (T*) malloc(Nbytes); + REQUIRE(*C_h != nullptr); + } + } + + setDefaultData(N, A_h ? *A_h : nullptr, B_h ? *B_h : nullptr, C_h ? *C_h : nullptr); + return true; +} + +template +bool +initArrays(T** A_d, T** B_d, T** C_d, T** A_h, T** B_h, T** C_h, size_t N, + bool usePinnedHost = false) +{ + size_t Nbytes = N * sizeof(T); + + if(A_d) + { + HIP_CHECK(hipMalloc(A_d, Nbytes)); + } + if(B_d) + { + HIP_CHECK(hipMalloc(B_d, Nbytes)); + } + if(C_d) + { + HIP_CHECK(hipMalloc(C_d, Nbytes)); + } + + return initArraysForHost(A_h, B_h, C_h, N, usePinnedHost); +} + +template +bool +freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost) +{ + if(usePinnedHost) + { + if(A_h) + { + HIP_CHECK(hipHostFree(A_h)); + } + if(B_h) + { + HIP_CHECK(hipHostFree(B_h)); + } + if(C_h) + { + HIP_CHECK(hipHostFree(C_h)); + } + } + else + { + if(A_h) + { + free(A_h); + } + if(B_h) + { + free(B_h); + } + if(C_h) + { + free(C_h); + } + } + return true; +} + +template +bool +freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost) +{ + if(A_d) + { + HIP_CHECK(hipFree(A_d)); + } + if(B_d) + { + HIP_CHECK(hipFree(B_d)); + } + if(C_d) + { + HIP_CHECK(hipFree(C_d)); + } + + return freeArraysForHost(A_h, B_h, C_h, usePinnedHost); +} + +static inline unsigned +setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N) +{ + int device{ 0 }; + HIP_CHECK(hipGetDevice(&device)); + hipDeviceProp_t props{}; + HIP_CHECK(hipGetDeviceProperties(&props, device)); + + unsigned blocks = props.multiProcessorCount * blocksPerCU; + if(blocks * threadsPerBlock < N) + { + blocks = (N + threadsPerBlock - 1) / threadsPerBlock; + } + + return blocks; +} + +template +__global__ void +vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM) +{ + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x; + + for(size_t i = offset; i < NELEM; i += stride) + { + C_d[i] = A_d[i] + B_d[i]; + } +} + +template +size_t +checkVectors(T* A, T* B, T* Out, size_t N, T (*F)(T a, T b), bool expectMatch = true, + bool reportMismatch = true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for(size_t i = 0; i < N; i++) + { + T expected = F(A[i], B[i]); + if(std::fabs(Out[i] - expected) > TOL) + { + if(mismatchCount == 0) + { + firstMismatch = i; + } + mismatchCount++; + if((mismatchCount <= mismatchesToPrint) && expectMatch) + { + std::cout << "Mismatch at " << i << " Computed: " << Out[i] + << " Expected: " << expected << std::endl; + REQUIRE(false); + } + } + } + + if(reportMismatch) + { + if(expectMatch) + { + if(mismatchCount) + { + std::cout << mismatchCount + << " Mismatches First Mismatch at index : " << firstMismatch + << std::endl; + REQUIRE(false); + } + } + else + { + if(mismatchCount == 0) + { + std::cout << "Expected Mismatch but not found any" << std::endl; + REQUIRE(false); + } + } + } + + return mismatchCount; +} + +template +size_t +checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch = true, + bool reportMismatch = true) +{ + return checkVectors( + A_h, B_h, result_H, N, [](T a, T b) { return a + b; }, expectMatch, + reportMismatch); +} +} // namespace HipTest + +/** + * Validates data consistency on supplied gpu + */ +static bool +validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false) +{ + // Check if any ROCm-capable GPU is available (for CI without GPU) + int deviceCount = 0; + hipError_t err = hipGetDeviceCount(&deviceCount); + if(err != hipSuccess || deviceCount == 0) + { + printf("No ROCm-capable device detected. Validation PASSED (skipped)\n"); + return true; // Return success for CI environments + } + + int * A_d, *B_d, *C_d; + int * A_h, *B_h, *C_h; + size_t prevAvl, prevTot, curAvl, curTot; + bool TestPassed = true; + constexpr auto N = 4 * 1024 * 1024; + constexpr auto blocksPerCU = 6; // to hide latency + constexpr auto threadsPerBlock = 256; + size_t Nbytes = N * sizeof(int); + + HIP_CHECK(hipSetDevice(gpu)); + HIP_CHECK(hipMemGetInfo(&prevAvl, &prevTot)); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIP_CHECK(hipMemGetInfo(&curAvl, &curTot)); + + if(!concurOnOneGPU && (prevAvl < curAvl || prevTot != curTot)) + { + // In concurrent calls on one GPU, we cannot verify leaking in this way + printf("%s : Memory allocation mismatch observed." + "Possible memory leak.\n", + __func__); + TestPassed &= false; + } + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + + HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + + hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, + static_cast(A_d), static_cast(B_d), C_d, + N); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + + if(!HipTest::checkVectorADD(A_h, B_h, C_h, N)) + { + printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid()); + } + else + { + printf("Validation FAILED for gpu %d from pid %d\n", gpu, getpid()); + TestPassed = false; + } + + HIP_CHECK(hipMemGetInfo(&prevAvl, &prevTot)); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIP_CHECK(hipMemGetInfo(&curAvl, &curTot)); + + if(!concurOnOneGPU && (curAvl < prevAvl || prevTot != curTot)) + { + // In concurrent calls on one GPU, we cannot verify leaking in this way + std::cout << "validateMemoryOnGPU : Memory allocation mismatch observed." + << "Possible memory leak." << std::endl; + TestPassed = false; + } + + if(!concurOnOneGPU && (prevAvl != curAvl || prevTot != curTot)) + { + // In concurrent calls on one GPU, we cannot verify leaking in this way + printf("%s : Memory allocation mismatch observed." + "Possible memory leak.\n", + __func__); + TestPassed = false; + } + + return TestPassed; +} + +/** + * Parallel execution of parent and child on gpu0 + */ +void +Unit_hipMalloc_ChildConcurrencyDefaultGpu() +{ + int pid = 0; + constexpr auto resSuccess = 0, resFailure = 1; + + if((pid = fork()) < 0) + { + std::cout << "Child_Concurrency_DefaultGpu : fork() returned error : " << pid + << std::endl; + REQUIRE(false); + } + else if(!pid) + { // Child process + bool TestPassedChild = false; + + // Allocates and validates memory on Gpu0 simultaneously with parent + TestPassedChild = validateMemoryOnGPU(0, true); + + if(TestPassedChild) + { + exit(resSuccess); // child exit with success status + } + else + { + exit(resFailure); // child exit with failure status + } + } + else + { // Parent process + int exitStatus; + + // Allocates and validates memory on Gpu0 simultaneously with child + bool TestPassed = validateMemoryOnGPU(0, true); + + // Wait and get result from child + pid = wait(&exitStatus); + if((WEXITSTATUS(exitStatus) == resFailure) || (pid < 0)) TestPassed = false; + + // Explicitly use the variable to avoid compiler warning + (void) TestPassed; + REQUIRE(TestPassed == true); + } +} + +int +main() +{ + Unit_hipMalloc_ChildConcurrencyDefaultGpu(); + std::cout << "Unit_hipMalloc_ChildConcurrencyDefaultGpu PASSED!" << std::endl; + return 0; +} diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp index 5a7b1edcc3..42b9d571ec 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.cpp @@ -1292,6 +1292,35 @@ device_count() { return gpu::device_count(); } + +void +postfork_child_cleanup() +{ + // In child process, disable AMD SMI to prevent shutdown errors + ROCPROFSYS_VERBOSE_F(2, "Disabling AMD SMI in child process after fork...\n"); + + // Set to Finalized to prevent any sampling attempts (though is_child_process() check + // in sample() already handles this) + get_state().store(State::Finalized); + + // Mark as not initialized so shutdown won't try to cleanup AMD SMI library + is_initialized() = false; + + // Clear device list to prevent any GPU operations + data::device_list.clear(); +} + +void +postfork_parent_reinit() +{ + // In parent process, AMD SMI device handles may be corrupted after fork + // Reinitialize AMD SMI to get fresh handles + ROCPROFSYS_VERBOSE_F(2, "Reinitializing AMD SMI in parent process after fork...\n"); + + // Shutdown and reinitialize to get fresh device handles + shutdown(); + setup(); +} } // namespace amd_smi } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp index 826b9301ed..044b6ae8a3 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/amd_smi.hpp @@ -71,6 +71,14 @@ post_process(); void set_state(State); +// Fork handling - cleanup AMD SMI state in child process +void +postfork_child_cleanup(); + +// Fork handling - reinitialize AMD SMI state in parent process +void +postfork_parent_reinit(); + struct settings { bool busy = true; @@ -133,6 +141,7 @@ private: friend void rocprofsys::amd_smi::sample(); friend void rocprofsys::amd_smi::shutdown(); friend void rocprofsys::amd_smi::post_process(); + friend void rocprofsys::amd_smi::postfork_child_cleanup(); static size_t device_count; static std::set device_list; @@ -168,6 +177,14 @@ post_process() inline void set_state(State) {} + +inline void +postfork_child_cleanup() +{} + +inline void +postfork_parent_reinit() +{} #endif } // namespace amd_smi } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp index 3cf4730a10..656697d73b 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/exit_gotcha.cpp @@ -60,7 +60,7 @@ invoke_exit_gotcha(const exit_gotcha::gotcha_data& _data, FuncT _func, Args... _ { threading::clear_callbacks(); - if(get_state() < State::Finalized) + if(get_state() < State::Finalized && !is_child_process()) { if(config::settings_are_configured()) { diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/fork_gotcha.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/fork_gotcha.cpp index af2152b921..d77fd6de8b 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/fork_gotcha.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/components/fork_gotcha.cpp @@ -97,6 +97,11 @@ postfork_parent() { if(postfork_parent_lock) return; + // Reinitialize AMD SMI in parent process to get fresh device handles before + // unblocking the shutdown/setup transition. AMD SMI device handles may be corrupted + // after fork. + if(config::get_use_sampling()) sampling::postfork_parent_reinit(); + rocprofsys::categories::enable_categories(config::get_enabled_categories()); if(config::get_use_sampling()) sampling::unblock_samples(); @@ -115,6 +120,11 @@ postfork_child() << "Error! child process " << process::get_id() << " believes it is the root process " << get_root_process_id() << "\n"; + set_state(State::Finalized); + + // Clean up AMD SMI in child process before other shutdowns + if(config::get_use_sampling()) sampling::postfork_child_cleanup(); + settings::enabled() = false; settings::verbose() = -127; settings::debug() = false; diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp index 1c1be80192..8b98777b7e 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.cpp @@ -32,6 +32,7 @@ #include "core/state.hpp" #include "core/trace_cache/cache_manager.hpp" #include "core/utility.hpp" +#include "library/amd_smi.hpp" #include "library/components/backtrace.hpp" #include "library/components/backtrace_metrics.hpp" #include "library/components/backtrace_timestamp.hpp" @@ -1835,6 +1836,19 @@ struct sampling_initialization } }; } // namespace +void +postfork_parent_reinit() +{ + if(config::get_use_process_sampling() && config::get_use_amd_smi()) + amd_smi::postfork_parent_reinit(); +} + +void +postfork_child_cleanup() +{ + if(config::get_use_process_sampling() && config::get_use_amd_smi()) + amd_smi::postfork_child_cleanup(); +} } // namespace sampling } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.hpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.hpp index a3bd3fedd9..387d778cc0 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.hpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/sampling.hpp @@ -65,5 +65,11 @@ void unblock_signals(std::set = {}); void post_process(); + +void +postfork_parent_reinit(); + +void +postfork_child_cleanup(); } // namespace sampling } // namespace rocprofsys diff --git a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_deleter.cpp b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_deleter.cpp index ffc6357a10..604c9f5c65 100644 --- a/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_deleter.cpp +++ b/projects/rocprofiler-systems/source/lib/rocprof-sys/library/thread_deleter.cpp @@ -24,6 +24,7 @@ #include "api.hpp" #include "core/utility.hpp" #include "library/components/pthread_create_gotcha.hpp" +#include "library/runtime.hpp" #include "library/thread_info.hpp" #include @@ -44,9 +45,10 @@ thread_deleter::operator()() const { auto _tid = _info->index_data->sequent_value; - component::pthread_create_gotcha::shutdown(_tid); + if(!is_child_process()) component::pthread_create_gotcha::shutdown(_tid); set_thread_state(ThreadState::Completed); - if(get_state() < State::Finalized && _tid == 0) rocprofsys_finalize_hidden(); + if(get_state() < State::Finalized && !is_child_process() && _tid == 0) + rocprofsys_finalize_hidden(); } else { diff --git a/projects/rocprofiler-systems/tests/rocprof-sys-fork-tests.cmake b/projects/rocprofiler-systems/tests/rocprof-sys-fork-tests.cmake index 700e8d4cd6..7424eeda04 100644 --- a/projects/rocprofiler-systems/tests/rocprof-sys-fork-tests.cmake +++ b/projects/rocprofiler-systems/tests/rocprof-sys-fork-tests.cmake @@ -40,3 +40,19 @@ rocprofiler_systems_add_test( RUNTIME_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})" REWRITE_RUN_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})" ) + +rocprofiler_systems_add_test( + NAME fork-hipMallocConcurrency + TARGET hipMallocConcurrencyMproc + GPU ON + REWRITE_ARGS -e -v 2 --print-instrumented modules -i 16 + RUNTIME_ARGS -e -v 1 --label file -i 16 + ENVIRONMENT + "${_base_environment};ROCPROFSYS_SAMPLING_FREQ=250;ROCPROFSYS_SAMPLING_REALTIME=ON" + SAMPLING_PASS_REGEX "Validation PASSED|fork.. called on PID" + RUNTIME_PASS_REGEX "Validation PASSED|fork.. called on PID" + REWRITE_RUN_PASS_REGEX "Validation PASSED|fork.. called on PID" + SAMPLING_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})" + RUNTIME_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})" + REWRITE_RUN_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})" +)