[rocprof-sys] Fix fork() handling for GPU profiling and AMD SMI (#1930)
- Fix fork() handling for GPU profiling and AMD SMI - Add hipMallocConcurrency test for CI with GPU
This commit is contained in:
@@ -8,6 +8,8 @@ list(APPEND _FLAGS -fno-inline)
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
find_package(rocprofiler-systems REQUIRED COMPONENTS user)
|
||||
|
||||
# Basic fork example
|
||||
add_executable(fork-example fork.cpp)
|
||||
target_link_libraries(
|
||||
fork-example
|
||||
@@ -18,3 +20,83 @@ target_compile_options(fork-example PRIVATE ${_FLAGS})
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(TARGETS fork-example DESTINATION bin COMPONENT rocprofiler-systems-examples)
|
||||
endif()
|
||||
|
||||
# HIP fork example (multi-process concurrency test)
|
||||
find_package(hip QUIET HINTS ${ROCmVersion_DIR} PATHS ${ROCmVersion_DIR})
|
||||
|
||||
find_program(
|
||||
HIPCC_EXECUTABLE
|
||||
NAMES hipcc
|
||||
HINTS ${ROCmVersion_DIR} ${ROCM_PATH}
|
||||
ENV ROCM_PATH
|
||||
/opt/rocm
|
||||
PATHS ${ROCmVersion_DIR} ${ROCM_PATH}
|
||||
ENV ROCM_PATH
|
||||
/opt/rocm
|
||||
NO_CACHE
|
||||
)
|
||||
mark_as_advanced(HIPCC_EXECUTABLE)
|
||||
|
||||
if(HIPCC_EXECUTABLE)
|
||||
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
|
||||
if(
|
||||
CMAKE_CXX_COMPILER STREQUAL HIPCC_EXECUTABLE
|
||||
OR "${CMAKE_CXX_COMPILER}" MATCHES "hipcc"
|
||||
)
|
||||
set(CMAKE_CXX_COMPILER_IS_HIPCC 1 CACHE BOOL "HIP compiler")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if(
|
||||
CMAKE_CXX_COMPILER_IS_HIPCC
|
||||
OR hip_FOUND
|
||||
OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND HIPCC_EXECUTABLE)
|
||||
OR COMMAND rocprofiler_systems_custom_compilation
|
||||
)
|
||||
add_executable(hipMallocConcurrencyMproc hipMallocConcurrencyMproc.cpp)
|
||||
target_link_libraries(hipMallocConcurrencyMproc PRIVATE Threads::Threads)
|
||||
|
||||
if(
|
||||
CMAKE_CXX_COMPILER_ID MATCHES "Clang"
|
||||
AND NOT CMAKE_CXX_COMPILER_IS_HIPCC
|
||||
AND NOT HIPCC_EXECUTABLE
|
||||
)
|
||||
target_link_libraries(
|
||||
hipMallocConcurrencyMproc
|
||||
PRIVATE
|
||||
$<TARGET_NAME_IF_EXISTS:rocprofiler-systems::rocprofiler-systems-compile-options>
|
||||
$<TARGET_NAME_IF_EXISTS:hip::host>
|
||||
$<TARGET_NAME_IF_EXISTS:hip::device>
|
||||
)
|
||||
else()
|
||||
target_compile_options(hipMallocConcurrencyMproc PRIVATE -W -Wall)
|
||||
endif()
|
||||
|
||||
if("${CMAKE_BUILD_TYPE}" MATCHES "Release")
|
||||
target_compile_options(hipMallocConcurrencyMproc PRIVATE -g1)
|
||||
endif()
|
||||
|
||||
if(NOT CMAKE_CXX_COMPILER_IS_HIPCC AND HIPCC_EXECUTABLE)
|
||||
# defined in MacroUtilities.cmake
|
||||
rocprofiler_systems_custom_compilation(COMPILER ${HIPCC_EXECUTABLE} TARGET hipMallocConcurrencyMproc)
|
||||
endif()
|
||||
|
||||
if(ROCPROFSYS_INSTALL_EXAMPLES)
|
||||
install(
|
||||
TARGETS hipMallocConcurrencyMproc
|
||||
DESTINATION bin
|
||||
COMPONENT rocprofiler-systems-examples
|
||||
)
|
||||
endif()
|
||||
else()
|
||||
message(
|
||||
AUTHOR_WARNING
|
||||
"hipMallocConcurrencyMproc target could not be built (missing HIP support)"
|
||||
)
|
||||
endif()
|
||||
else()
|
||||
message(
|
||||
AUTHOR_WARNING
|
||||
"hipcc could not be found. Cannot build hipMallocConcurrencyMproc target"
|
||||
)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,432 @@
|
||||
/*
|
||||
Copyright (c) 2024 - 2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define REQUIRE assert
|
||||
#define HIP_CHECK(cmd) \
|
||||
{ \
|
||||
hipError_t error = cmd; \
|
||||
if(error != hipSuccess) \
|
||||
{ \
|
||||
fprintf(stderr, "error: '%s'(%d) at %s:%d\n", hipGetErrorString(error), \
|
||||
error, __FILE__, __LINE__); \
|
||||
exit(EXIT_FAILURE); \
|
||||
} \
|
||||
}
|
||||
#define TOL 0.001
|
||||
|
||||
namespace HipTest
|
||||
{
|
||||
// Setters and Memory Management
|
||||
|
||||
template <typename T>
|
||||
void
|
||||
setDefaultData(size_t numElements, T* A_h, T* B_h, T* C_h)
|
||||
{
|
||||
// Initialize the host data:
|
||||
|
||||
for(size_t i = 0; i < numElements; i++)
|
||||
{
|
||||
if(std::is_same<T, int>::value || std::is_same<T, unsigned int>::value)
|
||||
{
|
||||
if(A_h) A_h[i] = 3;
|
||||
if(B_h) B_h[i] = 4;
|
||||
if(C_h) C_h[i] = 5;
|
||||
}
|
||||
else if(std::is_same<T, char>::value || std::is_same<T, unsigned char>::value)
|
||||
{
|
||||
if(A_h) A_h[i] = 'a';
|
||||
if(B_h) B_h[i] = 'b';
|
||||
if(C_h) C_h[i] = 'c';
|
||||
}
|
||||
else
|
||||
{
|
||||
if(A_h) A_h[i] = 3.146f + i;
|
||||
if(B_h) B_h[i] = 1.618f + i;
|
||||
if(C_h) C_h[i] = 1.4f + i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
initArraysForHost(T** A_h, T** B_h, T** C_h, size_t N, bool usePinnedHost = false)
|
||||
{
|
||||
size_t Nbytes = N * sizeof(T);
|
||||
|
||||
if(usePinnedHost)
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
HIP_CHECK(hipHostMalloc((void**) A_h, Nbytes));
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
HIP_CHECK(hipHostMalloc((void**) B_h, Nbytes));
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
HIP_CHECK(hipHostMalloc((void**) C_h, Nbytes));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
*A_h = (T*) malloc(Nbytes);
|
||||
REQUIRE(*A_h != nullptr);
|
||||
}
|
||||
|
||||
if(B_h)
|
||||
{
|
||||
*B_h = (T*) malloc(Nbytes);
|
||||
REQUIRE(*B_h != nullptr);
|
||||
}
|
||||
|
||||
if(C_h)
|
||||
{
|
||||
*C_h = (T*) malloc(Nbytes);
|
||||
REQUIRE(*C_h != nullptr);
|
||||
}
|
||||
}
|
||||
|
||||
setDefaultData(N, A_h ? *A_h : nullptr, B_h ? *B_h : nullptr, C_h ? *C_h : nullptr);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
initArrays(T** A_d, T** B_d, T** C_d, T** A_h, T** B_h, T** C_h, size_t N,
|
||||
bool usePinnedHost = false)
|
||||
{
|
||||
size_t Nbytes = N * sizeof(T);
|
||||
|
||||
if(A_d)
|
||||
{
|
||||
HIP_CHECK(hipMalloc(A_d, Nbytes));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIP_CHECK(hipMalloc(B_d, Nbytes));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIP_CHECK(hipMalloc(C_d, Nbytes));
|
||||
}
|
||||
|
||||
return initArraysForHost(A_h, B_h, C_h, N, usePinnedHost);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
freeArraysForHost(T* A_h, T* B_h, T* C_h, bool usePinnedHost)
|
||||
{
|
||||
if(usePinnedHost)
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
HIP_CHECK(hipHostFree(A_h));
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
HIP_CHECK(hipHostFree(B_h));
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
HIP_CHECK(hipHostFree(C_h));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(A_h)
|
||||
{
|
||||
free(A_h);
|
||||
}
|
||||
if(B_h)
|
||||
{
|
||||
free(B_h);
|
||||
}
|
||||
if(C_h)
|
||||
{
|
||||
free(C_h);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool
|
||||
freeArrays(T* A_d, T* B_d, T* C_d, T* A_h, T* B_h, T* C_h, bool usePinnedHost)
|
||||
{
|
||||
if(A_d)
|
||||
{
|
||||
HIP_CHECK(hipFree(A_d));
|
||||
}
|
||||
if(B_d)
|
||||
{
|
||||
HIP_CHECK(hipFree(B_d));
|
||||
}
|
||||
if(C_d)
|
||||
{
|
||||
HIP_CHECK(hipFree(C_d));
|
||||
}
|
||||
|
||||
return freeArraysForHost(A_h, B_h, C_h, usePinnedHost);
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
setNumBlocks(unsigned blocksPerCU, unsigned threadsPerBlock, size_t N)
|
||||
{
|
||||
int device{ 0 };
|
||||
HIP_CHECK(hipGetDevice(&device));
|
||||
hipDeviceProp_t props{};
|
||||
HIP_CHECK(hipGetDeviceProperties(&props, device));
|
||||
|
||||
unsigned blocks = props.multiProcessorCount * blocksPerCU;
|
||||
if(blocks * threadsPerBlock < N)
|
||||
{
|
||||
blocks = (N + threadsPerBlock - 1) / threadsPerBlock;
|
||||
}
|
||||
|
||||
return blocks;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__global__ void
|
||||
vectorADD(const T* A_d, const T* B_d, T* C_d, size_t NELEM)
|
||||
{
|
||||
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
|
||||
size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
for(size_t i = offset; i < NELEM; i += stride)
|
||||
{
|
||||
C_d[i] = A_d[i] + B_d[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t
|
||||
checkVectors(T* A, T* B, T* Out, size_t N, T (*F)(T a, T b), bool expectMatch = true,
|
||||
bool reportMismatch = true)
|
||||
{
|
||||
size_t mismatchCount = 0;
|
||||
size_t firstMismatch = 0;
|
||||
size_t mismatchesToPrint = 10;
|
||||
for(size_t i = 0; i < N; i++)
|
||||
{
|
||||
T expected = F(A[i], B[i]);
|
||||
if(std::fabs(Out[i] - expected) > TOL)
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
firstMismatch = i;
|
||||
}
|
||||
mismatchCount++;
|
||||
if((mismatchCount <= mismatchesToPrint) && expectMatch)
|
||||
{
|
||||
std::cout << "Mismatch at " << i << " Computed: " << Out[i]
|
||||
<< " Expected: " << expected << std::endl;
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(reportMismatch)
|
||||
{
|
||||
if(expectMatch)
|
||||
{
|
||||
if(mismatchCount)
|
||||
{
|
||||
std::cout << mismatchCount
|
||||
<< " Mismatches First Mismatch at index : " << firstMismatch
|
||||
<< std::endl;
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(mismatchCount == 0)
|
||||
{
|
||||
std::cout << "Expected Mismatch but not found any" << std::endl;
|
||||
REQUIRE(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return mismatchCount;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
size_t
|
||||
checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch = true,
|
||||
bool reportMismatch = true)
|
||||
{
|
||||
return checkVectors<T>(
|
||||
A_h, B_h, result_H, N, [](T a, T b) { return a + b; }, expectMatch,
|
||||
reportMismatch);
|
||||
}
|
||||
} // namespace HipTest
|
||||
|
||||
/**
|
||||
* Validates data consistency on supplied gpu
|
||||
*/
|
||||
static bool
|
||||
validateMemoryOnGPU(int gpu, bool concurOnOneGPU = false)
|
||||
{
|
||||
// Check if any ROCm-capable GPU is available (for CI without GPU)
|
||||
int deviceCount = 0;
|
||||
hipError_t err = hipGetDeviceCount(&deviceCount);
|
||||
if(err != hipSuccess || deviceCount == 0)
|
||||
{
|
||||
printf("No ROCm-capable device detected. Validation PASSED (skipped)\n");
|
||||
return true; // Return success for CI environments
|
||||
}
|
||||
|
||||
int * A_d, *B_d, *C_d;
|
||||
int * A_h, *B_h, *C_h;
|
||||
size_t prevAvl, prevTot, curAvl, curTot;
|
||||
bool TestPassed = true;
|
||||
constexpr auto N = 4 * 1024 * 1024;
|
||||
constexpr auto blocksPerCU = 6; // to hide latency
|
||||
constexpr auto threadsPerBlock = 256;
|
||||
size_t Nbytes = N * sizeof(int);
|
||||
|
||||
HIP_CHECK(hipSetDevice(gpu));
|
||||
HIP_CHECK(hipMemGetInfo(&prevAvl, &prevTot));
|
||||
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
|
||||
HIP_CHECK(hipMemGetInfo(&curAvl, &curTot));
|
||||
|
||||
if(!concurOnOneGPU && (prevAvl < curAvl || prevTot != curTot))
|
||||
{
|
||||
// In concurrent calls on one GPU, we cannot verify leaking in this way
|
||||
printf("%s : Memory allocation mismatch observed."
|
||||
"Possible memory leak.\n",
|
||||
__func__);
|
||||
TestPassed &= false;
|
||||
}
|
||||
|
||||
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
|
||||
|
||||
HIP_CHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
|
||||
HIP_CHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
|
||||
|
||||
hipLaunchKernelGGL(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0,
|
||||
static_cast<const int*>(A_d), static_cast<const int*>(B_d), C_d,
|
||||
N);
|
||||
HIP_CHECK(hipGetLastError());
|
||||
HIP_CHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
|
||||
|
||||
if(!HipTest::checkVectorADD(A_h, B_h, C_h, N))
|
||||
{
|
||||
printf("Validation PASSED for gpu %d from pid %d\n", gpu, getpid());
|
||||
}
|
||||
else
|
||||
{
|
||||
printf("Validation FAILED for gpu %d from pid %d\n", gpu, getpid());
|
||||
TestPassed = false;
|
||||
}
|
||||
|
||||
HIP_CHECK(hipMemGetInfo(&prevAvl, &prevTot));
|
||||
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
|
||||
HIP_CHECK(hipMemGetInfo(&curAvl, &curTot));
|
||||
|
||||
if(!concurOnOneGPU && (curAvl < prevAvl || prevTot != curTot))
|
||||
{
|
||||
// In concurrent calls on one GPU, we cannot verify leaking in this way
|
||||
std::cout << "validateMemoryOnGPU : Memory allocation mismatch observed."
|
||||
<< "Possible memory leak." << std::endl;
|
||||
TestPassed = false;
|
||||
}
|
||||
|
||||
if(!concurOnOneGPU && (prevAvl != curAvl || prevTot != curTot))
|
||||
{
|
||||
// In concurrent calls on one GPU, we cannot verify leaking in this way
|
||||
printf("%s : Memory allocation mismatch observed."
|
||||
"Possible memory leak.\n",
|
||||
__func__);
|
||||
TestPassed = false;
|
||||
}
|
||||
|
||||
return TestPassed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parallel execution of parent and child on gpu0
|
||||
*/
|
||||
void
|
||||
Unit_hipMalloc_ChildConcurrencyDefaultGpu()
|
||||
{
|
||||
int pid = 0;
|
||||
constexpr auto resSuccess = 0, resFailure = 1;
|
||||
|
||||
if((pid = fork()) < 0)
|
||||
{
|
||||
std::cout << "Child_Concurrency_DefaultGpu : fork() returned error : " << pid
|
||||
<< std::endl;
|
||||
REQUIRE(false);
|
||||
}
|
||||
else if(!pid)
|
||||
{ // Child process
|
||||
bool TestPassedChild = false;
|
||||
|
||||
// Allocates and validates memory on Gpu0 simultaneously with parent
|
||||
TestPassedChild = validateMemoryOnGPU(0, true);
|
||||
|
||||
if(TestPassedChild)
|
||||
{
|
||||
exit(resSuccess); // child exit with success status
|
||||
}
|
||||
else
|
||||
{
|
||||
exit(resFailure); // child exit with failure status
|
||||
}
|
||||
}
|
||||
else
|
||||
{ // Parent process
|
||||
int exitStatus;
|
||||
|
||||
// Allocates and validates memory on Gpu0 simultaneously with child
|
||||
bool TestPassed = validateMemoryOnGPU(0, true);
|
||||
|
||||
// Wait and get result from child
|
||||
pid = wait(&exitStatus);
|
||||
if((WEXITSTATUS(exitStatus) == resFailure) || (pid < 0)) TestPassed = false;
|
||||
|
||||
// Explicitly use the variable to avoid compiler warning
|
||||
(void) TestPassed;
|
||||
REQUIRE(TestPassed == true);
|
||||
}
|
||||
}
|
||||
|
||||
int
|
||||
main()
|
||||
{
|
||||
Unit_hipMalloc_ChildConcurrencyDefaultGpu();
|
||||
std::cout << "Unit_hipMalloc_ChildConcurrencyDefaultGpu PASSED!" << std::endl;
|
||||
return 0;
|
||||
}
|
||||
@@ -1292,6 +1292,35 @@ device_count()
|
||||
{
|
||||
return gpu::device_count();
|
||||
}
|
||||
|
||||
void
|
||||
postfork_child_cleanup()
|
||||
{
|
||||
// In child process, disable AMD SMI to prevent shutdown errors
|
||||
ROCPROFSYS_VERBOSE_F(2, "Disabling AMD SMI in child process after fork...\n");
|
||||
|
||||
// Set to Finalized to prevent any sampling attempts (though is_child_process() check
|
||||
// in sample() already handles this)
|
||||
get_state().store(State::Finalized);
|
||||
|
||||
// Mark as not initialized so shutdown won't try to cleanup AMD SMI library
|
||||
is_initialized() = false;
|
||||
|
||||
// Clear device list to prevent any GPU operations
|
||||
data::device_list.clear();
|
||||
}
|
||||
|
||||
void
|
||||
postfork_parent_reinit()
|
||||
{
|
||||
// In parent process, AMD SMI device handles may be corrupted after fork
|
||||
// Reinitialize AMD SMI to get fresh handles
|
||||
ROCPROFSYS_VERBOSE_F(2, "Reinitializing AMD SMI in parent process after fork...\n");
|
||||
|
||||
// Shutdown and reinitialize to get fresh device handles
|
||||
shutdown();
|
||||
setup();
|
||||
}
|
||||
} // namespace amd_smi
|
||||
} // namespace rocprofsys
|
||||
|
||||
|
||||
@@ -71,6 +71,14 @@ post_process();
|
||||
|
||||
void set_state(State);
|
||||
|
||||
// Fork handling - cleanup AMD SMI state in child process
|
||||
void
|
||||
postfork_child_cleanup();
|
||||
|
||||
// Fork handling - reinitialize AMD SMI state in parent process
|
||||
void
|
||||
postfork_parent_reinit();
|
||||
|
||||
struct settings
|
||||
{
|
||||
bool busy = true;
|
||||
@@ -133,6 +141,7 @@ private:
|
||||
friend void rocprofsys::amd_smi::sample();
|
||||
friend void rocprofsys::amd_smi::shutdown();
|
||||
friend void rocprofsys::amd_smi::post_process();
|
||||
friend void rocprofsys::amd_smi::postfork_child_cleanup();
|
||||
|
||||
static size_t device_count;
|
||||
static std::set<uint32_t> device_list;
|
||||
@@ -168,6 +177,14 @@ post_process()
|
||||
inline void
|
||||
set_state(State)
|
||||
{}
|
||||
|
||||
inline void
|
||||
postfork_child_cleanup()
|
||||
{}
|
||||
|
||||
inline void
|
||||
postfork_parent_reinit()
|
||||
{}
|
||||
#endif
|
||||
} // namespace amd_smi
|
||||
} // namespace rocprofsys
|
||||
|
||||
+1
-1
@@ -60,7 +60,7 @@ invoke_exit_gotcha(const exit_gotcha::gotcha_data& _data, FuncT _func, Args... _
|
||||
{
|
||||
threading::clear_callbacks();
|
||||
|
||||
if(get_state() < State::Finalized)
|
||||
if(get_state() < State::Finalized && !is_child_process())
|
||||
{
|
||||
if(config::settings_are_configured())
|
||||
{
|
||||
|
||||
+10
@@ -97,6 +97,11 @@ postfork_parent()
|
||||
{
|
||||
if(postfork_parent_lock) return;
|
||||
|
||||
// Reinitialize AMD SMI in parent process to get fresh device handles before
|
||||
// unblocking the shutdown/setup transition. AMD SMI device handles may be corrupted
|
||||
// after fork.
|
||||
if(config::get_use_sampling()) sampling::postfork_parent_reinit();
|
||||
|
||||
rocprofsys::categories::enable_categories(config::get_enabled_categories());
|
||||
|
||||
if(config::get_use_sampling()) sampling::unblock_samples();
|
||||
@@ -115,6 +120,11 @@ postfork_child()
|
||||
<< "Error! child process " << process::get_id()
|
||||
<< " believes it is the root process " << get_root_process_id() << "\n";
|
||||
|
||||
set_state(State::Finalized);
|
||||
|
||||
// Clean up AMD SMI in child process before other shutdowns
|
||||
if(config::get_use_sampling()) sampling::postfork_child_cleanup();
|
||||
|
||||
settings::enabled() = false;
|
||||
settings::verbose() = -127;
|
||||
settings::debug() = false;
|
||||
|
||||
@@ -32,6 +32,7 @@
|
||||
#include "core/state.hpp"
|
||||
#include "core/trace_cache/cache_manager.hpp"
|
||||
#include "core/utility.hpp"
|
||||
#include "library/amd_smi.hpp"
|
||||
#include "library/components/backtrace.hpp"
|
||||
#include "library/components/backtrace_metrics.hpp"
|
||||
#include "library/components/backtrace_timestamp.hpp"
|
||||
@@ -1835,6 +1836,19 @@ struct sampling_initialization
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
void
|
||||
postfork_parent_reinit()
|
||||
{
|
||||
if(config::get_use_process_sampling() && config::get_use_amd_smi())
|
||||
amd_smi::postfork_parent_reinit();
|
||||
}
|
||||
|
||||
void
|
||||
postfork_child_cleanup()
|
||||
{
|
||||
if(config::get_use_process_sampling() && config::get_use_amd_smi())
|
||||
amd_smi::postfork_child_cleanup();
|
||||
}
|
||||
} // namespace sampling
|
||||
} // namespace rocprofsys
|
||||
|
||||
|
||||
@@ -65,5 +65,11 @@ void unblock_signals(std::set<int> = {});
|
||||
|
||||
void
|
||||
post_process();
|
||||
|
||||
void
|
||||
postfork_parent_reinit();
|
||||
|
||||
void
|
||||
postfork_child_cleanup();
|
||||
} // namespace sampling
|
||||
} // namespace rocprofsys
|
||||
|
||||
@@ -24,6 +24,7 @@
|
||||
#include "api.hpp"
|
||||
#include "core/utility.hpp"
|
||||
#include "library/components/pthread_create_gotcha.hpp"
|
||||
#include "library/runtime.hpp"
|
||||
#include "library/thread_info.hpp"
|
||||
|
||||
#include <timemory/backends/threading.hpp>
|
||||
@@ -44,9 +45,10 @@ thread_deleter<void>::operator()() const
|
||||
{
|
||||
auto _tid = _info->index_data->sequent_value;
|
||||
|
||||
component::pthread_create_gotcha::shutdown(_tid);
|
||||
if(!is_child_process()) component::pthread_create_gotcha::shutdown(_tid);
|
||||
set_thread_state(ThreadState::Completed);
|
||||
if(get_state() < State::Finalized && _tid == 0) rocprofsys_finalize_hidden();
|
||||
if(get_state() < State::Finalized && !is_child_process() && _tid == 0)
|
||||
rocprofsys_finalize_hidden();
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -40,3 +40,19 @@ rocprofiler_systems_add_test(
|
||||
RUNTIME_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})"
|
||||
REWRITE_RUN_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})"
|
||||
)
|
||||
|
||||
rocprofiler_systems_add_test(
|
||||
NAME fork-hipMallocConcurrency
|
||||
TARGET hipMallocConcurrencyMproc
|
||||
GPU ON
|
||||
REWRITE_ARGS -e -v 2 --print-instrumented modules -i 16
|
||||
RUNTIME_ARGS -e -v 1 --label file -i 16
|
||||
ENVIRONMENT
|
||||
"${_base_environment};ROCPROFSYS_SAMPLING_FREQ=250;ROCPROFSYS_SAMPLING_REALTIME=ON"
|
||||
SAMPLING_PASS_REGEX "Validation PASSED|fork.. called on PID"
|
||||
RUNTIME_PASS_REGEX "Validation PASSED|fork.. called on PID"
|
||||
REWRITE_RUN_PASS_REGEX "Validation PASSED|fork.. called on PID"
|
||||
SAMPLING_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})"
|
||||
RUNTIME_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})"
|
||||
REWRITE_RUN_FAIL_REGEX "(${ROCPROFSYS_ABORT_FAIL_REGEX})"
|
||||
)
|
||||
|
||||
مرجع در شماره جدید
Block a user