Added Cooperative Groups configure option and header

[ROCm/rocshmem commit: 63667a3167]
Este commit está contenido en:
Yiltan Hassan Temucin
2024-10-01 13:26:43 -07:00
padre 45976b23ae
commit caa6d356c0
Se han modificado 5 ficheros con 60 adiciones y 0 borrados
+1
Ver fichero
@@ -68,6 +68,7 @@ option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF)
option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF)
option(USE_SINGLE_NODE "Enable single node support only." OFF)
option(USE_HOST_SIDE_HDP_FLUSH "Use a polling thread to flush the HDP cache on the host." OFF)
option(USE_COOPERATIVE_GROUPS "Use cooperative groups for internal syncronization" OFF)
option(BUILD_FUNCTIONAL_TESTS "Build the functional tests" ON)
option(BUILD_SOS_TESTS "Build the host-facing tests" OFF)
option(BUILD_UNIT_TESTS "Build the unit tests" ON)
+1
Ver fichero
@@ -14,3 +14,4 @@
#cmakedefine USE_FUNC_CALL
#cmakedefine USE_SINGLE_NODE
#cmakedefine USE_HOST_SIDE_HDP_FLUSH
#cmakedefine USE_COOPERATIVE_GROUPS
+31
Ver fichero
@@ -0,0 +1,31 @@
#!/bin/bash
# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
if [ -z $1 ]
then
install_path=~/rocshmem
else
install_path=$1
fi
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=$install_path \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GPU_IB=OFF \
-DUSE_RO=OFF \
-DUSE_DC=OFF \
-DUSE_IPC=ON \
-DUSE_COHERENT_HEAP=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_SINGLE_NODE=ON \
-DUSE_HOST_SIDE_HDP_FLUSH=OFF \
-DUSE_COOPERATIVE_GROUPS=ON \
$src_path
cmake --build . --parallel 8
cmake --install .
@@ -26,6 +26,11 @@
#include "../util.hpp"
#include "ipc_team.hpp"
#ifdef USE_COOPERATIVE_GROUPS
#include <hip/hip_cooperative_groups.h>
namespace cg = cooperative_groups;
#endif /* USE_COOPERATIVE_GROUPS */
namespace rocshmem {
__device__ void IPCContext::internal_direct_barrier(int pe, int PE_start,
@@ -84,8 +89,14 @@ __device__ void IPCContext::internal_atomic_barrier(int pe, int PE_start,
// Uses PE values that are relative to world
__device__ void IPCContext::internal_sync(int pe, int PE_start, int stride,
int PE_size, int64_t *pSync) {
#ifdef USE_COOPERATIVE_GROUPS
cg::grid_group grid = cg::this_grid();
grid.sync();
if (0 == grid.thread_rank()) {
#else
__syncthreads();
if (is_thread_zero_in_block()) {
#endif /* USE_COOPERATIVE_GROUPS */
if (PE_size < 64) {
internal_direct_barrier(pe, PE_start, stride, PE_size, pSync);
} else {
@@ -93,7 +104,11 @@ __device__ void IPCContext::internal_sync(int pe, int PE_start, int stride,
}
}
__threadfence();
#ifdef USE_COOPERATIVE_GROUPS
grid.sync();
#else
__syncthreads();
#endif /* USE_COOPERATIVE_GROUPS */
}
__device__ void IPCContext::sync(roc_shmem_team_t team) {
@@ -83,9 +83,21 @@ void SyncTester::launchKernel(dim3 gridSize, dim3 blockSize, int loop,
roc_shmem_team_split_strided(ROC_SHMEM_TEAM_WORLD, 0, 1, n_pes, nullptr, 0,
&team_sync_world_dup);
#ifdef USE_COOPERATIVE_GROUPS
void* kernelParams[] = {(void*)&loop,
(void*)&args.skip,
(void*)&timer,
(void*)&_type,
(void*)&_shmem_context,
(void*)&team_sync_world_dup};
CHECK_HIP(hipLaunchCooperativeKernel(SyncTest, gridSize, blockSize,
kernelParams, shared_bytes, stream));
#else
hipLaunchKernelGGL(SyncTest, gridSize, blockSize, shared_bytes, stream, loop,
args.skip, timer, _type, _shmem_context,
team_sync_world_dup);
#endif /* USE_COOPERATIVE_GROUPS */
num_msgs = (loop + args.skip) * gridSize.x;
num_timed_msgs = loop;