fix reduction test for gfx1201 (#374)
* fix reduction for gfx942 and gfx1201
Match the synchronization of internal_putmem_wg and internal_getmem_wg
to their non-internal counterparts. The internal_putmem_wg is used in
the ipc reduction.
* move specialization to internal_putmem
[ROCm/rocshmem commit: 8d2504d6c1]
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
cc727261de
Коммит
e38f98fad5
@@ -178,7 +178,6 @@ ExecTest() {
|
||||
fi
|
||||
|
||||
CMD+=" >> $LOG_DIR/$TEST_LOG_NAME.log 2>&1"
|
||||
|
||||
# Run Test
|
||||
if [ $NUM_GPUS -ge $NUM_RANKS ] || [[ "" != "$HOSTFILE" ]]; then
|
||||
echo $TEST_LOG_NAME
|
||||
@@ -452,7 +451,6 @@ TestColl() {
|
||||
|
||||
ExecTest "teambroadcast" 2 1 64 32768
|
||||
|
||||
ExecTest "fcollect" 2 1 64 512
|
||||
ExecTest "fcollect" 2 1 64 32768
|
||||
|
||||
ExecTest "teamreduction" 2 1 64 32768
|
||||
@@ -641,7 +639,6 @@ TestGDA() {
|
||||
|
||||
ExecTest "teambroadcast" 2 1 1 32768
|
||||
|
||||
ExecTest "fcollect" 2 1 1 512
|
||||
ExecTest "fcollect" 2 1 1 32768
|
||||
|
||||
# deadlock on gda, size 8KB
|
||||
|
||||
@@ -164,7 +164,13 @@ __device__ void IPCContext::internal_putmem(void *dest, const void *source,
|
||||
size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
|
||||
memcpy_lane(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#elif defined (__gfx1201__) || defined (__gfx1100__)
|
||||
fence(pe);
|
||||
#else
|
||||
ipcImpl_.ipcFence();
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void IPCContext::internal_getmem(void *dest, const void *source,
|
||||
@@ -180,6 +186,15 @@ __device__ void IPCContext::internal_putmem_wg(void *dest, const void *source,
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
|
||||
memcpy_wg(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
|
||||
__syncthreads();
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#elif defined (__gfx1201__) || defined (__gfx1100__)
|
||||
if (is_thread_zero_in_block() ) {
|
||||
fence(pe);
|
||||
}
|
||||
#else
|
||||
ipcImpl_.ipcFence();
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void IPCContext::internal_getmem_wg(void *dest, const void *source,
|
||||
@@ -188,13 +203,22 @@ __device__ void IPCContext::internal_getmem_wg(void *dest, const void *source,
|
||||
uint64_t L_offset = const_cast<char *>(src_typed) - wrk_sync_pool_bases_[my_pe];
|
||||
memcpy_wg(dest, wrk_sync_pool_bases_[pe] + L_offset, nelems);
|
||||
__syncthreads();
|
||||
ipcImpl_.ipcFence();
|
||||
}
|
||||
|
||||
__device__ void IPCContext::internal_putmem_wave(void *dest,
|
||||
const void *source, size_t nelems, int pe) {
|
||||
uint64_t L_offset = reinterpret_cast<char *>(dest) - wrk_sync_pool_bases_[my_pe];
|
||||
memcpy_wave(wrk_sync_pool_bases_[pe] + L_offset, const_cast<void *>(source), nelems);
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#elif defined (__gfx1201__) || defined (__gfx1100__)
|
||||
if (is_thread_zero_in_wave() ) {
|
||||
fence(pe);
|
||||
}
|
||||
#else
|
||||
ipcImpl_.ipcFence();
|
||||
#endif
|
||||
}
|
||||
|
||||
__device__ void IPCContext::internal_getmem_wave(void *dest,
|
||||
|
||||
@@ -217,7 +217,6 @@ __device__ void IPCContext::internal_direct_allreduce(
|
||||
threadfence_system();
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
for (int i = wg_id; i < num_pes; i += wg_size) {
|
||||
@@ -311,7 +310,7 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
|
||||
for (int seg = 0; seg < n_seg; seg++) {
|
||||
off_seg = seg * seg_size;
|
||||
// Loop 2 in the algorithm above
|
||||
// Loop 1 in the algorithm above
|
||||
for (int iter = 0; iter < PE_size - 1; iter++) {
|
||||
off_send = (((my_pe_in_team + 1 - iter + 2 * PE_size) % PE_size) * chunk_size);
|
||||
off_recv = (((my_pe_in_team - iter + 2 * PE_size) % PE_size) * chunk_size);
|
||||
@@ -322,12 +321,8 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
|
||||
if (is_thread_zero_in_block()) {
|
||||
fence();
|
||||
|
||||
wait_val = seg + 100;
|
||||
internal_putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe);
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
|
||||
}
|
||||
__syncthreads();
|
||||
@@ -344,21 +339,18 @@ __device__ void IPCContext::internal_ring_allreduce(
|
||||
|
||||
if (is_thread_zero_in_block()) {
|
||||
fence();
|
||||
wait_val = seg + 100;
|
||||
wait_val = seg + 10;
|
||||
internal_putmem(&pSync[iter], &wait_val, sizeof(*pSync), send_pe);
|
||||
#if defined(__gfx90a__)
|
||||
__threadfence_system();
|
||||
#endif /* __gfx90a__ */
|
||||
wait_until(&pSync[iter], ROCSHMEM_CMP_EQ, wait_val);
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
for (int i = wg_id; i < 2 * num_pes - 2; i += wg_size) {
|
||||
pSync[i] = ROCSHMEM_SYNC_VALUE;
|
||||
}
|
||||
threadfence_system();
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
@@ -400,7 +392,6 @@ __device__ int IPCContext::reduce(rocshmem_team_t team, T *dest,
|
||||
const T *p_src = (source + (n_seg * seg_size));
|
||||
int p_count = nreduce - (n_seg * seg_size);
|
||||
int p_chunk = p_count / PE_size;
|
||||
|
||||
internal_ring_allreduce<T, Op>(p_dst, p_src, p_count, team_obj, 1,
|
||||
(p_chunk * PE_size), p_chunk);
|
||||
|
||||
|
||||
Ссылка в новой задаче
Block a user