Adding RCCL_CLIQUE_DEBUG to help debug experimental clique feature (#300)
Этот коммит содержится в:
коммит произвёл
GitHub
родитель
4e68229c8b
Коммит
c8d08a7c2f
@@ -36,6 +36,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
size_t const N = args->clique.count; // Total number of elements to reduce
|
||||
int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
|
||||
int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
|
||||
int const verbose = args->clique.verbose; // For debug purposes
|
||||
int const rank = args->comm->rank; // Current rank
|
||||
|
||||
// Each threadblock works independently of others on a subsection of the input
|
||||
@@ -46,6 +47,10 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
size_t const currBlockStop = min(currBlockStart + perBlockN, N);
|
||||
size_t const blockN = currBlockStop - currBlockStart;
|
||||
|
||||
if (verbose && threadIdx.x == 0)
|
||||
{
|
||||
printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN);
|
||||
}
|
||||
if (blockN > 0)
|
||||
{
|
||||
// Prepare input / output subarrays
|
||||
@@ -69,7 +74,12 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
|
||||
|
||||
// Even if there was nothing for this GPU to do, it must participate in a barrier
|
||||
// because other GPUs may be modifying this GPUs output buffer still
|
||||
if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
|
||||
if (blockId == 0)
|
||||
{
|
||||
if (verbose && threadIdx.x == 0) printf("Rank %d enters GPU barrier\n", rank);
|
||||
WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier, rank, verbose);
|
||||
if (verbose && threadIdx.x == 0) printf("Rank %d exits GPU barrier\n", rank);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@ typedef struct
|
||||
|
||||
// Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits
|
||||
template <int NUM_RANKS>
|
||||
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
|
||||
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose)
|
||||
{
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
@@ -71,6 +71,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
|
||||
int localSense = *barrier.localSense;
|
||||
|
||||
int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
|
||||
if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val);
|
||||
if (val == NUM_RANKS)
|
||||
{
|
||||
// Last arrival resets barrier
|
||||
@@ -80,7 +81,18 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
|
||||
else
|
||||
{
|
||||
// Wait for all ranks to reach barrier
|
||||
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
|
||||
int counter = 0;
|
||||
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense)
|
||||
{
|
||||
if (verbose)
|
||||
{
|
||||
counter++;
|
||||
if (counter == 100000000)
|
||||
{
|
||||
printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,6 +48,7 @@ int* CliqueManager::m_staticGpuBarrierMem = NULL;
|
||||
RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels
|
||||
RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152); // Max number of bytes to use clique-based kernels for all reduce
|
||||
RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 4); // Number of channels to use for all-reduce
|
||||
RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0); // Emit debug messages
|
||||
|
||||
CliqueManager::CliqueManager(int const rank,
|
||||
int const numRanks,
|
||||
@@ -337,7 +338,7 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
|
||||
// Prepare clique argments (NOTE: clique pointers are not ready yet)
|
||||
int opIndex = args->opCount % NCCL_MAX_OPS;
|
||||
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
|
||||
|
||||
args->clique.verbose = rcclParamCliqueDebug();
|
||||
|
||||
// Determine number of channels to use for this collective
|
||||
args->clique.nChannels = rcclParamAllReduceNumChannels();
|
||||
@@ -477,16 +478,33 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
|
||||
|
||||
void CliqueManager::WaitForBarrier()
|
||||
{
|
||||
int const verbose = rcclParamCliqueDebug();
|
||||
|
||||
// Sense inversion barrier
|
||||
m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
|
||||
|
||||
if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
|
||||
int val = __sync_add_and_fetch(m_cpuBarrierGlobalCount, 1);
|
||||
if (verbose) INFO(NCCL_INIT, "Rank %d reaches barrier at %d", m_rank, val);
|
||||
|
||||
if (val == m_numRanks)
|
||||
{
|
||||
// Reset the barrier
|
||||
STORE(m_cpuBarrierGlobalCount, 0);
|
||||
STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
|
||||
} else {
|
||||
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
|
||||
size_t counter = 0;
|
||||
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense)
|
||||
{
|
||||
if (verbose)
|
||||
{
|
||||
counter++;
|
||||
if (counter == 100000000)
|
||||
{
|
||||
WARN("Rank %d waiting in CPU barrier: (%d != %d)", m_rank, *m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
|
||||
}
|
||||
}
|
||||
}
|
||||
if (verbose) INFO(NCCL_INIT, "Rank %d leaves CPU barrier", m_rank);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -201,6 +201,7 @@ struct CollectiveArgs {
|
||||
uint8_t bid;
|
||||
uint8_t nChannels;
|
||||
size_t count;
|
||||
int verbose;
|
||||
cliqueDevicePtrs_t* ptrs;
|
||||
} clique;
|
||||
// [/RCCL]
|
||||
|
||||
Ссылка в новой задаче
Block a user