Adding RCCL_CLIQUE_DEBUG to help debug experimental clique feature (#300)

Этот коммит содержится в:
gilbertlee-amd
2020-11-13 09:07:11 -07:00
коммит произвёл GitHub
родитель 4e68229c8b
Коммит c8d08a7c2f
4 изменённых файлов: 47 добавлений и 6 удалений
+11 -1
Просмотреть файл
@@ -36,6 +36,7 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
size_t const N = args->clique.count; // Total number of elements to reduce
int const nBlocks = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
int const blockId = args->clique.bid; // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
int const verbose = args->clique.verbose; // For debug purposes
int const rank = args->comm->rank; // Current rank
// Each threadblock works independently of others on a subsection of the input
@@ -46,6 +47,10 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
size_t const currBlockStop = min(currBlockStart + perBlockN, N);
size_t const blockN = currBlockStop - currBlockStart;
if (verbose && threadIdx.x == 0)
{
printf("Rank %d block %d of %d %lu -> %lu [%lu]\n", rank, blockId, nBlocks, currBlockStart, currBlockStop, blockN);
}
if (blockN > 0)
{
// Prepare input / output subarrays
@@ -69,7 +74,12 @@ __device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
// Even if there was nothing for this GPU to do, it must participate in a barrier
// because other GPUs may be modifying this GPUs output buffer still
if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
if (blockId == 0)
{
if (verbose && threadIdx.x == 0) printf("Rank %d enters GPU barrier\n", rank);
WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier, rank, verbose);
if (verbose && threadIdx.x == 0) printf("Rank %d exits GPU barrier\n", rank);
}
}
#endif
+14 -2
Просмотреть файл
@@ -62,7 +62,7 @@ typedef struct
// Multi-GPU (on same node) barrier. One thread per grid per GPU updates barrier / waits
template <int NUM_RANKS>
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier, int const rank, int const verbose)
{
if (threadIdx.x == 0)
{
@@ -71,6 +71,7 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
int localSense = *barrier.localSense;
int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
if (verbose) printf("Rank %d arrived at GPU barrier %d\n", rank, val);
if (val == NUM_RANKS)
{
// Last arrival resets barrier
@@ -80,7 +81,18 @@ __forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
else
{
// Wait for all ranks to reach barrier
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
int counter = 0;
while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense)
{
if (verbose)
{
counter++;
if (counter == 100000000)
{
printf("Rank %d waiting on GPU barrier: (%d != %d)", rank, *barrier.globalSense, localSense);
}
}
}
}
}
}
+21 -3
Просмотреть файл
@@ -48,6 +48,7 @@ int* CliqueManager::m_staticGpuBarrierMem = NULL;
RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0); // Opt-in environment variable for clique-based kernels
RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152); // Max number of bytes to use clique-based kernels for all reduce
RCCL_PARAM(AllReduceNumChannels, "CLIQUE_ALLREDUCE_NCHANNELS", 4); // Number of channels to use for all-reduce
RCCL_PARAM(CliqueDebug, "CLIQUE_DEBUG", 0); // Emit debug messages
CliqueManager::CliqueManager(int const rank,
int const numRanks,
@@ -337,7 +338,7 @@ ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
// Prepare clique argments (NOTE: clique pointers are not ready yet)
int opIndex = args->opCount % NCCL_MAX_OPS;
args->clique.ptrs = &m_pinnedCliquePtrs[opIndex];
args->clique.verbose = rcclParamCliqueDebug();
// Determine number of channels to use for this collective
args->clique.nChannels = rcclParamAllReduceNumChannels();
@@ -477,16 +478,33 @@ ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, siz
void CliqueManager::WaitForBarrier()
{
int const verbose = rcclParamCliqueDebug();
// Sense inversion barrier
m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
int val = __sync_add_and_fetch(m_cpuBarrierGlobalCount, 1);
if (verbose) INFO(NCCL_INIT, "Rank %d reaches barrier at %d", m_rank, val);
if (val == m_numRanks)
{
// Reset the barrier
STORE(m_cpuBarrierGlobalCount, 0);
STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
} else {
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
size_t counter = 0;
while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense)
{
if (verbose)
{
counter++;
if (counter == 100000000)
{
WARN("Rank %d waiting in CPU barrier: (%d != %d)", m_rank, *m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
}
}
}
if (verbose) INFO(NCCL_INIT, "Rank %d leaves CPU barrier", m_rank);
}
}
+1
Просмотреть файл
@@ -201,6 +201,7 @@ struct CollectiveArgs {
uint8_t bid;
uint8_t nChannels;
size_t count;
int verbose;
cliqueDevicePtrs_t* ptrs;
} clique;
// [/RCCL]