Merge remote-tracking branch 'nccl-tests/master' into topic/v2.13.4-sync

This commit is contained in:
Edgar Gabriel
2022-10-14 16:02:54 -05:00
21 changed files with 1925 additions and 672 deletions
+5 -2
View File
@@ -4,6 +4,9 @@
# See LICENCE.txt for license information
#
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY : all clean
default : src.build
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
+13 -4
View File
@@ -1,6 +1,6 @@
#
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -65,13 +65,22 @@ build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
${DST_DIR}/%.o: %.cu common.h
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
+12 -24
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,24 +8,15 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
#define ALIGN 4
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = count/nranks;
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
*recvcount = base*nranks;
*sendInplaceOffset = base;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
*paramcount = base;
}
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
+4 -19
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k = 0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
+7 -22
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
@@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
int k=0;
for (int i=0; i<args->nGpus; i++) {
char* str = getenv("NCCL_TESTS_DEVICE");
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
size_t partcount = sendcount/nranks;
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
+6 -21
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root));
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
@@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
+258 -401
View File
@@ -1,7 +1,7 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -11,11 +11,14 @@
#include "common.h"
#include <pthread.h>
#include <cstdio>
#include <type_traits>
#include <getopt.h>
#include <libgen.h>
//#define DEBUG_PRINT
#include "../verifiable/verifiable.h"
int test_ncclVersion = 0; // init'd with ncclGetVersion()
#if NCCL_MAJOR >= 2
@@ -54,6 +57,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"};
// For libnccl's < 2.13
extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) {
return "";
}
int is_main_proc = 0;
thread_local int is_main_thread = 0;
// Command line parameter defaults
@@ -75,7 +84,10 @@ static int blocking_coll = 0;
static int memorytype = 0;
static int stress_cycles = 1;
static uint32_t cumask[4];
static int streamnull = 0;
static int timeout = 0;
static int cudaGraphLaunches = 0;
static int report_cputime = 0;
// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
static int average = 1;
static int numDevices = 1;
@@ -152,374 +164,164 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
return true;
}
double DeltaMaxValue(ncclDataType_t type) {
switch(type) {
case ncclHalf: return 1e-2;
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
case ncclBfloat16: return 1e-2;
#endif
case ncclFloat: return 1e-5;
case ncclDouble: return 1e-12;
case ncclInt:
#if NCCL_MAJOR >= 2
case ncclUint8:
//case ncclInt32:
case ncclUint32:
#endif
case ncclInt64:
case ncclUint64: return 1e-200;
}
return 1e-200;
}
template<typename T> __device__
double absDiff(T a, T b) {
return fabs((double)(b - a));
}
template<> __device__
double absDiff<half>(half a, half b) {
float x = __half2float(a);
float y = __half2float(b);
return fabs((double)(y-x));
}
template<typename T> __device__
float toFloat(T a) {
return (float)a;
}
template<> __device__
float toFloat(half a) {
return __half2float(a);
}
#if defined(RCCL_BFLOAT16)
template<> __device__
float toFloat(rccl_bfloat16 a) {
return (float)(a);
}
#endif
template<typename T, int BSIZE> __global__
void deltaKern(void* A_, void* B_, size_t count, double* max) {
const T* A = (const T*)A_;
const T* B = (const T*)B_;
__shared__ double temp[BSIZE];
int tid = blockIdx.x*blockDim.x + threadIdx.x;
double locmax = 0.0;
for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
double delta = absDiff(A[i], B[i]);
if( delta > locmax ) {
locmax = delta;
#ifdef DEBUG_PRINT
if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
#endif
}
}
tid = threadIdx.x;
temp[tid] = locmax;
for(int stride = BSIZE/2; stride > 1; stride>>=1) {
__syncthreads();
if( tid < stride )
temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
}
__syncthreads();
if( threadIdx.x == 0)
max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
}
testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
switch (type) {
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
case ncclBfloat16:
hipLaunchKernelGGL((deltaKern<rccl_bfloat16, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
#endif
case ncclHalf:
hipLaunchKernelGGL((deltaKern<half, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
case ncclFloat:
hipLaunchKernelGGL((deltaKern<float, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
case ncclDouble:
hipLaunchKernelGGL((deltaKern<double, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
case ncclChar:
#if NCCL_MAJOR >= 2
case ncclUint8:
#endif
hipLaunchKernelGGL((deltaKern<uint8_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
case ncclInt:
#if NCCL_MAJOR >= 2
case ncclUint32:
#endif
hipLaunchKernelGGL((deltaKern<uint32_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
case ncclInt64:
case ncclUint64:
hipLaunchKernelGGL((deltaKern<uint64_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
}
HIPCHECK(hipDeviceSynchronize());
for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
CUDACHECK(cudaDeviceSynchronize());
return testSuccess;
}
// For integer values, we use values between 0 and 255
template<typename T>
__device__ T testValue(const size_t offset, const int rep, const int rank) {
uint8_t v = (rep+rank+offset) % 256;
return (T)v;
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
return testSuccess;
}
// For floating point datatype, we use values between 0 and 1 otherwise the
// Product operation will produce NaNs.
template<>
__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
}
template<>
__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
}
template<>
__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
return __float2half(testValue<float>(offset, rep, rank));
}
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
template<>
__device__ rccl_bfloat16 testValue<rccl_bfloat16>(const size_t offset, const int rep, const int rank) {
return rccl_bfloat16(testValue<float>(offset, rep, rank));
}
#endif
// Operations
template<typename T>
__device__ T ncclOpSum(T a, T b) { return a+b; }
template<typename T>
__device__ T ncclOpProd(T a, T b) { return a*b; }
template<typename T>
__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
template<typename T>
__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
// Definitions for half
template<>
__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
template<>
__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
template<>
__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
template<>
__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
template<typename T>
__device__ T ncclPPOpIdent(T x, int arg) { return x; }
template<typename T>
__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
template<typename T>
__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
template<>
__device__ half ncclPPOpMul(half x, int arg) {
return __float2half(__half2float(x)*float(arg));
}
template<>
__device__ half ncclPPOpDiv(half x, int n) {
return __float2half(__half2float(x)/n);
}
#if RCCL_BFLOAT16 == 1
template<>
__device__ rccl_bfloat16 ncclPPOpMul(rccl_bfloat16 x, int arg) {
return (rccl_bfloat16)((float)(x)*float(arg));
}
template<>
__device__ rccl_bfloat16 ncclPPOpDiv(rccl_bfloat16 x, int n) {
return (rccl_bfloat16)((float)(x)/(float)(n));;
}
#endif
__host__ __device__ int preMulScalar(int rank) {
return 1 + rank%2;
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
return testSuccess;
}
template<typename T, T (*Op)(T, T), T(*PreOp)(T,int), T(*PostOp)(T,int)>
__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
T val = testValue<T>(o+offset, rep, 0);
val = PreOp(val, preMulScalar(0));
for (int i=1; i<nranks; i++) {
T val1 = testValue<T>(o+offset, rep, i);
val1 = PreOp(val1, preMulScalar(i));
val = Op(val, val1);
}
data[o] = PostOp(val, nranks);
void Barrier(struct threadArgs *args) {
thread_local int epoch = 0;
static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
static int counter[2] = {0, 0};
pthread_mutex_lock(&lock[epoch]);
if(++counter[epoch] == args->nThreads)
pthread_cond_broadcast(&cond[epoch]);
if(args->thread+1 == args->nThreads) {
while(counter[epoch] != args->nThreads)
pthread_cond_wait(&cond[epoch], &lock[epoch]);
#ifdef MPI_SUPPORT
MPI_Barrier(MPI_COMM_WORLD);
#endif
counter[epoch] = 0;
pthread_cond_broadcast(&cond[epoch]);
}
else {
while(counter[epoch] != 0)
pthread_cond_wait(&cond[epoch], &lock[epoch]);
}
pthread_mutex_unlock(&lock[epoch]);
epoch ^= 1;
}
#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
#define OPS(type) \
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
#define OPS(type) \
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
#else
#define OPS(type) \
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
#endif
static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
OPS(rccl_bfloat16)
#endif
};
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
dim3 grid = { 32, 1, 1 };
dim3 block = { 256, 1, 1 };
void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
HIPCHECK(hipLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, hipStreamDefault));
return testSuccess;
}
// Inter-thread/process barrier+allreduce. The quality of the return value
// for average=0 (which means broadcast from rank=0) is dubious. The returned
// value will actually be the result of process-local broadcast from the local thread=0.
template<typename T>
__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x)
data[o] = testValue<T>(o, rep, rank);
}
void Allreduce(struct threadArgs* args, T* value, int average) {
thread_local int epoch = 0;
static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
static T accumulator[2];
static int counter[2] = {0, 0};
static void* const initDataKerns[ncclNumTypes] = {
(void*)InitDataKernel< int8_t>,
(void*)InitDataKernel< uint8_t>,
(void*)InitDataKernel< int32_t>,
(void*)InitDataKernel<uint32_t>,
(void*)InitDataKernel< int64_t>,
(void*)InitDataKernel<uint64_t>,
(void*)InitDataKernel< half>,
(void*)InitDataKernel< float>,
(void*)InitDataKernel< double>,
#if RCCL_BFLOAT16 == 1 && NCCL_MAJOR >= 2
(void*)InitDataKernel<rccl_bfloat16>
#endif
};
template<typename T>
testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) {
T* ptr = (T*)dest;
hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank);
return testSuccess;
}
testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) {
dim3 grid = { 32, 1, 1 };
dim3 block = { 256, 1, 1 };
void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank };
HIPCHECK(hipLaunchKernel(initDataKerns[type], grid, block, args, 0, hipStreamDefault));
return testSuccess;
}
void Barrier(struct threadArgs* args) {
while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
args->barrier[args->barrier_idx] = args->thread + 1;
if (args->thread+1 == args->nThreads) {
#ifdef MPI_SUPPORT
MPI_Barrier(MPI_COMM_WORLD);
#endif
args->barrier[args->barrier_idx] = 0;
pthread_mutex_lock(&lock[epoch]);
if(counter[epoch] == 0) {
if(average != 0 || args->thread == 0) accumulator[epoch] = *value;
} else {
while (args->barrier[args->barrier_idx]) pthread_yield();
}
args->barrier_idx=!args->barrier_idx;
}
// Inter-thread/process barrier+allreduce
void Allreduce(struct threadArgs* args, double* value, int average) {
while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
double val = *value;
if (args->thread > 0) {
double val2 = args->reduce[args->barrier_idx];
if (average == 1) val += val2;
if (average == 2) val = std::min(val, val2);
if (average == 3) val = std::max(val, val2);
}
if (average || args->thread == 0) args->reduce[args->barrier_idx] = val;
args->barrier[args->barrier_idx] = args->thread + 1;
if (args->thread+1 == args->nThreads) {
#ifdef MPI_SUPPORT
if (average != 0) {
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD);
switch(average) {
case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break;
case /*avg*/1: accumulator[epoch] += *value; break;
case /*min*/2: accumulator[epoch] = std::min<T>(accumulator[epoch], *value); break;
case /*max*/3: accumulator[epoch] = std::max<T>(accumulator[epoch], *value); break;
case /*sum*/4: accumulator[epoch] += *value; break;
}
#endif
if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads;
args->reduce[1-args->barrier_idx] = 0;
args->barrier[args->barrier_idx] = 0;
} else {
while (args->barrier[args->barrier_idx]) pthread_yield();
}
*value = args->reduce[args->barrier_idx];
args->barrier_idx=!args->barrier_idx;
if(++counter[epoch] == args->nThreads)
pthread_cond_broadcast(&cond[epoch]);
if(args->thread+1 == args->nThreads) {
while(counter[epoch] != args->nThreads)
pthread_cond_wait(&cond[epoch], &lock[epoch]);
#ifdef MPI_SUPPORT
if(average != 0) {
static_assert(std::is_same<T, long long>::value || std::is_same<T, double>::value, "Allreduce<T> only for T in {long long, double}");
MPI_Datatype ty = std::is_same<T, long long>::value ? MPI_LONG_LONG :
std::is_same<T, double>::value ? MPI_DOUBLE :
MPI_Datatype();
MPI_Op op = average == 1 ? MPI_SUM :
average == 2 ? MPI_MIN :
average == 3 ? MPI_MAX :
average == 4 ? MPI_SUM : MPI_Op();
MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD);
}
#endif
if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
counter[epoch] = 0;
pthread_cond_broadcast(&cond[epoch]);
}
else {
while(counter[epoch] != 0)
pthread_cond_wait(&cond[epoch], &lock[epoch]);
}
pthread_mutex_unlock(&lock[epoch]);
*value = accumulator[epoch];
epoch ^= 1;
}
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) {
int nranks = args->nProcs*args->nGpus*args->nThreads;
size_t count = args->expectedBytes/wordSize(type);
double maxDelta = 0.0;
int64_t *wrongPerGpu = nullptr;
CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
for (int i=0; i<args->nGpus*args->nRanks; i++) {
int device;
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
HIPCHECK(hipSetDevice(device));
void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
maxDelta = std::max(*(args->deltaHost), maxDelta);
#ifdef DEBUG_PRINT
//if (rank == 0) {
int *expectedHost = (int *)malloc(args->expectedBytes);
int *dataHost = (int *)malloc(args->expectedBytes);
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost);
#if 1 && DEBUG_PRINT
if (args->reportErrors && wrongPerGpu[i] != 0) {
printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
char *expectedHost = (char*)malloc(args->expectedBytes);
char *dataHost = (char*)malloc(args->expectedBytes);
int eltsz = wordSize(type);
hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
int j, k, l;
for (j=0; j<args->expectedBytes/sizeof(int); j++)
if (expectedHost[j] != dataHost[j]) break;
k = j;
for (; j<args->expectedBytes/sizeof(int); j++)
if (expectedHost[j] == dataHost[j]) break;
l = j;
printf("\n Rank [%d] Expected: ", rank);
for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
printf("%d:%d ", j, expectedHost[j]);
for(int j=0; j<args->expectedBytes/eltsz; j++) {
unsigned long long want, got;
want = 0;
memcpy(&want, expectedHost + j*eltsz, eltsz);
got = 0;
memcpy(&got, dataHost + j*eltsz, eltsz);
if(want != got) {
printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got);
}
}
printf("\n Rank [%d] Actual : ", rank);
for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
printf("%d:%d ", j, dataHost[j]);
}
printf("\n");
free(expectedHost);
free(dataHost);
//}
}
#endif
}
double nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
*delta = maxDelta;
*wrongElts = 0;
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
hipFree(wrongPerGpu);
if (args->reportErrors && *wrongElts) args->errors[0]++;
return testSuccess;
}
testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) {
hipError_t hipErr;
int remaining = nStreams;
int* done = (int*)malloc(sizeof(int)*nStreams);
memset(done, 0, sizeof(int)*nStreams);
timer tim;
while (remaining) {
int idle = 1;
for (int i=0; i<nStreams; i++) {
@@ -548,11 +350,24 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
NCCLCHECK(ncclAsyncErr);
}
}
double delta = tim.elapsed();
if (delta > timeout && timeout > 0) {
for (int i=0; i<ngpus; i++)
NCCLCHECK(ncclCommAbort(comms[i]));
char hostname[1024];
getHostName(hostname, 1024);
printf("%s: Test timeout (%ds) %s:%d\n",
hostname,
timeout,
__FILE__,__LINE__);
free(done);
return testTimeout;
}
#endif
}
// We might want to let other threads (including NCCL threads) use the CPU.
if (idle) pthread_yield();
if (idle) sched_yield();
}
free(done);
return testSuccess;
@@ -572,6 +387,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
int hipDev;
NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
HIPCHECK(hipSetDevice(hipDev));
//CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
#endif
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
@@ -590,19 +406,18 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
rccl_bfloat16 bf16;
#endif
};
int scalar = preMulScalar(rank);
switch(type) {
case ncclInt8: i8 = int8_t(scalar); break;
case ncclUint8: u8 = uint8_t(scalar); break;
case ncclInt32: i32 = int32_t(scalar); break;
case ncclUint32: u32 = uint32_t(scalar); break;
case ncclInt64: i64 = int32_t(scalar); break;
case ncclUint64: u64 = uint32_t(scalar); break;
case ncclFloat16: f16 = __float2half(float(scalar)); break;
case ncclFloat32: f32 = float(scalar); break;
case ncclFloat64: f64 = double(scalar); break;
case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
case ncclUint8: u8 = ncclVerifiablePremulScalar<uint8_t>(rank); break;
case ncclInt32: i32 = ncclVerifiablePremulScalar<int32_t>(rank); break;
case ncclUint32: u32 = ncclVerifiablePremulScalar<uint32_t>(rank); break;
case ncclInt64: i64 = ncclVerifiablePremulScalar<int64_t>(rank); break;
case ncclUint64: u64 = ncclVerifiablePremulScalar<uint64_t>(rank); break;
case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
#if defined(RCCL_BFLOAT16)
case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); break;
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
#endif
}
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
@@ -657,16 +472,17 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
if (cudaGraphLaunches >= 1) {
// Begin cuda graph capture
for (int i=0; i<args->nGpus*args->nRanks; i++) {
// Thread local mode is needed for:
// - Multi-thread mode
// - P2P pre-connect
// Thread local mdoe is needed for:
// - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
// - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
// Since pre-connect calls cudaMalloc, we cannot use global capture mode
HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
}
}
#endif
// Performance Benchmark
auto start = std::chrono::high_resolution_clock::now();
timer tim;
for (int iter = 0; iter < iters; iter++) {
if (agg_iters>1) NCCLCHECK(ncclGroupStart());
for (int aiter = 0; aiter < agg_iters; aiter++) {
@@ -687,7 +503,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
}
// Resync CPU, restart timing, launch cuda graph
Barrier(args);
start = std::chrono::high_resolution_clock::now();
tim.reset();
for (int l=0; l<cudaGraphLaunches; l++) {
for (int i=0; i<args->nGpus*args->nRanks; i++) {
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
@@ -696,10 +512,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
}
#endif
double cputimeSec = tim.elapsed()/(iters*agg_iters);
TESTCHECK(completeColl(args));
auto delta = std::chrono::high_resolution_clock::now() - start;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
double deltaSec = tim.elapsed();
deltaSec = deltaSec/(iters*agg_iters);
if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
Allreduce(args, &deltaSec, average);
@@ -719,8 +535,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
Barrier(args);
double maxDelta = 0;
bool error = false;
int64_t wrongElts = 0;
static __thread int rep = 0;
rep++;
if (datacheck) {
@@ -768,13 +583,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
}
#endif
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts));
//aggregate delta from all threads and procs
Allreduce(args, &maxDelta, 3);
long long wrongElts1 = wrongElts;
Allreduce(args, &wrongElts1, /*sum*/4);
wrongElts = wrongElts1;
}
double timeUsec = deltaSec*1.0E6;
double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
char timeStr[100];
if (timeUsec >= 10000.0) {
sprintf(timeStr, "%7.0f", timeUsec);
@@ -783,10 +600,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
} else {
sprintf(timeStr, "%7.2f", timeUsec);
}
if (datacheck) {
PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : "");
if (args->reportErrors) {
PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts);
} else {
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
}
args->bw[0] += busBw;
@@ -809,6 +626,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
}
testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
// Sync to avoid first-call timeout
Barrier(args);
// Warm-up for large size
setupArgs(args->maxbytes, type, args);
for (int iter = 0; iter < warmup_iters; iter++) {
@@ -855,7 +675,7 @@ testResult_t threadInit(struct threadArgs* args) {
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
//set main thread again
is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;
NCCLCHECK(ncclGroupStart());
for (int i=0; i<args->nGpus; i++) {
@@ -863,6 +683,7 @@ testResult_t threadInit(struct threadArgs* args) {
if (enable_multiranks)
gpuid = gpuid % numDevices;
HIPCHECK(hipSetDevice(gpuid));
//CUDACHECK(cudaSetDevice(args->gpus[i]));
for (int j=0; j<args->nRanks; j++) {
int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
@@ -968,10 +789,13 @@ int main(int argc, char* argv[]) {
{"datatype", required_argument, 0, 'd'},
{"root", required_argument, 0, 'r'},
{"blocking", required_argument, 0, 'z'},
{"memory_type", required_argument, 0, 'y'},
{"stress_cycles", required_argument, 0, 's'},
{"cumask", required_argument, 0, 'u'},
{"memory_type", required_argument, 0, 'y'}, //RCCL
{"stress_cycles", required_argument, 0, 's'}, //RCCL
{"cumask", required_argument, 0, 'u'}, //RCCL
{"stream_null", required_argument, 0, 'y'}, //NCCL
{"timeout", required_argument, 0, 'T'}, //NCCL
{"cudagraph", required_argument, 0, 'G'},
{"report_cputime", required_argument, 0, 'C'},
{"average", required_argument, 0, 'a'},
#ifdef RCCL_MULTIRANKPERGPU
{"enable_multiranks", required_argument, 0, 'x'},
@@ -983,10 +807,12 @@ int main(int argc, char* argv[]) {
while(1) {
int c;
#ifdef RCCL_MULTIRANKPERGPU
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex);
// EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
// also not sure about G vs. hG (we had G, they have hG)
#ifdef RCCL_MULTIRANKPERGPU
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
#else
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex);
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
#endif
if (c == -1)
@@ -1067,6 +893,12 @@ int main(int argc, char* argv[]) {
mask = strtok(NULL, ",");
};
}
break;
case 'y':
streamnull = strtol(optarg, NULL, 0);
break;
case 'T':
timeout = strtol(optarg, NULL, 0);
break;
case 'G':
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310
@@ -1075,6 +907,9 @@ int main(int argc, char* argv[]) {
printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n");
#endif
break;
case 'C':
report_cputime = strtol(optarg, NULL, 0);
break;
case 'a':
average = (int)strtol(optarg, NULL, 0);
break;
@@ -1114,15 +949,18 @@ int main(int argc, char* argv[]) {
"[-y,--memory_type <coarse/fine/host/managed>] \n\t"
"[-s,--stress_cycles <number of cycles>] \n\t"
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
"[-y,--stream_null <0/1>] \n\t"
"[-T,--timeout <time in seconds>] \n\t"
"[-G,--cudagraph <num graph launches>] \n\t"
"[-C,--report_cputime <0/1>] \n\t"
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
#ifdef RCCL_MULTIRANKPERGPU
"[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
"[-R,--ranks_per_gpu] \n\t"
#endif
"[-h,--help]\n",
basename(argv[0]));
return 0;
basename(argv[0]));
return 0;
}
}
@@ -1161,26 +999,36 @@ int main(int argc, char* argv[]) {
}
testResult_t run() {
int nProcs = 1, proc = 0;
int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
int localRank = 0;
char hostname[1024];
getHostName(hostname, 1024);
#ifdef MPI_SUPPORT
MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
MPI_Comm_rank(MPI_COMM_WORLD, &proc);
uint64_t hostHashs[nProcs];
uint64_t hostHashs[totalProcs];
hostHashs[proc] = getHostHash(hostname);
MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
for (int p=0; p<nProcs; p++) {
for (int p=0; p<totalProcs; p++) {
if (p == proc) break;
if (hostHashs[p] == hostHashs[proc]) localRank++;
}
#endif
is_main_thread = (proc == 0) ? 1 : 0;
PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, ranksPerGpu, minBytes, maxBytes,
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
char* str = getenv("NCCL_TESTS_SPLIT_MASK");
uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
MPI_Comm mpi_comm;
color = proc & mask;
MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
MPI_Comm_size(mpi_comm, &ncclProcs);
MPI_Comm_rank(mpi_comm, &ncclProc);
#endif
is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
nThreads, nGpus, ranksPerGpu, minBytes, maxBytes,
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
PRINT("#\n");
@@ -1190,6 +1038,8 @@ testResult_t run() {
char line[MAX_LINE];
int len = 0;
size_t maxMem = ~0;
char* envstr = getenv("NCCL_TESTS_DEVICE");
int gpu0 = envstr ? atoi(envstr) : -1;
for (int i=0; i<nThreads*nGpus; i++) {
int hipDev = localRank*nThreads*nGpus+i;
if (enable_multiranks)
@@ -1207,11 +1057,11 @@ testResult_t run() {
}
}
#if MPI_SUPPORT
char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
// Gather all output in rank order to root (0)
MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
if (proc == 0) {
for (int p = 0; p < nProcs; p++)
for (int p = 0; p < totalProcs; p++)
PRINT("%s", lines+MAX_LINE*p);
free(lines);
}
@@ -1228,13 +1078,14 @@ testResult_t run() {
}
ncclUniqueId ncclId;
if (proc == 0) {
if (ncclProc == 0) {
NCCLCHECK(ncclGetUniqueId(&ncclId));
}
#ifdef MPI_SUPPORT
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
#endif
<<<<<<< HEAD
int gpus[nGpus*nThreads*ranksPerGpu];
hipStream_t streams[nGpus*nThreads*ranksPerGpu];
void* sendbuffs[nGpus*nThreads*ranksPerGpu];
void* recvbuffs[nGpus*nThreads*ranksPerGpu];
@@ -1243,15 +1094,23 @@ testResult_t run() {
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
envstr = getenv("NCCL_TESTS_DEVICE");
gpu0 = envstr ? atoi(envstr) : -1;
for (int ii=0; ii<nGpus*nThreads; ii++) {
int gpuid = localRank*nThreads*nGpus+ii;
if (enable_multiranks)
gpuid = gpuid % numDevices;
HIPCHECK(hipSetDevice(gpuid));
for (int j=0; j<ranksPerGpu; j++) {
int i = ii*ranksPerGpu+j;
gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
HIPCHECK(hipSetDevice(gpus[i]));
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
//PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
if (streamnull)
streams[i] = NULL;
else {
if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
PRINT("cumask: ");
for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
@@ -1259,19 +1118,20 @@ testResult_t run() {
HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
} else
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
// initialize data buffer to avoid all zero data
}
#if 0 //EDGAR
// initialize data buffer to avoid all zero data
TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
}
HIPCHECK(hipDeviceSynchronize());
#endif //EDGAR
}
//if parallel init is not selected, use main thread to initialize NCCL
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
if (!parallel_init) {
if (nProcs == 1 && !enable_multiranks) {
int gpuArray[nGpus*nThreads];
for (int i=0; i<nGpus*nThreads; i++) gpuArray[i] = i;
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpuArray));
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
} else {
NCCLCHECK(ncclGroupStart());
for (int ii=0; ii<nGpus*nThreads; ii++) {
@@ -1305,12 +1165,13 @@ testResult_t run() {
errors[t] = bw_count[t] = 0;
}
const char* timeStr = report_cputime ? "cputime" : "time";
PRINT("#\n");
print_header();
int* sync = (int*)calloc(2, sizeof(int));
int* barrier = (int*)calloc(2, sizeof(int));
double* reduce = (double*)calloc(2, sizeof(double));
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
struct testThread threads[nThreads];
memset(threads, 0, sizeof(struct testThread)*nThreads);
@@ -1324,11 +1185,12 @@ testResult_t run() {
threads[t].args.localNumDevices = numDevices;
threads[t].args.enable_multiranks = enable_multiranks;
threads[t].args.nRanks = ranksPerGpu;
threads[t].args.nProcs=nProcs;
threads[t].args.proc=proc;
threads[t].args.nProcs=ncclProcs;
threads[t].args.proc=ncclProc;
threads[t].args.nThreads=nThreads;
threads[t].args.thread=t;
threads[t].args.nGpus=nGpus;
threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
@@ -1336,17 +1198,11 @@ testResult_t run() {
threads[t].args.comms=comms+t*nGpus*ranksPerGpu;
threads[t].args.streams=streams+t*nGpus*ranksPerGpu;
threads[t].args.barrier = (volatile int*)barrier;
threads[t].args.barrier_idx = 0;
threads[t].args.reduce = (volatile double*)reduce;
threads[t].args.sync = (volatile int*)sync;
threads[t].args.sync_idx = 0;
threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
threads[t].args.errors=errors+t;
threads[t].args.bw=bw+t;
threads[t].args.bw_count=bw_count+t;
threads[t].args.reportErrors = 1;
threads[t].args.reportErrors = datacheck;
threads[t].func = parallel_init ? threadInit : threadRunTests;
if (t)
@@ -1395,8 +1251,8 @@ testResult_t run() {
}
HIPCHECK(hipHostFree(delta));
char* str = getenv("NCCL_TESTS_MIN_BW");
double check_avg_bw = str ? atof(str) : -1;
envstr = getenv("NCCL_TESTS_MIN_BW");
double check_avg_bw = envstr ? atof(envstr) : -1;
bw[0] /= bw_count[0];
if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n");
@@ -1408,6 +1264,7 @@ testResult_t run() {
#endif
// 'hip-memcheck --leak-check full' requires this
PRINT("%s\n", ncclGetLastError(NULL));
hipDeviceReset();
if (errors[0] || bw[0] < check_avg_bw*(0.9))
+34 -55
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -16,6 +16,10 @@
#endif
#include <pthread.h>
#include "nccl1_compat.h"
#include "timer.h"
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
@@ -29,6 +33,21 @@
} \
} while(0)
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d " \
"'%s / %s'\n", \
hostname,__FILE__,__LINE__, \
ncclGetErrorString(res), \
ncclGetLastError(NULL)); \
return testNcclError; \
} \
} while(0)
#else
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
@@ -40,13 +59,15 @@
return testNcclError; \
} \
} while(0)
#endif
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testCuRandError = 4
testTimeout = 4,
testNumResults = 5
} testResult_t;
// Relay errors up and trace
@@ -96,11 +117,13 @@ struct threadArgs {
size_t stepbytes;
size_t stepfactor;
int totalProcs;
int nProcs;
int proc;
int nThreads;
int thread;
int nGpus;
int* gpus;
int localRank;
int localNumDevices;
int enable_multiranks;
@@ -116,14 +139,6 @@ struct threadArgs {
void** expected;
size_t expectedBytes;
volatile int* sync;
int sync_idx;
volatile int* barrier;
int barrier_idx;
volatile double* reduce;
int syncRank;
int syncNranks;
double* deltaHost;
int* errors;
double* bw;
int* bw_count;
@@ -141,19 +156,13 @@ struct testThread {
testResult_t ret;
};
#include <chrono>
// Provided by common.cu
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
// Provided by each coll
extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
extern void print_header();
#include <unistd.h>
static void getHostName(char* hostname, int maxlen) {
@@ -168,46 +177,15 @@ static void getHostName(char* hostname, int maxlen) {
#include <stdint.h>
static uint64_t getHash(const char* string, size_t n) {
// Based on DJB2a, result = result * 33 ^ char
static uint64_t getHostHash(const char* string) {
// Based on DJB2, result = result * 33 + char
uint64_t result = 5381;
for (size_t c = 0; c < n; c++) {
result = ((result << 5) + result) ^ string[c];
for (int c = 0; string[c] != '\0'; c++){
result = ((result << 5) + result) + string[c];
}
return result;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
*/
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static uint64_t getHostHash(const char* hostname) {
char hostHash[1024];
// Fall back is the hostname if something fails
(void) strncpy(hostHash, hostname, sizeof(hostHash));
int offset = strlen(hostHash);
FILE *file = fopen(HOSTID_FILE, "r");
if (file != NULL) {
char *p;
if (fscanf(file, "%ms", &p) == 1) {
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
free(p);
}
}
fclose(file);
// Make sure the string is terminated
hostHash[sizeof(hostHash)-1]='\0';
return getHash(hostHash, strlen(hostHash));
}
static size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar:
@@ -233,7 +211,7 @@ static size_t wordSize(ncclDataType_t type) {
case ncclInt64:
case ncclUint64:
case ncclDouble:
//case ncclFloat64:
//case ncclFloat64:
return 8;
default: return 0;
}
@@ -290,6 +268,7 @@ static int ncclstringtomtype (char *str) {
return ncclCoarse;
}
extern int is_main_proc;
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
+6 -21
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include "hip/hip_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
@@ -35,20 +23,17 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) {
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
}
}
k++;
@@ -125,7 +110,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
+14 -21
View File
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,18 +10,6 @@
#define ALIGN 4
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
@@ -37,18 +26,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -116,9 +102,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
// Check if this is a power of 2
int nRanks = args->nProcs*args->nThreads*args->nGpus;
if (nRanks && !(nRanks & (nRanks - 1))) {
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
} else {
printf("nRanks %d is not a power of 2, skipping\n", nRanks);
}
return testSuccess;
}
+4 -19
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root);
}
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,16 +23,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
+10 -22
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,24 +8,15 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
#define ALIGN 4
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base*nranks;
*recvcount = base;
*sendInplaceOffset = 0;
*recvInplaceOffset = count/nranks;
*paramcount = *recvcount;
*recvInplaceOffset = base;
*paramcount = base;
}
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -35,16 +26,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
k++;
+6 -21
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
@@ -34,17 +22,14 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root));
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
@@ -120,7 +105,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
+5 -20
View File
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,18 +23,15 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
int peer = (rank-1+nranks)%nranks;
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer));
TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
+28
View File
@@ -0,0 +1,28 @@
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
std::uint64_t now() {
using clock = std::chrono::steady_clock;
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
timer::timer() {
t0 = now();
}
double timer::elapsed() const {
std::uint64_t t1 = now();
return 1.e-9*(t1 - t0);
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
+15
View File
@@ -0,0 +1,15 @@
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
class timer {
std::uint64_t t0;
public:
timer();
double elapsed() const;
double reset();
};
#endif
+24
View File
@@ -0,0 +1,24 @@
include ../../makefiles/common.mk
.PHONY: all clean
BUILDDIR := $(abspath ../../build)
NCCLDIR := $(BUILDDIR)
NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
DST_DIR := $(BUILDDIR)/test/verifiable
all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
clean:
rm -rf $(DST_DIR)
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
@printf "Linking %s\n" $@
@mkdir -p $(DST_DIR)
$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
+177
View File
@@ -0,0 +1,177 @@
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with:
* nvcc -gencode=arch=compute_80,code=sm_80
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
using std::uint64_t;
using std::uint32_t;
using bfloat16 = __nv_bfloat16;
template<typename T>
struct float_traits;
template<>
struct float_traits<float> {
static constexpr int mantissa_bits = 23;
static constexpr int exponent_bits = 8;
using uint_t = uint32_t;
__device__ static float make(double x) { return (float)x; }
__device__ static float make(uint64_t x) { return (float)x; }
__device__ static double todouble(float x) { return x; }
__device__ static float add(float a, float b) { return a+b; }
__device__ static float mul(float a, float b) { return a*b; }
};
template<>
struct float_traits<double> {
static constexpr int mantissa_bits = 52;
static constexpr int exponent_bits = 11;
using uint_t = uint64_t;
__device__ static double make(double x) { return x; }
__device__ static double make(uint64_t x) { return (double)x; }
__device__ static double todouble(double x) { return x; }
__device__ static double add(double a, double b) { return a+b; }
__device__ static double mul(double a, double b) { return a*b; }
};
template<>
struct float_traits<half> {
static constexpr int mantissa_bits = 10;
static constexpr int exponent_bits = 5;
using uint_t = uint16_t;
__device__ static half make(double x) { return __double2half(x); }
__device__ static half make(uint64_t x) { return __int2half_rn(x); }
__device__ static double todouble(half x) { return __half2float(x); }
__device__ static half add(half a, half b) { return __hadd(a, b); }
__device__ static half mul(half a, half b) { return __hmul(a, b); }
};
template<>
struct float_traits<bfloat16> {
static constexpr int mantissa_bits = 7;
static constexpr int exponent_bits = 8;
using uint_t = uint16_t;
__device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
__device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
__device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
};
template<typename F>
__device__ int compare(F a, F b) {
union { typename float_traits<F>::uint_t ua; F fa; };
union { typename float_traits<F>::uint_t ub; F fb; };
ua=0; ub=0;
fa=a; fb=b;
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
return ua < ub ? ub-ua : ua-ub;
}
struct xoshiro256ss {
uint64_t s[4];
__device__ xoshiro256ss(int seed) {
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
for(int i=0; i < 4; i++)
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
}
__device__ uint64_t operator()() {
auto rol64 = [](uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
};
uint64_t const result = rol64(s[1] * 5, 7) * 9;
uint64_t const t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rol64(s[3], 45);
return result;
}
};
template<typename F>
__global__ void kernel() {
using traits = float_traits<F>;
constexpr int samps = 4<<10;
__shared__ F accf[samps];
__shared__ double accd[samps];
xoshiro256ss rng(threadIdx.x);
float expo_avg = 1;
for(int pass=0; pass < 2; pass++) {
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
int err_max = 0;
float coef = 0;
double expo_sum = 0;
int expo_n = 0;
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
//for(int round=0; round < 2; round++) {
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
accf[i] = 0;
accd[i] = 0;
}
__syncthreads();
for(int r=0; r < max_ranks; r++) {
int err = 0;
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
F f = traits::make(d);
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
accd[i] += traits::todouble(f);
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
err = err > e ? err : e;
}
err = __reduce_max_sync(-1u, err);
err_max = err_max > err ? err_max : err;
if (r >= 2) {
// err = 1 + coef*pow(r,expo)
float c = float(err-1)/powf(float(r), expo_avg);
coef = coef > c ? coef : c;
}
if (r >= 2) {
double expo = log2f(1+err_max)/log2f(r);
expo_sum += expo;
expo_n++;
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
}
}
}
if(pass==0)
expo_avg = expo_sum/expo_n;
else if(threadIdx.x == 0)
std::printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
}
}
int main() {
std::printf("type=float:\n");
kernel<float><<<1,32>>>();
cudaDeviceSynchronize();
std::printf("\ntype=half:\n");
kernel<half><<<1,32>>>();
cudaDeviceSynchronize();
std::printf("\ntype=bfloat16:\n");
kernel<bfloat16><<<1,32>>>();
cudaDeviceSynchronize();
return 0;
}
File diff suppressed because it is too large Load Diff
+59
View File
@@ -0,0 +1,59 @@
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <cuda_runtime.h>
#include <stdint.h>
/* Routines for launching kernels that verify reduction results. A significant
* feature of these routines is they carefully craft floating point input
* to produce exactly predictable output.
*
* int elt_ty: actually just a ncclDataType_t
*
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
* created, these are encoded as the value ncclNumOps and their scalar is
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
*
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
*
* intptr_t elt_ix0: index of first element pointed to by elts when generating
* random values. This makes it possible to generate subsequences independently
* as well as in aggregate.
*
* int rank_n: Number of contributions into the reduction. Non-reduction
* collectives like broadcast, gather, etc will always set this to one.
*
* int rank_me: Index of this contribution
*/
// Use this as the local scalar for PreMulSum ops
template<typename T>
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
}
// Enqueue kernel to generate data which is to be reduced.
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
// failed elements is written to bad_elt_n which must be in cudaHost memory.
// If `expected == nullptr` then the expected results are generated on-the-fly
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, cudaStream_t stream
);
#endif
+11
View File
@@ -0,0 +1,11 @@
# We requires both of the following paths to be set upon including this makefile
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
@printf "Compiling %s\n" $@
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu