Merge remote-tracking branch 'nccl-tests/master' into topic/v2.13.4-sync
This commit is contained in:
@@ -4,6 +4,9 @@
|
||||
# See LICENCE.txt for license information
|
||||
#
|
||||
|
||||
BUILDDIR ?= build
|
||||
override BUILDDIR := $(abspath $(BUILDDIR))
|
||||
|
||||
.PHONY : all clean
|
||||
|
||||
default : src.build
|
||||
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
|
||||
clean: ${TARGETS:%=%.clean}
|
||||
|
||||
%.build:
|
||||
${MAKE} -C $* build
|
||||
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
|
||||
|
||||
%.clean:
|
||||
${MAKE} -C $* clean
|
||||
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
|
||||
|
||||
+13
-4
@@ -1,6 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -65,13 +65,22 @@ build: ${BIN_FILES}
|
||||
clean:
|
||||
rm -rf ${DST_DIR}
|
||||
|
||||
${DST_DIR}/%.o: %.cu common.h
|
||||
TEST_VERIFIABLE_SRCDIR := ../verifiable
|
||||
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
|
||||
include ../verifiable/verifiable.mk
|
||||
|
||||
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
|
||||
${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
|
||||
|
||||
+12
-24
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,24 +8,15 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
#define ALIGN 4
|
||||
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count/nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
*sendInplaceOffset = count/nranks;
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base;
|
||||
*recvcount = base*nranks;
|
||||
*sendInplaceOffset = base;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = *sendcount;
|
||||
*paramcount = base;
|
||||
}
|
||||
|
||||
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+4
-19
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k = 0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
|
||||
k++;
|
||||
}
|
||||
|
||||
+7
-22
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
@@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
char* str = getenv("NCCL_TESTS_DEVICE");
|
||||
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
|
||||
size_t partcount = sendcount/nranks;
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
+6
-21
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root));
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
@@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
+258
-401
@@ -1,7 +1,7 @@
|
||||
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -11,11 +11,14 @@
|
||||
#include "common.h"
|
||||
#include <pthread.h>
|
||||
#include <cstdio>
|
||||
#include <type_traits>
|
||||
#include <getopt.h>
|
||||
#include <libgen.h>
|
||||
|
||||
//#define DEBUG_PRINT
|
||||
|
||||
#include "../verifiable/verifiable.h"
|
||||
|
||||
int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
@@ -54,6 +57,12 @@ int test_ncclVersion = 0; // init'd with ncclGetVersion()
|
||||
|
||||
const char *test_memorytypes[nccl_NUM_MTYPES] = {"coarse", "fine", "host", "managed"};
|
||||
|
||||
// For libnccl's < 2.13
|
||||
extern "C" __attribute__((weak)) char const* ncclGetLastError(ncclComm_t comm) {
|
||||
return "";
|
||||
}
|
||||
|
||||
int is_main_proc = 0;
|
||||
thread_local int is_main_thread = 0;
|
||||
|
||||
// Command line parameter defaults
|
||||
@@ -75,7 +84,10 @@ static int blocking_coll = 0;
|
||||
static int memorytype = 0;
|
||||
static int stress_cycles = 1;
|
||||
static uint32_t cumask[4];
|
||||
static int streamnull = 0;
|
||||
static int timeout = 0;
|
||||
static int cudaGraphLaunches = 0;
|
||||
static int report_cputime = 0;
|
||||
// Report average iteration time: (0=RANK0,1=AVG,2=MIN,3=MAX)
|
||||
static int average = 1;
|
||||
static int numDevices = 1;
|
||||
@@ -152,374 +164,164 @@ static bool minReqVersion(int rmajor, int rminor, int rpatch)
|
||||
return true;
|
||||
}
|
||||
|
||||
double DeltaMaxValue(ncclDataType_t type) {
|
||||
switch(type) {
|
||||
case ncclHalf: return 1e-2;
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
case ncclBfloat16: return 1e-2;
|
||||
#endif
|
||||
case ncclFloat: return 1e-5;
|
||||
case ncclDouble: return 1e-12;
|
||||
case ncclInt:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint8:
|
||||
//case ncclInt32:
|
||||
case ncclUint32:
|
||||
#endif
|
||||
case ncclInt64:
|
||||
case ncclUint64: return 1e-200;
|
||||
}
|
||||
return 1e-200;
|
||||
}
|
||||
|
||||
template<typename T> __device__
|
||||
double absDiff(T a, T b) {
|
||||
return fabs((double)(b - a));
|
||||
}
|
||||
|
||||
template<> __device__
|
||||
double absDiff<half>(half a, half b) {
|
||||
float x = __half2float(a);
|
||||
float y = __half2float(b);
|
||||
return fabs((double)(y-x));
|
||||
}
|
||||
|
||||
template<typename T> __device__
|
||||
float toFloat(T a) {
|
||||
return (float)a;
|
||||
}
|
||||
template<> __device__
|
||||
float toFloat(half a) {
|
||||
return __half2float(a);
|
||||
}
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
template<> __device__
|
||||
float toFloat(rccl_bfloat16 a) {
|
||||
return (float)(a);
|
||||
}
|
||||
#endif
|
||||
|
||||
template<typename T, int BSIZE> __global__
|
||||
void deltaKern(void* A_, void* B_, size_t count, double* max) {
|
||||
const T* A = (const T*)A_;
|
||||
const T* B = (const T*)B_;
|
||||
__shared__ double temp[BSIZE];
|
||||
int tid = blockIdx.x*blockDim.x + threadIdx.x;
|
||||
double locmax = 0.0;
|
||||
for(size_t i=tid; i<count; i+=blockDim.x*gridDim.x) {
|
||||
|
||||
double delta = absDiff(A[i], B[i]);
|
||||
if( delta > locmax ) {
|
||||
locmax = delta;
|
||||
#ifdef DEBUG_PRINT
|
||||
if (delta > .1) printf("Error at %ld/%ld(%p) : %f != %f\n", i, count, B+i, toFloat(A[i]), toFloat(B[i]));
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
tid = threadIdx.x;
|
||||
temp[tid] = locmax;
|
||||
for(int stride = BSIZE/2; stride > 1; stride>>=1) {
|
||||
__syncthreads();
|
||||
if( tid < stride )
|
||||
temp[tid] = temp[tid] > temp[tid+stride] ? temp[tid] : temp[tid+stride];
|
||||
}
|
||||
__syncthreads();
|
||||
if( threadIdx.x == 0)
|
||||
max[blockIdx.x] = temp[0] > temp[1] ? temp[0] : temp[1];
|
||||
}
|
||||
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, ncclDataType_t type, double* devmax) {
|
||||
switch (type) {
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
case ncclBfloat16:
|
||||
hipLaunchKernelGGL((deltaKern<rccl_bfloat16, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
#endif
|
||||
case ncclHalf:
|
||||
hipLaunchKernelGGL((deltaKern<half, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
case ncclFloat:
|
||||
hipLaunchKernelGGL((deltaKern<float, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
case ncclDouble:
|
||||
hipLaunchKernelGGL((deltaKern<double, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
case ncclChar:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint8:
|
||||
#endif
|
||||
hipLaunchKernelGGL((deltaKern<uint8_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
case ncclInt:
|
||||
#if NCCL_MAJOR >= 2
|
||||
case ncclUint32:
|
||||
#endif
|
||||
hipLaunchKernelGGL((deltaKern<uint32_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
hipLaunchKernelGGL((deltaKern<uint64_t, 512>), dim3(1), dim3(512), 0, 0, results, expected, count, devmax); break;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
for (int i=1; i<NUM_BLOCKS; i++) devmax[0] = std::max(devmax[0], devmax[i]);
|
||||
testResult_t CheckDelta(void* results, void* expected, size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int64_t *wrongEltN) {
|
||||
ncclVerifiableVerify(results, expected, count, (int)type, (int)op, nranks, seed, offset, wrongEltN, cudaStreamDefault);
|
||||
CUDACHECK(cudaDeviceSynchronize());
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// For integer values, we use values between 0 and 255
|
||||
template<typename T>
|
||||
__device__ T testValue(const size_t offset, const int rep, const int rank) {
|
||||
uint8_t v = (rep+rank+offset) % 256;
|
||||
return (T)v;
|
||||
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks) {
|
||||
ncclVerifiablePrepareExpected(data, count, (int)type, (int)op, nranks, seed, offset, cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// For floating point datatype, we use values between 0 and 1 otherwise the
|
||||
// Product operation will produce NaNs.
|
||||
template<>
|
||||
__device__ double testValue<double>(const size_t offset, const int rep, const int rank) {
|
||||
return 1.0/(1.0+(double)testValue<int>(offset, rep, rank));
|
||||
}
|
||||
template<>
|
||||
__device__ float testValue<float>(const size_t offset, const int rep, const int rank) {
|
||||
return 1.0/(1.0+(float)testValue<int>(offset, rep, rank));
|
||||
}
|
||||
template<>
|
||||
__device__ half testValue<half>(const size_t offset, const int rep, const int rank) {
|
||||
return __float2half(testValue<float>(offset, rep, rank));
|
||||
}
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
__device__ rccl_bfloat16 testValue<rccl_bfloat16>(const size_t offset, const int rep, const int rank) {
|
||||
return rccl_bfloat16(testValue<float>(offset, rep, rank));
|
||||
}
|
||||
#endif
|
||||
|
||||
// Operations
|
||||
template<typename T>
|
||||
__device__ T ncclOpSum(T a, T b) { return a+b; }
|
||||
template<typename T>
|
||||
__device__ T ncclOpProd(T a, T b) { return a*b; }
|
||||
template<typename T>
|
||||
__device__ T ncclOpMax(T a, T b) { return a>b ? a : b; }
|
||||
template<typename T>
|
||||
__device__ T ncclOpMin(T a, T b) { return a<b ? a : b; }
|
||||
|
||||
// Definitions for half
|
||||
template<>
|
||||
__device__ half ncclOpSum(half a, half b) { return __float2half(__half2float(a)+__half2float(b)); }
|
||||
template<>
|
||||
__device__ half ncclOpProd(half a, half b) { return __float2half(__half2float(a)*__half2float(b)); }
|
||||
template<>
|
||||
__device__ half ncclOpMax(half a, half b) { return __half2float(a)>__half2float(b) ? a : b; }
|
||||
template<>
|
||||
__device__ half ncclOpMin(half a, half b) { return __half2float(a)<__half2float(b) ? a : b; }
|
||||
|
||||
template<typename T>
|
||||
__device__ T ncclPPOpIdent(T x, int arg) { return x; }
|
||||
template<typename T>
|
||||
__device__ T ncclPPOpMul(T x, int arg) { return x*T(arg); }
|
||||
template<typename T>
|
||||
__device__ T ncclPPOpDiv(T x, int arg) { return x/T(arg); }
|
||||
template<>
|
||||
__device__ half ncclPPOpMul(half x, int arg) {
|
||||
return __float2half(__half2float(x)*float(arg));
|
||||
}
|
||||
template<>
|
||||
__device__ half ncclPPOpDiv(half x, int n) {
|
||||
return __float2half(__half2float(x)/n);
|
||||
}
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
template<>
|
||||
__device__ rccl_bfloat16 ncclPPOpMul(rccl_bfloat16 x, int arg) {
|
||||
return (rccl_bfloat16)((float)(x)*float(arg));
|
||||
}
|
||||
template<>
|
||||
__device__ rccl_bfloat16 ncclPPOpDiv(rccl_bfloat16 x, int n) {
|
||||
return (rccl_bfloat16)((float)(x)/(float)(n));;
|
||||
}
|
||||
#endif
|
||||
|
||||
__host__ __device__ int preMulScalar(int rank) {
|
||||
return 1 + rank%2;
|
||||
testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, uint64_t seed, int nranks, int rank) {
|
||||
ncclVerifiablePrepareInput(data, count, (int)type, (int)op, nranks, rank, seed, offset, cudaStreamDefault);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
template<typename T, T (*Op)(T, T), T(*PreOp)(T,int), T(*PostOp)(T,int)>
|
||||
__global__ void InitDataReduceKernel(T* data, const size_t N, const size_t offset, const int rep, const int nranks) {
|
||||
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x) {
|
||||
T val = testValue<T>(o+offset, rep, 0);
|
||||
val = PreOp(val, preMulScalar(0));
|
||||
for (int i=1; i<nranks; i++) {
|
||||
T val1 = testValue<T>(o+offset, rep, i);
|
||||
val1 = PreOp(val1, preMulScalar(i));
|
||||
val = Op(val, val1);
|
||||
}
|
||||
data[o] = PostOp(val, nranks);
|
||||
void Barrier(struct threadArgs *args) {
|
||||
thread_local int epoch = 0;
|
||||
static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
|
||||
static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
|
||||
static int counter[2] = {0, 0};
|
||||
|
||||
pthread_mutex_lock(&lock[epoch]);
|
||||
if(++counter[epoch] == args->nThreads)
|
||||
pthread_cond_broadcast(&cond[epoch]);
|
||||
|
||||
if(args->thread+1 == args->nThreads) {
|
||||
while(counter[epoch] != args->nThreads)
|
||||
pthread_cond_wait(&cond[epoch], &lock[epoch]);
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
counter[epoch] = 0;
|
||||
pthread_cond_broadcast(&cond[epoch]);
|
||||
}
|
||||
else {
|
||||
while(counter[epoch] != 0)
|
||||
pthread_cond_wait(&cond[epoch], &lock[epoch]);
|
||||
}
|
||||
pthread_mutex_unlock(&lock[epoch]);
|
||||
epoch ^= 1;
|
||||
}
|
||||
|
||||
#define KERN(type, op, preop, postop) (void*)InitDataReduceKernel<type, op<type>, preop<type>, postop<type> >
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0)
|
||||
#define OPS(type) \
|
||||
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv), \
|
||||
KERN(type, ncclOpSum/*PreMulSum*/, ncclPPOpMul, ncclPPOpIdent)
|
||||
#elif NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
#define OPS(type) \
|
||||
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpSum/*Avg*/, ncclPPOpIdent, ncclPPOpDiv)
|
||||
#else
|
||||
#define OPS(type) \
|
||||
KERN(type, ncclOpSum, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpProd, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMax, ncclPPOpIdent, ncclPPOpIdent), \
|
||||
KERN(type, ncclOpMin, ncclPPOpIdent, ncclPPOpIdent)
|
||||
#endif
|
||||
|
||||
static void* const redInitDataKerns[test_opNumMax*ncclNumTypes] = {
|
||||
OPS(int8_t), OPS(uint8_t), OPS(int32_t), OPS(uint32_t), OPS(int64_t), OPS(uint64_t), OPS(half), OPS(float), OPS(double),
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
OPS(rccl_bfloat16)
|
||||
#endif
|
||||
};
|
||||
|
||||
testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks) {
|
||||
dim3 grid = { 32, 1, 1 };
|
||||
dim3 block = { 256, 1, 1 };
|
||||
void* args[5] = { (void*)&data, (void*)&count, (void*)&offset, (void*)&rep, (void*)&nranks };
|
||||
HIPCHECK(hipLaunchKernel(redInitDataKerns[type*test_opNumMax+op], grid, block, args, 0, hipStreamDefault));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
// Inter-thread/process barrier+allreduce. The quality of the return value
|
||||
// for average=0 (which means broadcast from rank=0) is dubious. The returned
|
||||
// value will actually be the result of process-local broadcast from the local thread=0.
|
||||
template<typename T>
|
||||
__global__ void InitDataKernel(T* data, const size_t N, const int rep, const int rank) {
|
||||
for (size_t o=blockIdx.x*blockDim.x+threadIdx.x; o<N; o+=gridDim.x*blockDim.x)
|
||||
data[o] = testValue<T>(o, rep, rank);
|
||||
}
|
||||
void Allreduce(struct threadArgs* args, T* value, int average) {
|
||||
thread_local int epoch = 0;
|
||||
static pthread_mutex_t lock[2] = {PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER};
|
||||
static pthread_cond_t cond[2] = {PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER};
|
||||
static T accumulator[2];
|
||||
static int counter[2] = {0, 0};
|
||||
|
||||
static void* const initDataKerns[ncclNumTypes] = {
|
||||
(void*)InitDataKernel< int8_t>,
|
||||
(void*)InitDataKernel< uint8_t>,
|
||||
(void*)InitDataKernel< int32_t>,
|
||||
(void*)InitDataKernel<uint32_t>,
|
||||
(void*)InitDataKernel< int64_t>,
|
||||
(void*)InitDataKernel<uint64_t>,
|
||||
(void*)InitDataKernel< half>,
|
||||
(void*)InitDataKernel< float>,
|
||||
(void*)InitDataKernel< double>,
|
||||
#if RCCL_BFLOAT16 == 1 && NCCL_MAJOR >= 2
|
||||
(void*)InitDataKernel<rccl_bfloat16>
|
||||
#endif
|
||||
};
|
||||
|
||||
template<typename T>
|
||||
testResult_t InitDataType(void* dest, const size_t N, const int rep, const int rank) {
|
||||
T* ptr = (T*)dest;
|
||||
hipLaunchKernelGGL((InitDataKernel), dim3(16), dim3(512), 0, 0, ptr, N, rep, rank);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank) {
|
||||
dim3 grid = { 32, 1, 1 };
|
||||
dim3 block = { 256, 1, 1 };
|
||||
void* args[4] = { (void*)&data, (void*)&count, (void*)&rep, (void*)&rank };
|
||||
HIPCHECK(hipLaunchKernel(initDataKerns[type], grid, block, args, 0, hipStreamDefault));
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
void Barrier(struct threadArgs* args) {
|
||||
while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
|
||||
args->barrier[args->barrier_idx] = args->thread + 1;
|
||||
if (args->thread+1 == args->nThreads) {
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
#endif
|
||||
args->barrier[args->barrier_idx] = 0;
|
||||
pthread_mutex_lock(&lock[epoch]);
|
||||
if(counter[epoch] == 0) {
|
||||
if(average != 0 || args->thread == 0) accumulator[epoch] = *value;
|
||||
} else {
|
||||
while (args->barrier[args->barrier_idx]) pthread_yield();
|
||||
}
|
||||
args->barrier_idx=!args->barrier_idx;
|
||||
}
|
||||
|
||||
// Inter-thread/process barrier+allreduce
|
||||
void Allreduce(struct threadArgs* args, double* value, int average) {
|
||||
while (args->barrier[args->barrier_idx] != args->thread) pthread_yield();
|
||||
double val = *value;
|
||||
if (args->thread > 0) {
|
||||
double val2 = args->reduce[args->barrier_idx];
|
||||
if (average == 1) val += val2;
|
||||
if (average == 2) val = std::min(val, val2);
|
||||
if (average == 3) val = std::max(val, val2);
|
||||
}
|
||||
if (average || args->thread == 0) args->reduce[args->barrier_idx] = val;
|
||||
args->barrier[args->barrier_idx] = args->thread + 1;
|
||||
if (args->thread+1 == args->nThreads) {
|
||||
#ifdef MPI_SUPPORT
|
||||
if (average != 0) {
|
||||
MPI_Op op = average == 1 ? MPI_SUM : average == 2 ? MPI_MIN : MPI_MAX;
|
||||
MPI_Allreduce(MPI_IN_PLACE, (void*)&args->reduce[args->barrier_idx], 1, MPI_DOUBLE, op, MPI_COMM_WORLD);
|
||||
switch(average) {
|
||||
case /*r0*/ 0: if(args->thread == 0) accumulator[epoch] = *value; break;
|
||||
case /*avg*/1: accumulator[epoch] += *value; break;
|
||||
case /*min*/2: accumulator[epoch] = std::min<T>(accumulator[epoch], *value); break;
|
||||
case /*max*/3: accumulator[epoch] = std::max<T>(accumulator[epoch], *value); break;
|
||||
case /*sum*/4: accumulator[epoch] += *value; break;
|
||||
}
|
||||
#endif
|
||||
if (average == 1) args->reduce[args->barrier_idx] /= args->nProcs*args->nThreads;
|
||||
args->reduce[1-args->barrier_idx] = 0;
|
||||
args->barrier[args->barrier_idx] = 0;
|
||||
} else {
|
||||
while (args->barrier[args->barrier_idx]) pthread_yield();
|
||||
}
|
||||
*value = args->reduce[args->barrier_idx];
|
||||
args->barrier_idx=!args->barrier_idx;
|
||||
|
||||
if(++counter[epoch] == args->nThreads)
|
||||
pthread_cond_broadcast(&cond[epoch]);
|
||||
|
||||
if(args->thread+1 == args->nThreads) {
|
||||
while(counter[epoch] != args->nThreads)
|
||||
pthread_cond_wait(&cond[epoch], &lock[epoch]);
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
if(average != 0) {
|
||||
static_assert(std::is_same<T, long long>::value || std::is_same<T, double>::value, "Allreduce<T> only for T in {long long, double}");
|
||||
MPI_Datatype ty = std::is_same<T, long long>::value ? MPI_LONG_LONG :
|
||||
std::is_same<T, double>::value ? MPI_DOUBLE :
|
||||
MPI_Datatype();
|
||||
MPI_Op op = average == 1 ? MPI_SUM :
|
||||
average == 2 ? MPI_MIN :
|
||||
average == 3 ? MPI_MAX :
|
||||
average == 4 ? MPI_SUM : MPI_Op();
|
||||
MPI_Allreduce(MPI_IN_PLACE, (void*)&accumulator[epoch], 1, ty, op, MPI_COMM_WORLD);
|
||||
}
|
||||
#endif
|
||||
|
||||
if(average == 1) accumulator[epoch] /= args->totalProcs*args->nThreads;
|
||||
counter[epoch] = 0;
|
||||
pthread_cond_broadcast(&cond[epoch]);
|
||||
}
|
||||
else {
|
||||
while(counter[epoch] != 0)
|
||||
pthread_cond_wait(&cond[epoch], &lock[epoch]);
|
||||
}
|
||||
pthread_mutex_unlock(&lock[epoch]);
|
||||
|
||||
*value = accumulator[epoch];
|
||||
epoch ^= 1;
|
||||
}
|
||||
|
||||
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, double *delta) {
|
||||
testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int in_place, int64_t *wrongElts) {
|
||||
int nranks = args->nProcs*args->nGpus*args->nThreads;
|
||||
size_t count = args->expectedBytes/wordSize(type);
|
||||
double maxDelta = 0.0;
|
||||
|
||||
int64_t *wrongPerGpu = nullptr;
|
||||
CUDACHECK(hipHostAlloc((void**)&wrongPerGpu, args->nGpus*sizeof(int64_t), hipHostAllocMapped));
|
||||
|
||||
for (int i=0; i<args->nGpus*args->nRanks; i++) {
|
||||
int device;
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
|
||||
NCCLCHECK(ncclCommCuDevice(args->comms[i], &device));
|
||||
HIPCHECK(hipSetDevice(device));
|
||||
void *data = in_place ? ((void *)((uintptr_t)args->recvbuffs[i] + args->recvInplaceOffset*rank)) : args->recvbuffs[i];
|
||||
TESTCHECK(CheckDelta(data , args->expected[i], count, type, args->deltaHost));
|
||||
maxDelta = std::max(*(args->deltaHost), maxDelta);
|
||||
|
||||
#ifdef DEBUG_PRINT
|
||||
//if (rank == 0) {
|
||||
int *expectedHost = (int *)malloc(args->expectedBytes);
|
||||
int *dataHost = (int *)malloc(args->expectedBytes);
|
||||
TESTCHECK(CheckDelta(data, args->expected[i], count, 0, type, op, 0, nranks, wrongPerGpu+i));
|
||||
|
||||
hipMemcpy(expectedHost, args->expected[rank], args->expectedBytes, hipMemcpyDeviceToHost);
|
||||
#if 1 && DEBUG_PRINT
|
||||
if (args->reportErrors && wrongPerGpu[i] != 0) {
|
||||
printf("rank=%d #wrong=%d\n", rank, (int)wrongPerGpu[i]);
|
||||
char *expectedHost = (char*)malloc(args->expectedBytes);
|
||||
char *dataHost = (char*)malloc(args->expectedBytes);
|
||||
int eltsz = wordSize(type);
|
||||
hipMemcpy(expectedHost, args->expected[i], args->expectedBytes, hipMemcpyDeviceToHost);
|
||||
hipMemcpy(dataHost, data, args->expectedBytes, hipMemcpyDeviceToHost);
|
||||
int j, k, l;
|
||||
for (j=0; j<args->expectedBytes/sizeof(int); j++)
|
||||
if (expectedHost[j] != dataHost[j]) break;
|
||||
k = j;
|
||||
for (; j<args->expectedBytes/sizeof(int); j++)
|
||||
if (expectedHost[j] == dataHost[j]) break;
|
||||
l = j;
|
||||
printf("\n Rank [%d] Expected: ", rank);
|
||||
for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
|
||||
printf("%d:%d ", j, expectedHost[j]);
|
||||
|
||||
for(int j=0; j<args->expectedBytes/eltsz; j++) {
|
||||
unsigned long long want, got;
|
||||
want = 0;
|
||||
memcpy(&want, expectedHost + j*eltsz, eltsz);
|
||||
got = 0;
|
||||
memcpy(&got, dataHost + j*eltsz, eltsz);
|
||||
if(want != got) {
|
||||
printf(" rank=%d elt[%d]: want=0x%llx got=0x%llx\n", rank, j, want, got);
|
||||
}
|
||||
}
|
||||
printf("\n Rank [%d] Actual : ", rank);
|
||||
for (j=k; j<args->expectedBytes/sizeof(int) && j<l; j++) {
|
||||
printf("%d:%d ", j, dataHost[j]);
|
||||
}
|
||||
printf("\n");
|
||||
free(expectedHost);
|
||||
free(dataHost);
|
||||
//}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
double nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
|
||||
if (args->reportErrors && maxDelta > DeltaMaxValue(type)*(nranks - 1)) args->errors[0]++;
|
||||
*delta = maxDelta;
|
||||
|
||||
*wrongElts = 0;
|
||||
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
|
||||
hipFree(wrongPerGpu);
|
||||
|
||||
if (args->reportErrors && *wrongElts) args->errors[0]++;
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
|
||||
testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_t* comms) {
|
||||
hipError_t hipErr;
|
||||
int remaining = nStreams;
|
||||
int* done = (int*)malloc(sizeof(int)*nStreams);
|
||||
memset(done, 0, sizeof(int)*nStreams);
|
||||
timer tim;
|
||||
|
||||
while (remaining) {
|
||||
int idle = 1;
|
||||
for (int i=0; i<nStreams; i++) {
|
||||
@@ -548,11 +350,24 @@ testResult_t testStreamSynchronize(int nStreams, hipStream_t* streams, ncclComm_
|
||||
NCCLCHECK(ncclAsyncErr);
|
||||
}
|
||||
}
|
||||
double delta = tim.elapsed();
|
||||
if (delta > timeout && timeout > 0) {
|
||||
for (int i=0; i<ngpus; i++)
|
||||
NCCLCHECK(ncclCommAbort(comms[i]));
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024);
|
||||
printf("%s: Test timeout (%ds) %s:%d\n",
|
||||
hostname,
|
||||
timeout,
|
||||
__FILE__,__LINE__);
|
||||
free(done);
|
||||
return testTimeout;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// We might want to let other threads (including NCCL threads) use the CPU.
|
||||
if (idle) pthread_yield();
|
||||
if (idle) sched_yield();
|
||||
}
|
||||
free(done);
|
||||
return testSuccess;
|
||||
@@ -572,6 +387,7 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
int hipDev;
|
||||
NCCLCHECK(ncclCommCuDevice(args->comms[i], &hipDev));
|
||||
HIPCHECK(hipSetDevice(hipDev));
|
||||
//CUDACHECK(cudaSetDevice(args->gpus[i])); EDGAR CHECK LATER
|
||||
#endif
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i);
|
||||
char* recvBuff = ((char*)args->recvbuffs[i]) + shift;
|
||||
@@ -590,19 +406,18 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
rccl_bfloat16 bf16;
|
||||
#endif
|
||||
};
|
||||
int scalar = preMulScalar(rank);
|
||||
switch(type) {
|
||||
case ncclInt8: i8 = int8_t(scalar); break;
|
||||
case ncclUint8: u8 = uint8_t(scalar); break;
|
||||
case ncclInt32: i32 = int32_t(scalar); break;
|
||||
case ncclUint32: u32 = uint32_t(scalar); break;
|
||||
case ncclInt64: i64 = int32_t(scalar); break;
|
||||
case ncclUint64: u64 = uint32_t(scalar); break;
|
||||
case ncclFloat16: f16 = __float2half(float(scalar)); break;
|
||||
case ncclFloat32: f32 = float(scalar); break;
|
||||
case ncclFloat64: f64 = double(scalar); break;
|
||||
case ncclInt8: i8 = ncclVerifiablePremulScalar<int8_t>(rank); break;
|
||||
case ncclUint8: u8 = ncclVerifiablePremulScalar<uint8_t>(rank); break;
|
||||
case ncclInt32: i32 = ncclVerifiablePremulScalar<int32_t>(rank); break;
|
||||
case ncclUint32: u32 = ncclVerifiablePremulScalar<uint32_t>(rank); break;
|
||||
case ncclInt64: i64 = ncclVerifiablePremulScalar<int64_t>(rank); break;
|
||||
case ncclUint64: u64 = ncclVerifiablePremulScalar<uint64_t>(rank); break;
|
||||
case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
|
||||
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
|
||||
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
case ncclBfloat16: bf16 = (rccl_bfloat16)(float(scalar)); break;
|
||||
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<__nv_bfloat16>(rank); break;
|
||||
#endif
|
||||
}
|
||||
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
|
||||
@@ -657,16 +472,17 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
if (cudaGraphLaunches >= 1) {
|
||||
// Begin cuda graph capture
|
||||
for (int i=0; i<args->nGpus*args->nRanks; i++) {
|
||||
// Thread local mode is needed for:
|
||||
// - Multi-thread mode
|
||||
// - P2P pre-connect
|
||||
// Thread local mdoe is needed for:
|
||||
// - Multi-thread mode: where graph capture and instantiation can happen concurrently across threads
|
||||
// - P2P pre-connect: when there is no warm-up, P2P pre-connect is done during graph capture.
|
||||
// Since pre-connect calls cudaMalloc, we cannot use global capture mode
|
||||
HIPCHECK(hipStreamBeginCapture(args->streams[i], hipStreamCaptureModeThreadLocal));
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// Performance Benchmark
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
timer tim;
|
||||
for (int iter = 0; iter < iters; iter++) {
|
||||
if (agg_iters>1) NCCLCHECK(ncclGroupStart());
|
||||
for (int aiter = 0; aiter < agg_iters; aiter++) {
|
||||
@@ -687,7 +503,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
// Resync CPU, restart timing, launch cuda graph
|
||||
Barrier(args);
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
tim.reset();
|
||||
for (int l=0; l<cudaGraphLaunches; l++) {
|
||||
for (int i=0; i<args->nGpus*args->nRanks; i++) {
|
||||
HIPCHECK(hipGraphLaunch(graphExec[i], args->streams[i]));
|
||||
@@ -696,10 +512,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
#endif
|
||||
|
||||
double cputimeSec = tim.elapsed()/(iters*agg_iters);
|
||||
TESTCHECK(completeColl(args));
|
||||
|
||||
auto delta = std::chrono::high_resolution_clock::now() - start;
|
||||
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(delta).count();
|
||||
double deltaSec = tim.elapsed();
|
||||
deltaSec = deltaSec/(iters*agg_iters);
|
||||
if (cudaGraphLaunches >= 1) deltaSec = deltaSec/cudaGraphLaunches;
|
||||
Allreduce(args, &deltaSec, average);
|
||||
@@ -719,8 +535,7 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
|
||||
Barrier(args);
|
||||
|
||||
double maxDelta = 0;
|
||||
bool error = false;
|
||||
int64_t wrongElts = 0;
|
||||
static __thread int rep = 0;
|
||||
rep++;
|
||||
if (datacheck) {
|
||||
@@ -768,13 +583,15 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
}
|
||||
#endif
|
||||
|
||||
TESTCHECK(CheckData(args, type, op, root, in_place, &maxDelta));
|
||||
TESTCHECK(CheckData(args, type, op, root, in_place, &wrongElts));
|
||||
|
||||
//aggregate delta from all threads and procs
|
||||
Allreduce(args, &maxDelta, 3);
|
||||
long long wrongElts1 = wrongElts;
|
||||
Allreduce(args, &wrongElts1, /*sum*/4);
|
||||
wrongElts = wrongElts1;
|
||||
}
|
||||
|
||||
double timeUsec = deltaSec*1.0E6;
|
||||
double timeUsec = (report_cputime ? cputimeSec : deltaSec)*1.0E6;
|
||||
char timeStr[100];
|
||||
if (timeUsec >= 10000.0) {
|
||||
sprintf(timeStr, "%7.0f", timeUsec);
|
||||
@@ -783,10 +600,10 @@ testResult_t BenchTime(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
} else {
|
||||
sprintf(timeStr, "%7.2f", timeUsec);
|
||||
}
|
||||
if (datacheck) {
|
||||
PRINT(" %7s %6.2f %6.2f %5.0le%s", timeStr, algBw, busBw, maxDelta, error ? "*" : "");
|
||||
if (args->reportErrors) {
|
||||
PRINT(" %7s %6.2f %6.2f %5g", timeStr, algBw, busBw, (double)wrongElts);
|
||||
} else {
|
||||
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
|
||||
PRINT(" %7s %6.2f %6.2f %5s", timeStr, algBw, busBw, "N/A");
|
||||
}
|
||||
|
||||
args->bw[0] += busBw;
|
||||
@@ -809,6 +626,9 @@ void setupArgs(size_t size, ncclDataType_t type, struct threadArgs* args) {
|
||||
}
|
||||
|
||||
testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root) {
|
||||
// Sync to avoid first-call timeout
|
||||
Barrier(args);
|
||||
|
||||
// Warm-up for large size
|
||||
setupArgs(args->maxbytes, type, args);
|
||||
for (int iter = 0; iter < warmup_iters; iter++) {
|
||||
@@ -855,7 +675,7 @@ testResult_t threadInit(struct threadArgs* args) {
|
||||
int nranks = args->nProcs*args->nThreads*args->nGpus*args->nRanks;
|
||||
|
||||
//set main thread again
|
||||
is_main_thread = (args->proc == 0 && args->thread == 0) ? 1 : 0;
|
||||
is_main_thread = (is_main_proc && args->thread == 0) ? 1 : 0;
|
||||
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
@@ -863,6 +683,7 @@ testResult_t threadInit(struct threadArgs* args) {
|
||||
if (enable_multiranks)
|
||||
gpuid = gpuid % numDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
//CUDACHECK(cudaSetDevice(args->gpus[i]));
|
||||
|
||||
for (int j=0; j<args->nRanks; j++) {
|
||||
int rank = (args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + j;
|
||||
@@ -968,10 +789,13 @@ int main(int argc, char* argv[]) {
|
||||
{"datatype", required_argument, 0, 'd'},
|
||||
{"root", required_argument, 0, 'r'},
|
||||
{"blocking", required_argument, 0, 'z'},
|
||||
{"memory_type", required_argument, 0, 'y'},
|
||||
{"stress_cycles", required_argument, 0, 's'},
|
||||
{"cumask", required_argument, 0, 'u'},
|
||||
{"memory_type", required_argument, 0, 'y'}, //RCCL
|
||||
{"stress_cycles", required_argument, 0, 's'}, //RCCL
|
||||
{"cumask", required_argument, 0, 'u'}, //RCCL
|
||||
{"stream_null", required_argument, 0, 'y'}, //NCCL
|
||||
{"timeout", required_argument, 0, 'T'}, //NCCL
|
||||
{"cudagraph", required_argument, 0, 'G'},
|
||||
{"report_cputime", required_argument, 0, 'C'},
|
||||
{"average", required_argument, 0, 'a'},
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
{"enable_multiranks", required_argument, 0, 'x'},
|
||||
@@ -983,10 +807,12 @@ int main(int argc, char* argv[]) {
|
||||
|
||||
while(1) {
|
||||
int c;
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:R:x:", longopts, &longindex);
|
||||
// EDGAR NOTE: y is used by 'memory_type' (a RCCL argument) and 'stream_null' (a NCCL argument)
|
||||
// also not sure about G vs. hG (we had G, they have hG)
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:R:x:", longopts, &longindex);
|
||||
#else
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z:G:a:y:s:u:h:", longopts, &longindex);
|
||||
c = getopt_long(argc, argv, "t:g:b:e:i:f:n:m:w:p:c:o:d:r:z :y :T:G:C:a :y :s:u:h:", longopts, &longindex);
|
||||
#endif
|
||||
|
||||
if (c == -1)
|
||||
@@ -1067,6 +893,12 @@ int main(int argc, char* argv[]) {
|
||||
mask = strtok(NULL, ",");
|
||||
};
|
||||
}
|
||||
break;
|
||||
case 'y':
|
||||
streamnull = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'T':
|
||||
timeout = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'G':
|
||||
#if (NCCL_MAJOR > 2 || (NCCL_MAJOR >= 2 && NCCL_MINOR >= 9)) && HIP_VERSION >= 50221310
|
||||
@@ -1075,6 +907,9 @@ int main(int argc, char* argv[]) {
|
||||
printf("Option -G (HIP graph) not supported before NCCL 2.9 + ROCm 5.2 Ignoring\n");
|
||||
#endif
|
||||
break;
|
||||
case 'C':
|
||||
report_cputime = strtol(optarg, NULL, 0);
|
||||
break;
|
||||
case 'a':
|
||||
average = (int)strtol(optarg, NULL, 0);
|
||||
break;
|
||||
@@ -1114,15 +949,18 @@ int main(int argc, char* argv[]) {
|
||||
"[-y,--memory_type <coarse/fine/host/managed>] \n\t"
|
||||
"[-s,--stress_cycles <number of cycles>] \n\t"
|
||||
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
|
||||
"[-y,--stream_null <0/1>] \n\t"
|
||||
"[-T,--timeout <time in seconds>] \n\t"
|
||||
"[-G,--cudagraph <num graph launches>] \n\t"
|
||||
"[-C,--report_cputime <0/1>] \n\t"
|
||||
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
|
||||
#ifdef RCCL_MULTIRANKPERGPU
|
||||
"[-x,--enable_multiranks <0/1> enable using multiple ranks per GPU] \n\t"
|
||||
"[-R,--ranks_per_gpu] \n\t"
|
||||
#endif
|
||||
"[-h,--help]\n",
|
||||
basename(argv[0]));
|
||||
return 0;
|
||||
basename(argv[0]));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1161,26 +999,36 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
testResult_t run() {
|
||||
int nProcs = 1, proc = 0;
|
||||
int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0;
|
||||
int localRank = 0;
|
||||
char hostname[1024];
|
||||
getHostName(hostname, 1024);
|
||||
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &nProcs);
|
||||
MPI_Comm_size(MPI_COMM_WORLD, &totalProcs);
|
||||
MPI_Comm_rank(MPI_COMM_WORLD, &proc);
|
||||
uint64_t hostHashs[nProcs];
|
||||
uint64_t hostHashs[totalProcs];
|
||||
hostHashs[proc] = getHostHash(hostname);
|
||||
MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD);
|
||||
for (int p=0; p<nProcs; p++) {
|
||||
for (int p=0; p<totalProcs; p++) {
|
||||
if (p == proc) break;
|
||||
if (hostHashs[p] == hostHashs[proc]) localRank++;
|
||||
}
|
||||
#endif
|
||||
is_main_thread = (proc == 0) ? 1 : 0;
|
||||
|
||||
PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d validation: %d \n", nThreads, nGpus, ranksPerGpu, minBytes, maxBytes,
|
||||
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes", warmup_iters, iters, datacheck);
|
||||
char* str = getenv("NCCL_TESTS_SPLIT_MASK");
|
||||
uint64_t mask = str ? strtoul(str, NULL, 16) : 0;
|
||||
MPI_Comm mpi_comm;
|
||||
color = proc & mask;
|
||||
MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm);
|
||||
MPI_Comm_size(mpi_comm, &ncclProcs);
|
||||
MPI_Comm_rank(mpi_comm, &ncclProc);
|
||||
#endif
|
||||
is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
|
||||
|
||||
PRINT("# nThreads: %d nGpus: %d nRanks: %d minBytes: %ld maxBytes: %ld step: %ld(%s) warmupIters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
|
||||
nThreads, nGpus, ranksPerGpu, minBytes, maxBytes,
|
||||
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
|
||||
warmup_iters, iters, agg_iters, datacheck, cudaGraphLaunches);
|
||||
if (blocking_coll) PRINT("# Blocking Enabled: wait for completion and barrier after each collective \n");
|
||||
if (parallel_init) PRINT("# Parallel Init Enabled: threads call into NcclInitRank concurrently \n");
|
||||
PRINT("#\n");
|
||||
@@ -1190,6 +1038,8 @@ testResult_t run() {
|
||||
char line[MAX_LINE];
|
||||
int len = 0;
|
||||
size_t maxMem = ~0;
|
||||
char* envstr = getenv("NCCL_TESTS_DEVICE");
|
||||
int gpu0 = envstr ? atoi(envstr) : -1;
|
||||
for (int i=0; i<nThreads*nGpus; i++) {
|
||||
int hipDev = localRank*nThreads*nGpus+i;
|
||||
if (enable_multiranks)
|
||||
@@ -1207,11 +1057,11 @@ testResult_t run() {
|
||||
}
|
||||
}
|
||||
#if MPI_SUPPORT
|
||||
char *lines = (proc == 0) ? (char *)malloc(nProcs*MAX_LINE) : NULL;
|
||||
char *lines = (proc == 0) ? (char *)malloc(totalProcs*MAX_LINE) : NULL;
|
||||
// Gather all output in rank order to root (0)
|
||||
MPI_Gather(line, MAX_LINE, MPI_BYTE, lines, MAX_LINE, MPI_BYTE, 0, MPI_COMM_WORLD);
|
||||
if (proc == 0) {
|
||||
for (int p = 0; p < nProcs; p++)
|
||||
for (int p = 0; p < totalProcs; p++)
|
||||
PRINT("%s", lines+MAX_LINE*p);
|
||||
free(lines);
|
||||
}
|
||||
@@ -1228,13 +1078,14 @@ testResult_t run() {
|
||||
}
|
||||
|
||||
ncclUniqueId ncclId;
|
||||
if (proc == 0) {
|
||||
if (ncclProc == 0) {
|
||||
NCCLCHECK(ncclGetUniqueId(&ncclId));
|
||||
}
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, MPI_COMM_WORLD);
|
||||
MPI_Barrier(MPI_COMM_WORLD);
|
||||
MPI_Bcast(&ncclId, sizeof(ncclId), MPI_BYTE, 0, mpi_comm);
|
||||
#endif
|
||||
<<<<<<< HEAD
|
||||
int gpus[nGpus*nThreads*ranksPerGpu];
|
||||
hipStream_t streams[nGpus*nThreads*ranksPerGpu];
|
||||
void* sendbuffs[nGpus*nThreads*ranksPerGpu];
|
||||
void* recvbuffs[nGpus*nThreads*ranksPerGpu];
|
||||
@@ -1243,15 +1094,23 @@ testResult_t run() {
|
||||
|
||||
ncclTestEngine.getBuffSize(&sendBytes, &recvBytes, (size_t)maxBytes, (size_t)nProcs*nGpus*nThreads*ranksPerGpu);
|
||||
|
||||
envstr = getenv("NCCL_TESTS_DEVICE");
|
||||
gpu0 = envstr ? atoi(envstr) : -1;
|
||||
for (int ii=0; ii<nGpus*nThreads; ii++) {
|
||||
int gpuid = localRank*nThreads*nGpus+ii;
|
||||
if (enable_multiranks)
|
||||
gpuid = gpuid % numDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
|
||||
for (int j=0; j<ranksPerGpu; j++) {
|
||||
int i = ii*ranksPerGpu+j;
|
||||
gpus[i] = gpu0 != -1 ? gpu0+ii : gpuid;
|
||||
HIPCHECK(hipSetDevice(gpus[i]));
|
||||
|
||||
TESTCHECK(AllocateBuffs(sendbuffs+i, sendBytes, recvbuffs+i, recvBytes, expected+i, (size_t)maxBytes, nProcs*nThreads*nGpus*ranksPerGpu));
|
||||
//PRINT("sendbuffs[%d]=%p(size=%lu) recvbuffs[%d]=%p(size=%lu)\n", i, sendbuffs[i], sendBytes, i, recvbuffs[i], recvBytes);
|
||||
if (streamnull)
|
||||
streams[i] = NULL;
|
||||
else {
|
||||
if (cumask[0] || cumask[1] || cumask[2] || cumask[3]) {
|
||||
PRINT("cumask: ");
|
||||
for (int i = 0; i < 4 ; i++) PRINT("%x,", cumask[i]);
|
||||
@@ -1259,19 +1118,20 @@ testResult_t run() {
|
||||
HIPCHECK(hipExtStreamCreateWithCUMask(streams+i, 4, cumask));
|
||||
} else
|
||||
HIPCHECK(hipStreamCreateWithFlags(streams+i, hipStreamNonBlocking));
|
||||
// initialize data buffer to avoid all zero data
|
||||
}
|
||||
#if 0 //EDGAR
|
||||
// initialize data buffer to avoid all zero data
|
||||
TESTCHECK(InitData(sendbuffs[i], sendBytes, ncclUint8, 0, i));
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
#endif //EDGAR
|
||||
}
|
||||
|
||||
//if parallel init is not selected, use main thread to initialize NCCL
|
||||
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus*ranksPerGpu);
|
||||
if (!parallel_init) {
|
||||
if (nProcs == 1 && !enable_multiranks) {
|
||||
int gpuArray[nGpus*nThreads];
|
||||
for (int i=0; i<nGpus*nThreads; i++) gpuArray[i] = i;
|
||||
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpuArray));
|
||||
NCCLCHECK(ncclCommInitAll(comms, nGpus*nThreads, gpus));
|
||||
} else {
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int ii=0; ii<nGpus*nThreads; ii++) {
|
||||
@@ -1305,12 +1165,13 @@ testResult_t run() {
|
||||
errors[t] = bw_count[t] = 0;
|
||||
}
|
||||
|
||||
const char* timeStr = report_cputime ? "cputime" : "time";
|
||||
PRINT("#\n");
|
||||
print_header();
|
||||
|
||||
int* sync = (int*)calloc(2, sizeof(int));
|
||||
int* barrier = (int*)calloc(2, sizeof(int));
|
||||
double* reduce = (double*)calloc(2, sizeof(double));
|
||||
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %6s %7s %6s %6s %6s\n", "size", "count", "type", "redop", "root",
|
||||
timeStr, "algbw", "busbw", "#wrong", timeStr, "algbw", "busbw", "#wrong");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
|
||||
struct testThread threads[nThreads];
|
||||
memset(threads, 0, sizeof(struct testThread)*nThreads);
|
||||
@@ -1324,11 +1185,12 @@ testResult_t run() {
|
||||
threads[t].args.localNumDevices = numDevices;
|
||||
threads[t].args.enable_multiranks = enable_multiranks;
|
||||
threads[t].args.nRanks = ranksPerGpu;
|
||||
threads[t].args.nProcs=nProcs;
|
||||
threads[t].args.proc=proc;
|
||||
threads[t].args.nProcs=ncclProcs;
|
||||
threads[t].args.proc=ncclProc;
|
||||
threads[t].args.nThreads=nThreads;
|
||||
threads[t].args.thread=t;
|
||||
threads[t].args.nGpus=nGpus;
|
||||
threads[t].args.gpus=gpus+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.sendbuffs = sendbuffs+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.recvbuffs = recvbuffs+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.expected = expected+t*nGpus*ranksPerGpu;
|
||||
@@ -1336,17 +1198,11 @@ testResult_t run() {
|
||||
threads[t].args.comms=comms+t*nGpus*ranksPerGpu;
|
||||
threads[t].args.streams=streams+t*nGpus*ranksPerGpu;
|
||||
|
||||
threads[t].args.barrier = (volatile int*)barrier;
|
||||
threads[t].args.barrier_idx = 0;
|
||||
threads[t].args.reduce = (volatile double*)reduce;
|
||||
threads[t].args.sync = (volatile int*)sync;
|
||||
threads[t].args.sync_idx = 0;
|
||||
threads[t].args.deltaHost = (delta + t*NUM_BLOCKS);
|
||||
threads[t].args.errors=errors+t;
|
||||
threads[t].args.bw=bw+t;
|
||||
threads[t].args.bw_count=bw_count+t;
|
||||
|
||||
threads[t].args.reportErrors = 1;
|
||||
threads[t].args.reportErrors = datacheck;
|
||||
|
||||
threads[t].func = parallel_init ? threadInit : threadRunTests;
|
||||
if (t)
|
||||
@@ -1395,8 +1251,8 @@ testResult_t run() {
|
||||
}
|
||||
HIPCHECK(hipHostFree(delta));
|
||||
|
||||
char* str = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_avg_bw = str ? atof(str) : -1;
|
||||
envstr = getenv("NCCL_TESTS_MIN_BW");
|
||||
double check_avg_bw = envstr ? atof(envstr) : -1;
|
||||
bw[0] /= bw_count[0];
|
||||
|
||||
if (datacheck) PRINT("# Errors with asterisks indicate errors that have exceeded the maximum threshold.\n");
|
||||
@@ -1408,6 +1264,7 @@ testResult_t run() {
|
||||
#endif
|
||||
|
||||
// 'hip-memcheck --leak-check full' requires this
|
||||
PRINT("%s\n", ncclGetLastError(NULL));
|
||||
hipDeviceReset();
|
||||
|
||||
if (errors[0] || bw[0] < check_avg_bw*(0.9))
|
||||
|
||||
+34
-55
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -16,6 +16,10 @@
|
||||
#endif
|
||||
#include <pthread.h>
|
||||
#include "nccl1_compat.h"
|
||||
#include "timer.h"
|
||||
|
||||
// For nccl.h < 2.13 since we define a weak fallback
|
||||
extern "C" char const* ncclGetLastError(ncclComm_t comm);
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
@@ -29,6 +33,21 @@
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t res = cmd; \
|
||||
if (res != ncclSuccess) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf("%s: Test NCCL failure %s:%d " \
|
||||
"'%s / %s'\n", \
|
||||
hostname,__FILE__,__LINE__, \
|
||||
ncclGetErrorString(res), \
|
||||
ncclGetLastError(NULL)); \
|
||||
return testNcclError; \
|
||||
} \
|
||||
} while(0)
|
||||
#else
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t res = cmd; \
|
||||
if (res != ncclSuccess) { \
|
||||
@@ -40,13 +59,15 @@
|
||||
return testNcclError; \
|
||||
} \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
typedef enum {
|
||||
testSuccess = 0,
|
||||
testInternalError = 1,
|
||||
testCudaError = 2,
|
||||
testNcclError = 3,
|
||||
testCuRandError = 4
|
||||
testTimeout = 4,
|
||||
testNumResults = 5
|
||||
} testResult_t;
|
||||
|
||||
// Relay errors up and trace
|
||||
@@ -96,11 +117,13 @@ struct threadArgs {
|
||||
size_t stepbytes;
|
||||
size_t stepfactor;
|
||||
|
||||
int totalProcs;
|
||||
int nProcs;
|
||||
int proc;
|
||||
int nThreads;
|
||||
int thread;
|
||||
int nGpus;
|
||||
int* gpus;
|
||||
int localRank;
|
||||
int localNumDevices;
|
||||
int enable_multiranks;
|
||||
@@ -116,14 +139,6 @@ struct threadArgs {
|
||||
|
||||
void** expected;
|
||||
size_t expectedBytes;
|
||||
volatile int* sync;
|
||||
int sync_idx;
|
||||
volatile int* barrier;
|
||||
int barrier_idx;
|
||||
volatile double* reduce;
|
||||
int syncRank;
|
||||
int syncNranks;
|
||||
double* deltaHost;
|
||||
int* errors;
|
||||
double* bw;
|
||||
int* bw_count;
|
||||
@@ -141,19 +156,13 @@ struct testThread {
|
||||
testResult_t ret;
|
||||
};
|
||||
|
||||
#include <chrono>
|
||||
|
||||
// Provided by common.cu
|
||||
extern void Barrier(struct threadArgs* args);
|
||||
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
|
||||
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
|
||||
extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
|
||||
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
|
||||
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
|
||||
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
|
||||
|
||||
// Provided by each coll
|
||||
extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
|
||||
extern void print_header();
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
static void getHostName(char* hostname, int maxlen) {
|
||||
@@ -168,46 +177,15 @@ static void getHostName(char* hostname, int maxlen) {
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
static uint64_t getHash(const char* string, size_t n) {
|
||||
// Based on DJB2a, result = result * 33 ^ char
|
||||
static uint64_t getHostHash(const char* string) {
|
||||
// Based on DJB2, result = result * 33 + char
|
||||
uint64_t result = 5381;
|
||||
for (size_t c = 0; c < n; c++) {
|
||||
result = ((result << 5) + result) ^ string[c];
|
||||
for (int c = 0; string[c] != '\0'; c++){
|
||||
result = ((result << 5) + result) + string[c];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Generate a hash of the unique identifying string for this host
|
||||
* that will be unique for both bare-metal and container instances
|
||||
* Equivalent of a hash of;
|
||||
*
|
||||
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
|
||||
*
|
||||
*/
|
||||
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
|
||||
static uint64_t getHostHash(const char* hostname) {
|
||||
char hostHash[1024];
|
||||
|
||||
// Fall back is the hostname if something fails
|
||||
(void) strncpy(hostHash, hostname, sizeof(hostHash));
|
||||
int offset = strlen(hostHash);
|
||||
|
||||
FILE *file = fopen(HOSTID_FILE, "r");
|
||||
if (file != NULL) {
|
||||
char *p;
|
||||
if (fscanf(file, "%ms", &p) == 1) {
|
||||
strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
|
||||
free(p);
|
||||
}
|
||||
}
|
||||
fclose(file);
|
||||
|
||||
// Make sure the string is terminated
|
||||
hostHash[sizeof(hostHash)-1]='\0';
|
||||
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
|
||||
static size_t wordSize(ncclDataType_t type) {
|
||||
switch(type) {
|
||||
case ncclChar:
|
||||
@@ -233,7 +211,7 @@ static size_t wordSize(ncclDataType_t type) {
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclDouble:
|
||||
//case ncclFloat64:
|
||||
//case ncclFloat64:
|
||||
return 8;
|
||||
default: return 0;
|
||||
}
|
||||
@@ -290,6 +268,7 @@ static int ncclstringtomtype (char *str) {
|
||||
return ncclCoarse;
|
||||
}
|
||||
|
||||
extern int is_main_proc;
|
||||
extern thread_local int is_main_thread;
|
||||
#define PRINT if (is_main_thread) printf
|
||||
|
||||
|
||||
+6
-21
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count/nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
@@ -35,20 +23,17 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
if (rank == root) {
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
}
|
||||
}
|
||||
k++;
|
||||
@@ -125,7 +110,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
+14
-21
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,18 +10,6 @@
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base;
|
||||
@@ -37,18 +26,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -116,9 +102,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
// Check if this is a power of 2
|
||||
int nRanks = args->nProcs*args->nThreads*args->nGpus;
|
||||
if (nRanks && !(nRanks & (nRanks - 1))) {
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
}
|
||||
} else {
|
||||
printf("nRanks %d is not a power of 2, skipping\n", nRanks);
|
||||
}
|
||||
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
|
||||
+4
-19
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root);
|
||||
}
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,16 +23,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
|
||||
k++;
|
||||
|
||||
+10
-22
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,24 +8,15 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
#define ALIGN 4
|
||||
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base*nranks;
|
||||
*recvcount = base;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = count/nranks;
|
||||
*paramcount = *recvcount;
|
||||
*recvInplaceOffset = base;
|
||||
*paramcount = base;
|
||||
}
|
||||
|
||||
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -35,16 +26,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
|
||||
k++;
|
||||
|
||||
+6
-21
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
@@ -34,17 +22,14 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root));
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
|
||||
}
|
||||
@@ -120,7 +105,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
+5
-20
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,18 +23,15 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
int peer = (rank-1+nranks)%nranks;
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
#include "timer.h"
|
||||
|
||||
// Make sure to compile this translation unit with the host compiler and not
|
||||
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
|
||||
#include <chrono>
|
||||
|
||||
namespace {
|
||||
std::uint64_t now() {
|
||||
using clock = std::chrono::steady_clock;
|
||||
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
|
||||
}
|
||||
}
|
||||
|
||||
timer::timer() {
|
||||
t0 = now();
|
||||
}
|
||||
|
||||
double timer::elapsed() const {
|
||||
std::uint64_t t1 = now();
|
||||
return 1.e-9*(t1 - t0);
|
||||
}
|
||||
|
||||
double timer::reset() {
|
||||
std::uint64_t t1 = now();
|
||||
double ans = 1.e-9*(t1 - t0);
|
||||
t0 = t1;
|
||||
return ans;
|
||||
}
|
||||
+15
@@ -0,0 +1,15 @@
|
||||
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
|
||||
#define _408319ecdd5b47b28bf8f511c4fdf816
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Can't include <chrono> because of bug with gcc 10.3.0
|
||||
class timer {
|
||||
std::uint64_t t0;
|
||||
public:
|
||||
timer();
|
||||
double elapsed() const;
|
||||
double reset();
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,24 @@
|
||||
include ../../makefiles/common.mk
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
BUILDDIR := $(abspath ../../build)
|
||||
NCCLDIR := $(BUILDDIR)
|
||||
NVCUFLAGS += -I$(NCCLDIR)/include/ -I../include
|
||||
DST_DIR := $(BUILDDIR)/test/verifiable
|
||||
|
||||
all: $(DST_DIR)/self_test $(DST_DIR)/verifiable.o
|
||||
|
||||
clean:
|
||||
rm -rf $(DST_DIR)
|
||||
|
||||
TEST_VERIFIABLE_SRCDIR := .
|
||||
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
|
||||
include verifiable.mk
|
||||
|
||||
self_test: $(DST_DIR)/self_test
|
||||
|
||||
$(DST_DIR)/self_test: verifiable.cu verifiable.h
|
||||
@printf "Linking %s\n" $@
|
||||
@mkdir -p $(DST_DIR)
|
||||
$(NVCC) -o $@ $(NVCUFLAGS) -DSELF_TEST=1 verifiable.cu $(NVLDFLAGS)
|
||||
@@ -0,0 +1,177 @@
|
||||
/* Generate parameters for our error bound model of floating point average
|
||||
* (sum of scaled values) by sampling sums of random sequences for each
|
||||
* floating point type.
|
||||
*
|
||||
* The model has parameters "coef" and "power", where for two floats a & b,
|
||||
* they are close enough if and only if:
|
||||
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
|
||||
*
|
||||
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
|
||||
*
|
||||
* Compile with:
|
||||
* nvcc -gencode=arch=compute_80,code=sm_80
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cuda_bf16.h>
|
||||
#include <cuda_fp16.h>
|
||||
|
||||
using std::uint64_t;
|
||||
using std::uint32_t;
|
||||
using bfloat16 = __nv_bfloat16;
|
||||
|
||||
template<typename T>
|
||||
struct float_traits;
|
||||
|
||||
template<>
|
||||
struct float_traits<float> {
|
||||
static constexpr int mantissa_bits = 23;
|
||||
static constexpr int exponent_bits = 8;
|
||||
using uint_t = uint32_t;
|
||||
__device__ static float make(double x) { return (float)x; }
|
||||
__device__ static float make(uint64_t x) { return (float)x; }
|
||||
__device__ static double todouble(float x) { return x; }
|
||||
__device__ static float add(float a, float b) { return a+b; }
|
||||
__device__ static float mul(float a, float b) { return a*b; }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<double> {
|
||||
static constexpr int mantissa_bits = 52;
|
||||
static constexpr int exponent_bits = 11;
|
||||
using uint_t = uint64_t;
|
||||
__device__ static double make(double x) { return x; }
|
||||
__device__ static double make(uint64_t x) { return (double)x; }
|
||||
__device__ static double todouble(double x) { return x; }
|
||||
__device__ static double add(double a, double b) { return a+b; }
|
||||
__device__ static double mul(double a, double b) { return a*b; }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<half> {
|
||||
static constexpr int mantissa_bits = 10;
|
||||
static constexpr int exponent_bits = 5;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static half make(double x) { return __double2half(x); }
|
||||
__device__ static half make(uint64_t x) { return __int2half_rn(x); }
|
||||
__device__ static double todouble(half x) { return __half2float(x); }
|
||||
__device__ static half add(half a, half b) { return __hadd(a, b); }
|
||||
__device__ static half mul(half a, half b) { return __hmul(a, b); }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<bfloat16> {
|
||||
static constexpr int mantissa_bits = 7;
|
||||
static constexpr int exponent_bits = 8;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static bfloat16 make(double x) { return __double2bfloat16(x); }
|
||||
__device__ static bfloat16 make(uint64_t x) { return __int2bfloat16_rn(x); }
|
||||
__device__ static double todouble(bfloat16 x) { return __bfloat162float(x); }
|
||||
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return __hadd(a, b); }
|
||||
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return __hmul(a, b); }
|
||||
};
|
||||
|
||||
template<typename F>
|
||||
__device__ int compare(F a, F b) {
|
||||
union { typename float_traits<F>::uint_t ua; F fa; };
|
||||
union { typename float_traits<F>::uint_t ub; F fb; };
|
||||
ua=0; ub=0;
|
||||
fa=a; fb=b;
|
||||
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
|
||||
return ua < ub ? ub-ua : ua-ub;
|
||||
}
|
||||
|
||||
struct xoshiro256ss {
|
||||
uint64_t s[4];
|
||||
__device__ xoshiro256ss(int seed) {
|
||||
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
|
||||
for(int i=0; i < 4; i++)
|
||||
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
|
||||
}
|
||||
__device__ uint64_t operator()() {
|
||||
auto rol64 = [](uint64_t x, int k) {
|
||||
return (x << k) | (x >> (64 - k));
|
||||
};
|
||||
uint64_t const result = rol64(s[1] * 5, 7) * 9;
|
||||
uint64_t const t = s[1] << 17;
|
||||
s[2] ^= s[0];
|
||||
s[3] ^= s[1];
|
||||
s[1] ^= s[2];
|
||||
s[0] ^= s[3];
|
||||
s[2] ^= t;
|
||||
s[3] = rol64(s[3], 45);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<typename F>
|
||||
__global__ void kernel() {
|
||||
using traits = float_traits<F>;
|
||||
constexpr int samps = 4<<10;
|
||||
__shared__ F accf[samps];
|
||||
__shared__ double accd[samps];
|
||||
|
||||
xoshiro256ss rng(threadIdx.x);
|
||||
float expo_avg = 1;
|
||||
for(int pass=0; pass < 2; pass++) {
|
||||
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
|
||||
int err_max = 0;
|
||||
float coef = 0;
|
||||
double expo_sum = 0;
|
||||
int expo_n = 0;
|
||||
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
|
||||
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
|
||||
//for(int round=0; round < 2; round++) {
|
||||
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
|
||||
accf[i] = 0;
|
||||
accd[i] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
for(int r=0; r < max_ranks; r++) {
|
||||
int err = 0;
|
||||
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
|
||||
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
|
||||
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
|
||||
F f = traits::make(d);
|
||||
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
|
||||
accd[i] += traits::todouble(f);
|
||||
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
|
||||
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
|
||||
err = err > e ? err : e;
|
||||
}
|
||||
err = __reduce_max_sync(-1u, err);
|
||||
err_max = err_max > err ? err_max : err;
|
||||
if (r >= 2) {
|
||||
// err = 1 + coef*pow(r,expo)
|
||||
float c = float(err-1)/powf(float(r), expo_avg);
|
||||
coef = coef > c ? coef : c;
|
||||
}
|
||||
if (r >= 2) {
|
||||
double expo = log2f(1+err_max)/log2f(r);
|
||||
expo_sum += expo;
|
||||
expo_n++;
|
||||
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(pass==0)
|
||||
expo_avg = expo_sum/expo_n;
|
||||
else if(threadIdx.x == 0)
|
||||
std::printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::printf("type=float:\n");
|
||||
kernel<float><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=half:\n");
|
||||
kernel<half><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=bfloat16:\n");
|
||||
kernel<bfloat16><<<1,32>>>();
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,59 @@
|
||||
#ifndef _d41d8cd98f00b204e9800998ecf8427e
|
||||
#define _d41d8cd98f00b204e9800998ecf8427e
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Routines for launching kernels that verify reduction results. A significant
|
||||
* feature of these routines is they carefully craft floating point input
|
||||
* to produce exactly predictable output.
|
||||
*
|
||||
* int elt_ty: actually just a ncclDataType_t
|
||||
*
|
||||
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
|
||||
* created, these are encoded as the value ncclNumOps and their scalar is
|
||||
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
|
||||
*
|
||||
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
|
||||
*
|
||||
* intptr_t elt_ix0: index of first element pointed to by elts when generating
|
||||
* random values. This makes it possible to generate subsequences independently
|
||||
* as well as in aggregate.
|
||||
*
|
||||
* int rank_n: Number of contributions into the reduction. Non-reduction
|
||||
* collectives like broadcast, gather, etc will always set this to one.
|
||||
*
|
||||
* int rank_me: Index of this contribution
|
||||
*/
|
||||
|
||||
// Use this as the local scalar for PreMulSum ops
|
||||
template<typename T>
|
||||
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
|
||||
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
|
||||
}
|
||||
|
||||
// Enqueue kernel to generate data which is to be reduced.
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to generate expected results of reduction.
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, cudaStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to verify reduced data matches expectation. The number of
|
||||
// failed elements is written to bad_elt_n which must be in cudaHost memory.
|
||||
// If `expected == nullptr` then the expected results are generated on-the-fly
|
||||
// which can be costly. Thus if you plan to run the same reduction multiple
|
||||
// times it is advantageous to precompute the expected values with
|
||||
// ncclVerifiablePrepareExpected and pass them as `expected` here.
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
);
|
||||
#endif
|
||||
@@ -0,0 +1,11 @@
|
||||
# We requires both of the following paths to be set upon including this makefile
|
||||
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
|
||||
|
||||
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
|
||||
@printf "Compiling %s\n" $@
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
$(NVCC) -o $@ $(NVCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
Reference in New Issue
Block a user