Merge remote-tracking branch 'nccl-tests/master' into develop

Этот коммит содержится в:
BertanDogancay
2025-07-23 14:22:18 -05:00
родитель 2c255c4763 97ee098516
Коммит 50a26637fb
10 изменённых файлов: 278 добавлений и 63 удалений
+12 -7
Просмотреть файл
@@ -4,7 +4,7 @@ These tests check both the performance and the correctness of RCCL operations. T
## Build
To build the tests, just type `make`.
To build the tests, just type `make` or `make -j`
If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.
@@ -75,12 +75,14 @@ RCCL Tests can run on multiple processes, multiple threads, and multiple HIP dev
### Quick examples
Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
```shell
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
```shell
$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
```
@@ -138,8 +140,8 @@ All tests support the same set of arguments :
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
* `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
* `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
* `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default: disabled.
* `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
* `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
* `-O,--out_of_place <0=in-place only, 1=out-of-place only>`. Default: both.
* `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
@@ -158,9 +160,12 @@ with the same color will end up in the same group. The resulting group is printe
`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
Here are a few examples:
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
@@ -178,6 +183,6 @@ $ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3
## Copyright
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
+28 -4
Просмотреть файл
@@ -1,9 +1,10 @@
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
include common.mk
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/x86_64-linux-gnu
@@ -21,6 +22,10 @@ HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
MPI ?= 0 # Set to 1 to enable MPI support (multi-process/multi-node)
NAME_SUFFIX ?= # e.g. _mpi when using MPI=1
DSO ?= 0 # Set to 1 to create and use libverifiable.so to reduce binary size
HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
@@ -126,7 +131,7 @@ DST_DIR := $(BUILDDIR)
SRC_FILES := $(wildcard *.cu)
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
GIT_REV := $(shell git log --pretty=format:'%h' -n 1)
@@ -157,20 +162,39 @@ ${HIPIFY_DIR}/%.h: %.h
@mkdir -p ${HIPIFY_DIR}
hipify-perl -quiet-warnings $< > $@
.PRECIOUS: ${DST_DIR}/%.o
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
${DST_DIR}/%$(NAME_SUFFIX).o: %.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
$(CXX) $(CXXFLAGS) -o $@ -c $<
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
ifeq ($(DSO), 1)
${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) $(DST_DIR)/src/git_version.cpp
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
$(HIPCC) -o $@ $^ $(HIPLDFLAGS) -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
else
${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
endif
clean_intermediates:
rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
+87 -28
Просмотреть файл
@@ -41,19 +41,19 @@ bool IsArchMatch(char const* arch, char const* target) {
#if NCCL_MAJOR >= 2
ncclDataType_t test_types[ncclNumTypes] = {
ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
#if RCCL_BFLOAT16 == 1
#if HAVE_BF16
, ncclBfloat16
#endif
#if RCCL_FLOAT8 == 1
#if HAVE_FP8
, ncclFloat8e4m3, ncclFloat8e5m2
#endif
};
const char *test_typenames[ncclNumTypes] = {
"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
#if RCCL_BFLOAT16 == 1
#if HAVE_BF16
, "bfloat16"
#endif
#if RCCL_FLOAT8 == 1
#if HAVE_FP8
, "fp8_e4m3", "fp8_e5m2"
#endif
};
@@ -122,8 +122,11 @@ static int enable_in_place = 1;
static int enable_cache_flush = 0;
static int enable_rotating_tensor = 0;
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
#define LOCAL_REGISTER 1
#define SYMMETRIC_REGISTER 2
static int local_register = 0;
#endif
static int minCudaArch = 1<<30;
Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) {
if (!fileName.empty()) {
@@ -203,7 +206,6 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
}
bool Reporter::isMainThread() { return is_main_thread == 1; }
static int minCudaArch = 1<<30;
#define NUM_BLOCKS 32
@@ -550,10 +552,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
union {
int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
half f16; float f32; double f64;
#if defined(RCCL_BFLOAT16)
#if HAVE_BF16
hip_bfloat16 bf16;
#endif
#if defined(RCCL_FLOAT8)
#if HAVE_FP8
rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
#endif
};
@@ -567,14 +569,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
#if defined(RCCL_BFLOAT16)
#if HAVE_BF16
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
#endif
#if defined(RCCL_FLOAT8)
#if HAVE_FP8
case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
#endif
case ncclNumTypes: break;
default: break; // Just to silence clang
}
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
}
@@ -957,20 +959,38 @@ testResult_t threadInit(struct threadArgs* args) {
}
NCCLCHECK(ncclGroupEnd());
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclGroupStart());
void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
for (int i=0; i<args->nGpus; i++) {
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
} else
#endif
{
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
TESTCHECK(threadRunTests(args));
for (int i=0; i<args->nGpus; i++) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
} else
#endif
{
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
}
#endif
NCCLCHECK(ncclCommDestroy(args->comms[i]));
}
@@ -1046,17 +1066,20 @@ int main(int argc, char* argv[]) {
test_typenum = 9;
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
test_opnum++; // ncclAvg
#if defined(RCCL_BFLOAT16)
test_typenum++; // bfloat16
#endif
#if defined(RCCL_FLOAT8)
test_typenum++; // fp8_e4m3
test_typenum++; // fp8_e5m2
#endif
}
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
test_opnum++; // PreMulSum
}
#if defined(RCCL_BFLOAT16)
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
test_typenum++; // bfloat16
}
#endif
#if defined(RCCL_FLOAT8)
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
test_typenum += 2; // fp8 e4m3,e5m2
}
#endif
#endif
// Parse args
@@ -1194,8 +1217,10 @@ int main(int argc, char* argv[]) {
break;
case 'R':
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if ((int)strtol(optarg, NULL, 0)) {
local_register = 1;
local_register = (int)strtol(optarg, NULL, 0);
if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
local_register = LOCAL_REGISTER;
}
#else
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
@@ -1269,7 +1294,7 @@ int main(int argc, char* argv[]) {
"[-G,--cudagraph <num graph launches>] \n\t"
"[-C,--report_cputime <0/1>] \n\t"
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
"[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
"[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
"[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
"[-O,--out_of_place <0/1>] \n\t"
@@ -1486,6 +1511,22 @@ testResult_t run() {
#ifdef MPI_SUPPORT
MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
#endif
#if defined(RCCL_FLOAT8)
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
int n = 0;
for (int i=0; i < test_typenum; i++) {
if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
test_types[n] = test_types[i];
test_typenames[n] = test_typenames[i];
n += 1;
}
}
test_typenum = n;
}
}
#endif
//if parallel init is not selected, use main thread to initialize NCCL
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
@@ -1504,12 +1545,22 @@ testResult_t run() {
NCCLCHECK(ncclGroupEnd());
}
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
NCCLCHECK(ncclGroupStart());
sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
for (int i=0; i<nGpus*nThreads; i++) {
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &sendbuffs[i], maxBytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &recvbuffs[i], maxBytes, &recvRegHandles[i]));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
} else
#endif
{
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
}
}
NCCLCHECK(ncclGroupEnd());
#endif
}
@@ -1607,8 +1658,16 @@ testResult_t run() {
if (!parallel_init) {
for(int i=0; i<nGpus*nThreads; ++i) {
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
} else
#endif
{
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
}
#endif
NCCLCHECK(ncclCommDestroy(comms[i]));
}
+24 -2
Просмотреть файл
@@ -252,20 +252,42 @@ static uint64_t getHostHash(const char* hostname) {
return getHash(hostHash, strlen(hostHash));
}
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
#define HAVE_BF16 1
#else
#define HAVE_BF16 0
#endif
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
#define HAVE_FP8 1
#else
#define HAVE_FP8 0
#endif
#if NCCL_MAJOR >= 2
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
#undef HAVE_BF16
#define HAVE_BF16 1
#if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
#undef HAVE_FP8
#define HAVE_FP8 1
#endif
#endif
#endif
static size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar:
#if NCCL_MAJOR >= 2
//case ncclInt8:
case ncclUint8:
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
#if HAVE_FP8
case ncclFloat8e4m3:
case ncclFloat8e5m2:
#endif
#endif
return 1;
case ncclHalf:
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
#if HAVE_BF16
case ncclBfloat16:
#endif
//case ncclFloat16:
+80
Просмотреть файл
@@ -0,0 +1,80 @@
#
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
CUDA_HOME ?= /usr/local/cuda
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
CUDA_LIB ?= $(CUDA_HOME)/lib64
CUDA_INC ?= $(CUDA_HOME)/include
NVCC ?= $(CUDA_HOME)/bin/nvcc
CUDARTLIB ?= cudart
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
# Better define NVCC_GENCODE in your environment to the minimal set
# of archs to reduce compile time.
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_100,code=sm_100 \
-gencode=arch=compute_120,code=sm_120 \
-gencode=arch=compute_120,code=compute_120
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
# Include Blackwell support if we're using CUDA12.8 or above
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_100,code=sm_100 \
-gencode=arch=compute_120,code=sm_120 \
-gencode=arch=compute_120,code=compute_120
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_90,code=sm_90 \
-gencode=arch=compute_90,code=compute_90
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_80,code=sm_80 \
-gencode=arch=compute_80,code=compute_80
else
NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_70,code=sm_70 \
-gencode=arch=compute_70,code=compute_70
endif
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
CXXFLAGS := -std=c++11
LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
ifeq ($(DEBUG), 0)
NVCUFLAGS += -O3 -g
CXXFLAGS += -O3 -g
else
NVCUFLAGS += -O0 -G -g
CXXFLAGS += -O0 -g -ggdb3
endif
ifneq ($(VERBOSE), 0)
NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
else
.SILENT:
endif
+3 -3
Просмотреть файл
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
@@ -68,7 +68,7 @@ include verifiable.mk
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
@printf "Linking %s\n" $@
@mkdir -p $(DST_DIR)
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(HIPLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
+18
Просмотреть файл
@@ -0,0 +1,18 @@
/*************************************************************************
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#include <cuda_runtime.h>
#include <iostream>
#define NCCL_VERIFIABLE_SELF_TEST 1
#include "verifiable.h"
int main(int arg_n, char **args) {
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
cudaSetDevice(0);
ncclVerifiableLaunchSelfTest();
cudaDeviceSynchronize();
return 0;
}
+10 -15
Просмотреть файл
@@ -105,7 +105,7 @@ template<typename T>
struct IsIntegral: std::is_integral<T> {};
template<>
struct IsIntegral<__half>: std::false_type {};
#if RCCL_BFLOAT16 == 1
#if HAVE_ncclBfloat16
template<>
struct IsIntegral<hip_bfloat16>: std::false_type {};
#endif
@@ -150,6 +150,10 @@ namespace {
__host__ __device__ __half castTo<__half>(float x) {
return __float2half(x);
}
// template<>
// __host__ __device__ __half castTo<half>(double x) {
// return __double2half(x);
// }
template<>
__host__ __device__ half castTo<__half>(uint64_t x) {
return __ull2half_rn(x);
@@ -864,7 +868,7 @@ __host__ __device__ void genOutput(
namespace {
template<typename T>
__host__ __device__ void genInput(
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
std::false_type /*integral*/
) {
// We can't control the nranks divisor in avareages so to control error we
@@ -953,7 +957,6 @@ __host__ __device__ T genOutput(
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void __launch_bounds__(512, 1) prepareInput2(
@@ -1040,11 +1043,9 @@ hipError_t ncclVerifiablePrepareInput(
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T, typename ReduceFn>
__global__ void __launch_bounds__(512, 1) prepareExpected2(
@@ -1130,7 +1131,6 @@ hipError_t ncclVerifiablePrepareExpected(
}
#undef CASE_OP
}
#endif
////////////////////////////////////////////////////////////////////////////////
@@ -1151,7 +1151,6 @@ __host__ __device__ uint64_t calcDelta(T a, T b) {
////////////////////////////////////////////////////////////////////////////////
#if !SELF_TEST
namespace {
template<typename T>
__global__ void __launch_bounds__(512, 1) verifyPrepared(
@@ -1245,7 +1244,6 @@ hipError_t verifyInline1(
ReduceAvg opavg{rank_n};
ReducePreMulSum oppremulsum;
void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
#define CASE_OP(op) \
if(rank_n == 1) { \
fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
@@ -1336,13 +1334,10 @@ hipError_t ncclVerifiableVerify(
}
#undef CASE_TY
}
#endif
////////////////////////////////////////////////////////////////////////////////
#if SELF_TEST
#include <iostream>
namespace {
template<typename T, typename Op>
__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
//if(!std::is_same<T,half>::value) return;
@@ -1397,16 +1392,16 @@ __global__ void __launch_bounds__(512, 1) sweep() {
#if HAVE_ncclBfloat16
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
#endif
#if HAVE_ncclfp8 && __HIP_DEVICE_COMPILE__
#if HAVE_ncclfp8_DEVICE
sweep1<rccl_float8>(ncclFloat8e4m3, "fp8_e4m3");
sweep1<rccl_bfloat8>(ncclFloat8e5m2, "fp8_e5m2");
#endif
sweep1<float>(ncclFloat32, "float");
sweep1<double>(ncclFloat64, "double");
}
}
void ncclVerifiableLaunchSelfTest() {
sweep<<<1,512>>>();
sweep<<<1,512>>>();
}
#endif
#endif
+5
Просмотреть файл
@@ -63,4 +63,9 @@ hipError_t ncclVerifiableVerify(
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, cudaStream_t stream
);
#ifdef NCCL_VERIFIABLE_SELF_TEST
void ncclVerifiableLaunchSelfTest();
#endif
#endif
+11 -4
Просмотреть файл
@@ -1,15 +1,16 @@
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
# We requires both of the following paths to be set upon including this makefile
# We require both of the following paths to be set upon including this makefile
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
TEST_VERIFIABLE_LIBS = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
@printf "Hipifying %-35s > %s\n" $< $@
@@ -31,3 +32,9 @@ $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPI
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
@printf "Creating DSO %s\n" $@
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
ln -sf $(notdir $@).0 $@