Merge remote-tracking branch 'nccl-tests/master' into develop
Этот коммит содержится в:
+12
-7
@@ -4,7 +4,7 @@ These tests check both the performance and the correctness of RCCL operations. T
|
||||
|
||||
## Build
|
||||
|
||||
To build the tests, just type `make`.
|
||||
To build the tests, just type `make` or `make -j`
|
||||
|
||||
If HIP is not installed in `/opt/rocm`, you may specify `HIP_HOME`. Similarly, if RCCL (`librccl.so`) is not installed in `/opt/rocm/lib/`, you may specify `NCCL_HOME` and `CUSTOM_RCCL_LIB`.
|
||||
|
||||
@@ -75,12 +75,14 @@ RCCL Tests can run on multiple processes, multiple threads, and multiple HIP dev
|
||||
### Quick examples
|
||||
|
||||
Run on single node with 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
|
||||
|
||||
```shell
|
||||
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
|
||||
```
|
||||
|
||||
Run 64 MPI processes on nodes with 8 GPUs each, for a total of 64 GPUs spread across 8 nodes :
|
||||
(NB: The rccl-tests binaries must be compiled with `MPI=1` for this case)
|
||||
|
||||
```shell
|
||||
$ mpirun -np 64 -N 8 ./build/all_reduce_perf -b 8 -e 8G -f 2 -g 1
|
||||
```
|
||||
@@ -138,8 +140,8 @@ All tests support the same set of arguments :
|
||||
* `-z,--blocking <0/1>` Make RCCL collective blocking, i.e. have CPUs wait and sync after each collective. Default : 0.
|
||||
* `-G,--hipgraph <num graph launches>` Capture iterations as a HIP graph and then replay specified number of times. Default : 0.
|
||||
* `-C,--report_cputime <0/1>]` Report CPU time instead of latency. Default : 0.
|
||||
* `-R,--local_register <1/0>` enable local buffer registration on send/recv buffers. Default : 0.
|
||||
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default: disabled.
|
||||
* `-R,--local_register <0/1/2>` enable local (1) or symmetric (2) buffer registration on send/recv buffers. Default : 0.
|
||||
* `-T,--timeout <time in seconds>` timeout each test after specified number of seconds. Default : disabled.
|
||||
* `-F,--cache_flush <cache flush after every -F iteration>` Enable cache flush after every -F iteration. Default : 0 (No cache flush).
|
||||
* `-O,--out_of_place <0=in-place only, 1=out-of-place only>`. Default: both.
|
||||
* `-q,--delay <delay>` Delay between out-of-place and in-place runs (in microseconds). Default: 10.
|
||||
@@ -158,9 +160,12 @@ with the same color will end up in the same group. The resulting group is printe
|
||||
`NCCL_TESTS_SPLIT_MASK="<value>"` is equivalent to `NCCL_TESTS_SPLIT="&<value>"`.
|
||||
|
||||
Here are a few examples:
|
||||
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating on the network)
|
||||
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
|
||||
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
|
||||
|
||||
- `NCCL_TESTS_SPLIT="AND 0x7"` or `NCCL_TESTS_SPLIT="MOD 8"`: On systems with 8 GPUs, run 8 parallel operations, each with 1 GPU per node (purely communicating over the inter-node network)
|
||||
|
||||
- `NCCL_TESTS_SPLIT="OR 0x7"` or `NCCL_TESTS_SPLIT="DIV 8"`: On systems with 8 GPUs, run one operation per node, purely intra-node.
|
||||
|
||||
- `NCCL_TESTS_SPLIT="AND 0x1"` or `NCCL_TESTS_SPLIT="MOD 2"`: Run two operations, each operation using every other rank.
|
||||
|
||||
Note that the reported bandwidth is per group, hence to get the total bandwidth used by all groups, one must multiply by the number of groups.
|
||||
|
||||
@@ -178,6 +183,6 @@ $ LD_LIBRARY_PATH=/path/to/rccl-install/lib/ HSA_FORCE_FINE_GRAIN_PCIE=1 python3
|
||||
|
||||
## Copyright
|
||||
|
||||
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2024, NVIDIA CORPORATION. All rights reserved.
|
||||
NCCL tests are provided under the BSD license. All source code and accompanying documentation is copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
All modifications are copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
+28
-4
@@ -1,9 +1,10 @@
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
include common.mk
|
||||
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
MPI_HOME ?= /usr/lib/x86_64-linux-gnu
|
||||
@@ -21,6 +22,10 @@ HIPCUFLAGS := -std=c++14
|
||||
LDFLAGS :=
|
||||
HIPLDFLAGS :=
|
||||
|
||||
MPI ?= 0 # Set to 1 to enable MPI support (multi-process/multi-node)
|
||||
NAME_SUFFIX ?= # e.g. _mpi when using MPI=1
|
||||
DSO ?= 0 # Set to 1 to create and use libverifiable.so to reduce binary size
|
||||
|
||||
HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
|
||||
HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
|
||||
HIP_MINOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 2)
|
||||
@@ -126,7 +131,7 @@ DST_DIR := $(BUILDDIR)
|
||||
SRC_FILES := $(wildcard *.cu)
|
||||
OBJ_FILES := $(SRC_FILES:%.cu=${DST_DIR}/%.o)
|
||||
BIN_FILES_LIST := all_reduce all_gather broadcast reduce_scatter reduce alltoall scatter gather sendrecv alltoallv hypercube
|
||||
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf)
|
||||
BIN_FILES := $(BIN_FILES_LIST:%=${DST_DIR}/%_perf${NAME_SUFFIX})
|
||||
|
||||
GIT_VERSION_FILE := ${DST_DIR}/src/git_version.cpp
|
||||
GIT_REV := $(shell git log --pretty=format:'%h' -n 1)
|
||||
@@ -157,20 +162,39 @@ ${HIPIFY_DIR}/%.h: %.h
|
||||
@mkdir -p ${HIPIFY_DIR}
|
||||
hipify-perl -quiet-warnings $< > $@
|
||||
|
||||
.PRECIOUS: ${DST_DIR}/%.o
|
||||
|
||||
${DST_DIR}/%.o: ${HIPIFY_DIR}/%.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
|
||||
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
|
||||
|
||||
${DST_DIR}/%$(NAME_SUFFIX).o: %.cu.cpp ${HIPIFY_DIR}/common.h $(TEST_VERIFIABLE_HDRS) $(GIT_VERSION_FILE)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<"
|
||||
$(HIPCC) $(HIPCUFLAGS) -I. -c -o $@ $<
|
||||
|
||||
${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
|
||||
ifeq ($(DSO), 1)
|
||||
${DST_DIR}/%_perf$(NAME_SUFFIX): ${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_LIBS) $(DST_DIR)/src/git_version.cpp
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
|
||||
$(HIPCC) -o $@ $^ $(HIPLDFLAGS) -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable -Xlinker "--enable-new-dtags" -Xlinker "-rpath,\$$ORIGIN:\$$ORIGIN/verifiable"
|
||||
else
|
||||
${DST_DIR}/%_perf$(NAME_SUFFIX):${DST_DIR}/%.o ${DST_DIR}/common$(NAME_SUFFIX).o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS) $(DST_DIR)/src/git_version.cpp
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $^ $(HIPLDFLAGS)"
|
||||
$(HIPCC) -o $@ $^ $(HIPLDFLAGS)
|
||||
endif
|
||||
|
||||
clean_intermediates:
|
||||
rm -f ${DST_DIR}/*.o $(TEST_VERIFIABLE_OBJS)
|
||||
|
||||
|
||||
+87
-28
@@ -41,19 +41,19 @@ bool IsArchMatch(char const* arch, char const* target) {
|
||||
#if NCCL_MAJOR >= 2
|
||||
ncclDataType_t test_types[ncclNumTypes] = {
|
||||
ncclInt8, ncclUint8, ncclInt32, ncclUint32, ncclInt64, ncclUint64, ncclHalf, ncclFloat, ncclDouble
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
#if HAVE_BF16
|
||||
, ncclBfloat16
|
||||
#endif
|
||||
#if RCCL_FLOAT8 == 1
|
||||
#if HAVE_FP8
|
||||
, ncclFloat8e4m3, ncclFloat8e5m2
|
||||
#endif
|
||||
};
|
||||
const char *test_typenames[ncclNumTypes] = {
|
||||
"int8", "uint8", "int32", "uint32", "int64", "uint64", "half", "float", "double"
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
#if HAVE_BF16
|
||||
, "bfloat16"
|
||||
#endif
|
||||
#if RCCL_FLOAT8 == 1
|
||||
#if HAVE_FP8
|
||||
, "fp8_e4m3", "fp8_e5m2"
|
||||
#endif
|
||||
};
|
||||
@@ -122,8 +122,11 @@ static int enable_in_place = 1;
|
||||
static int enable_cache_flush = 0;
|
||||
static int enable_rotating_tensor = 0;
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
#define LOCAL_REGISTER 1
|
||||
#define SYMMETRIC_REGISTER 2
|
||||
static int local_register = 0;
|
||||
#endif
|
||||
static int minCudaArch = 1<<30;
|
||||
|
||||
Reporter::Reporter(std::string fileName, std::string outputFormat) : _outputFormat(outputFormat) {
|
||||
if (!fileName.empty()) {
|
||||
@@ -203,7 +206,6 @@ void Reporter::addResult(int gpusPerRank, int ranksPerNode, int totalRanks, size
|
||||
}
|
||||
|
||||
bool Reporter::isMainThread() { return is_main_thread == 1; }
|
||||
static int minCudaArch = 1<<30;
|
||||
|
||||
#define NUM_BLOCKS 32
|
||||
|
||||
@@ -550,10 +552,10 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
union {
|
||||
int8_t i8; uint8_t u8; int32_t i32; uint32_t u32; int64_t i64; uint64_t u64;
|
||||
half f16; float f32; double f64;
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#if HAVE_BF16
|
||||
hip_bfloat16 bf16;
|
||||
#endif
|
||||
#if defined(RCCL_FLOAT8)
|
||||
#if HAVE_FP8
|
||||
rccl_float8 fp8_e4m3; rccl_bfloat8 fp8_e5m2;
|
||||
#endif
|
||||
};
|
||||
@@ -567,14 +569,14 @@ testResult_t startColl(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
||||
case ncclFloat16: f16 = ncclVerifiablePremulScalar<half>(rank); break;
|
||||
case ncclFloat32: f32 = ncclVerifiablePremulScalar<float>(rank); break;
|
||||
case ncclFloat64: f64 = ncclVerifiablePremulScalar<double>(rank); break;
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
#if HAVE_BF16
|
||||
case ncclBfloat16: bf16 = ncclVerifiablePremulScalar<hip_bfloat16>(rank); break;
|
||||
#endif
|
||||
#if defined(RCCL_FLOAT8)
|
||||
#if HAVE_FP8
|
||||
case ncclFloat8e4m3: fp8_e4m3 = ncclVerifiablePremulScalar<rccl_float8>(rank); break;
|
||||
case ncclFloat8e5m2 : fp8_e5m2 = ncclVerifiablePremulScalar<rccl_bfloat8>(rank); break;
|
||||
#endif
|
||||
case ncclNumTypes: break;
|
||||
default: break; // Just to silence clang
|
||||
}
|
||||
NCCLCHECK(ncclRedOpCreatePreMulSum(&op, &u64, type, ncclScalarHostImmediate, args->comms[i]));
|
||||
}
|
||||
@@ -957,20 +959,38 @@ testResult_t threadInit(struct threadArgs* args) {
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
void **sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*args->nGpus) : NULL;
|
||||
void **recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*args->nGpus) : NULL;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
|
||||
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
|
||||
NCCLCHECK(ncclCommWindowRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->sendbuffs[i], args->maxbytes, &sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(args->comms[i], args->recvbuffs[i], args->maxbytes, &recvRegHandles[i]));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
#endif
|
||||
|
||||
TESTCHECK(threadRunTests(args));
|
||||
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
|
||||
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)sendRegHandles[i]));
|
||||
NCCLCHECK(ncclCommWindowDeregister(args->comms[i], (ncclWindow_t)recvRegHandles[i]));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(args->comms[i], recvRegHandles[i]));
|
||||
}
|
||||
#endif
|
||||
NCCLCHECK(ncclCommDestroy(args->comms[i]));
|
||||
}
|
||||
@@ -1046,17 +1066,20 @@ int main(int argc, char* argv[]) {
|
||||
test_typenum = 9;
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
|
||||
test_opnum++; // ncclAvg
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
test_typenum++; // bfloat16
|
||||
#endif
|
||||
#if defined(RCCL_FLOAT8)
|
||||
test_typenum++; // fp8_e4m3
|
||||
test_typenum++; // fp8_e5m2
|
||||
#endif
|
||||
}
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,11,0) && test_ncclVersion >= NCCL_VERSION(2,11,0)) {
|
||||
test_opnum++; // PreMulSum
|
||||
}
|
||||
#if defined(RCCL_BFLOAT16)
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
|
||||
test_typenum++; // bfloat16
|
||||
}
|
||||
#endif
|
||||
#if defined(RCCL_FLOAT8)
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0) && test_ncclVersion >= NCCL_VERSION(2,10,0)) {
|
||||
test_typenum += 2; // fp8 e4m3,e5m2
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// Parse args
|
||||
@@ -1194,8 +1217,10 @@ int main(int argc, char* argv[]) {
|
||||
break;
|
||||
case 'R':
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
if ((int)strtol(optarg, NULL, 0)) {
|
||||
local_register = 1;
|
||||
local_register = (int)strtol(optarg, NULL, 0);
|
||||
if (local_register == SYMMETRIC_REGISTER && test_ncclVersion < NCCL_VERSION(2,27,0)) {
|
||||
printf("Option -R 2 (symmetric) is not supported before NCCL 2.27. Defaulting to local registration\n");
|
||||
local_register = LOCAL_REGISTER;
|
||||
}
|
||||
#else
|
||||
printf("Option -R (register) is not supported before NCCL 2.19. Ignoring\n");
|
||||
@@ -1269,7 +1294,7 @@ int main(int argc, char* argv[]) {
|
||||
"[-G,--cudagraph <num graph launches>] \n\t"
|
||||
"[-C,--report_cputime <0/1>] \n\t"
|
||||
"[-a,--average <0/1/2/3> report average iteration time <0=RANK0/1=AVG/2=MIN/3=MAX>] \n\t"
|
||||
"[-R,--local_register <1/0> enable local buffer registration on send/recv buffers (default: disable)] \n\t"
|
||||
"[-R,--local_register <0/1/2> enable local (1) or symmetric (2) buffer registration on send/recv buffers (default: disable (0))] \n\t"
|
||||
"[-Y,--memory_type <coarse/fine/host/managed>] \n\t"
|
||||
"[-u,--cumask <d0,d1,d2,d3>] \n\t"
|
||||
"[-O,--out_of_place <0/1>] \n\t"
|
||||
@@ -1486,6 +1511,22 @@ testResult_t run() {
|
||||
#ifdef MPI_SUPPORT
|
||||
MPI_Allreduce(MPI_IN_PLACE, &minCudaArch, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
|
||||
#endif
|
||||
#if defined(RCCL_FLOAT8)
|
||||
if (NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0) && test_ncclVersion >= NCCL_VERSION(2,24,0)) {
|
||||
if (minCudaArch < 900) { // Filter out fp8 on pre-Hopper hardware
|
||||
int n = 0;
|
||||
for (int i=0; i < test_typenum; i++) {
|
||||
if (!(test_types[i] == ncclFloat8e4m3 || test_types[i] == ncclFloat8e5m2)) {
|
||||
test_types[n] = test_types[i];
|
||||
test_typenames[n] = test_typenames[i];
|
||||
n += 1;
|
||||
}
|
||||
}
|
||||
test_typenum = n;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
//if parallel init is not selected, use main thread to initialize NCCL
|
||||
ncclComm_t* comms = (ncclComm_t*)malloc(sizeof(ncclComm_t)*nThreads*nGpus);
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
@@ -1504,12 +1545,22 @@ testResult_t run() {
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
}
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
sendRegHandles = (local_register) ? (void **)malloc(sizeof(*sendRegHandles)*nThreads*nGpus) : NULL;
|
||||
recvRegHandles = (local_register) ? (void **)malloc(sizeof(*recvRegHandles)*nThreads*nGpus) : NULL;
|
||||
for (int i=0; i<nGpus*nThreads; i++) {
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &sendbuffs[i], maxBytes, &sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], &recvbuffs[i], maxBytes, &recvRegHandles[i]));
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
|
||||
NCCLCHECK(ncclCommWindowRegister(comms[i], sendbuffs[i], maxBytes, (ncclWindow_t*)&sendRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
|
||||
NCCLCHECK(ncclCommWindowRegister(comms[i], recvbuffs[i], maxBytes, (ncclWindow_t*)&recvRegHandles[i], NCCL_WIN_COLL_SYMMETRIC));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], sendbuffs[i], maxBytes, &sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommRegister(comms[i], recvbuffs[i], maxBytes, &recvRegHandles[i]));
|
||||
}
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -1607,8 +1658,16 @@ testResult_t run() {
|
||||
if (!parallel_init) {
|
||||
for(int i=0; i<nGpus*nThreads; ++i) {
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,19,0)
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,27,0)
|
||||
if (test_ncclVersion >= NCCL_VERSION(2,27,0) && (local_register == SYMMETRIC_REGISTER)) {
|
||||
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)sendRegHandles[i]));
|
||||
NCCLCHECK(ncclCommWindowDeregister(comms[i], (ncclWindow_t)recvRegHandles[i]));
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], sendRegHandles[i]));
|
||||
if (local_register) NCCLCHECK(ncclCommDeregister(comms[i], recvRegHandles[i]));
|
||||
}
|
||||
#endif
|
||||
NCCLCHECK(ncclCommDestroy(comms[i]));
|
||||
}
|
||||
|
||||
+24
-2
@@ -252,20 +252,42 @@ static uint64_t getHostHash(const char* hostname) {
|
||||
return getHash(hostHash, strlen(hostHash));
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
#define HAVE_BF16 1
|
||||
#else
|
||||
#define HAVE_BF16 0
|
||||
#endif
|
||||
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
|
||||
#define HAVE_FP8 1
|
||||
#else
|
||||
#define HAVE_FP8 0
|
||||
#endif
|
||||
|
||||
#if NCCL_MAJOR >= 2
|
||||
#if defined(__CUDA_BF16_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,10,0)
|
||||
#undef HAVE_BF16
|
||||
#define HAVE_BF16 1
|
||||
#if defined(__CUDA_FP8_TYPES_EXIST__) && NCCL_VERSION_CODE >= NCCL_VERSION(2,24,0)
|
||||
#undef HAVE_FP8
|
||||
#define HAVE_FP8 1
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static size_t wordSize(ncclDataType_t type) {
|
||||
switch(type) {
|
||||
case ncclChar:
|
||||
#if NCCL_MAJOR >= 2
|
||||
//case ncclInt8:
|
||||
case ncclUint8:
|
||||
#if NCCL_MAJOR >= 2 && RCCL_FLOAT8 == 1
|
||||
#if HAVE_FP8
|
||||
case ncclFloat8e4m3:
|
||||
case ncclFloat8e5m2:
|
||||
#endif
|
||||
#endif
|
||||
return 1;
|
||||
case ncclHalf:
|
||||
#if NCCL_MAJOR >= 2 && RCCL_BFLOAT16 == 1
|
||||
#if HAVE_BF16
|
||||
case ncclBfloat16:
|
||||
#endif
|
||||
//case ncclFloat16:
|
||||
|
||||
@@ -0,0 +1,80 @@
|
||||
#
|
||||
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
CUDA_HOME ?= /usr/local/cuda
|
||||
PREFIX ?= /usr/local
|
||||
VERBOSE ?= 0
|
||||
DEBUG ?= 0
|
||||
|
||||
CUDA_LIB ?= $(CUDA_HOME)/lib64
|
||||
CUDA_INC ?= $(CUDA_HOME)/include
|
||||
NVCC ?= $(CUDA_HOME)/bin/nvcc
|
||||
CUDARTLIB ?= cudart
|
||||
|
||||
CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//'))
|
||||
CUDA_MAJOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 1)
|
||||
CUDA_MINOR = $(shell echo $(CUDA_VERSION) | cut -d "." -f 2)
|
||||
|
||||
# Better define NVCC_GENCODE in your environment to the minimal set
|
||||
# of archs to reduce compile time.
|
||||
ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 13; echo $$?),0)
|
||||
# Add Blackwell but drop Pascal & Volta support if we're using CUDA13.0 or above
|
||||
NVCC_GENCODE ?= -gencode=arch=compute_75,code=sm_75 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100 \
|
||||
-gencode=arch=compute_120,code=sm_120 \
|
||||
-gencode=arch=compute_120,code=compute_120
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -eq 12 -a "0$(CUDA_MINOR)" -ge 8; echo $$?),0)
|
||||
# Include Blackwell support if we're using CUDA12.8 or above
|
||||
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61 \
|
||||
-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_100,code=sm_100 \
|
||||
-gencode=arch=compute_120,code=sm_120 \
|
||||
-gencode=arch=compute_120,code=compute_120
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 12; echo $$?),0)
|
||||
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61 \
|
||||
-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_90,code=sm_90 \
|
||||
-gencode=arch=compute_90,code=compute_90
|
||||
else ifeq ($(shell test "0$(CUDA_MAJOR)" -ge 11; echo $$?),0)
|
||||
NVCC_GENCODE ?= -gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61 \
|
||||
-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_80,code=sm_80 \
|
||||
-gencode=arch=compute_80,code=compute_80
|
||||
else
|
||||
NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \
|
||||
-gencode=arch=compute_50,code=sm_50 \
|
||||
-gencode=arch=compute_60,code=sm_60 \
|
||||
-gencode=arch=compute_61,code=sm_61 \
|
||||
-gencode=arch=compute_70,code=sm_70 \
|
||||
-gencode=arch=compute_70,code=compute_70
|
||||
endif
|
||||
|
||||
NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11
|
||||
CXXFLAGS := -std=c++11
|
||||
|
||||
LDFLAGS := -L${CUDA_LIB} -lcudart -lrt
|
||||
NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt
|
||||
|
||||
ifeq ($(DEBUG), 0)
|
||||
NVCUFLAGS += -O3 -g
|
||||
CXXFLAGS += -O3 -g
|
||||
else
|
||||
NVCUFLAGS += -O0 -G -g
|
||||
CXXFLAGS += -O0 -g -ggdb3
|
||||
endif
|
||||
|
||||
ifneq ($(VERBOSE), 0)
|
||||
NVCUFLAGS += -Xcompiler -Wall,-Wextra,-Wno-unused-parameter
|
||||
else
|
||||
.SILENT:
|
||||
endif
|
||||
@@ -1,5 +1,5 @@
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Copyright (c) 2015-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
@@ -68,7 +68,7 @@ include verifiable.mk
|
||||
|
||||
self_test: $(DST_DIR)/self_test
|
||||
|
||||
$(DST_DIR)/self_test: verifiable.cu verifiable.h
|
||||
$(DST_DIR)/self_test: main.cu $(TEST_VERIFIABLE_LIBS)
|
||||
@printf "Linking %s\n" $@
|
||||
@mkdir -p $(DST_DIR)
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 $< -L$(TEST_VERIFIABLE_BUILDDIR) -lverifiable $(HIPLDFLAGS) -Xlinker "-rpath=\$$ORIGIN"
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#include <cuda_runtime.h>
|
||||
#include <iostream>
|
||||
|
||||
#define NCCL_VERIFIABLE_SELF_TEST 1
|
||||
#include "verifiable.h"
|
||||
|
||||
int main(int arg_n, char **args) {
|
||||
std::cerr<<"You are hoping to see no output beyond this line."<<std::endl;
|
||||
cudaSetDevice(0);
|
||||
ncclVerifiableLaunchSelfTest();
|
||||
cudaDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
@@ -105,7 +105,7 @@ template<typename T>
|
||||
struct IsIntegral: std::is_integral<T> {};
|
||||
template<>
|
||||
struct IsIntegral<__half>: std::false_type {};
|
||||
#if RCCL_BFLOAT16 == 1
|
||||
#if HAVE_ncclBfloat16
|
||||
template<>
|
||||
struct IsIntegral<hip_bfloat16>: std::false_type {};
|
||||
#endif
|
||||
@@ -150,6 +150,10 @@ namespace {
|
||||
__host__ __device__ __half castTo<__half>(float x) {
|
||||
return __float2half(x);
|
||||
}
|
||||
// template<>
|
||||
// __host__ __device__ __half castTo<half>(double x) {
|
||||
// return __double2half(x);
|
||||
// }
|
||||
template<>
|
||||
__host__ __device__ half castTo<__half>(uint64_t x) {
|
||||
return __ull2half_rn(x);
|
||||
@@ -864,7 +868,7 @@ __host__ __device__ void genOutput(
|
||||
namespace {
|
||||
template<typename T>
|
||||
__host__ __device__ void genInput(
|
||||
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
|
||||
T &ans, ReduceAvg, int rank_n, int rank_me, uint64_t rng, intptr_t index,
|
||||
std::false_type /*integral*/
|
||||
) {
|
||||
// We can't control the nranks divisor in avareages so to control error we
|
||||
@@ -953,7 +957,6 @@ __host__ __device__ T genOutput(
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !SELF_TEST
|
||||
namespace {
|
||||
template<typename T, typename ReduceFn>
|
||||
__global__ void __launch_bounds__(512, 1) prepareInput2(
|
||||
@@ -1040,11 +1043,9 @@ hipError_t ncclVerifiablePrepareInput(
|
||||
}
|
||||
#undef CASE_OP
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !SELF_TEST
|
||||
namespace {
|
||||
template<typename T, typename ReduceFn>
|
||||
__global__ void __launch_bounds__(512, 1) prepareExpected2(
|
||||
@@ -1130,7 +1131,6 @@ hipError_t ncclVerifiablePrepareExpected(
|
||||
}
|
||||
#undef CASE_OP
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@@ -1151,7 +1151,6 @@ __host__ __device__ uint64_t calcDelta(T a, T b) {
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if !SELF_TEST
|
||||
namespace {
|
||||
template<typename T>
|
||||
__global__ void __launch_bounds__(512, 1) verifyPrepared(
|
||||
@@ -1245,7 +1244,6 @@ hipError_t verifyInline1(
|
||||
ReduceAvg opavg{rank_n};
|
||||
ReducePreMulSum oppremulsum;
|
||||
void *args[8] = {&results, &elt_n, nullptr, &rank_n, &seed, &elt_ix0, &tolerance, &bad_elt_n};
|
||||
|
||||
#define CASE_OP(op) \
|
||||
if(rank_n == 1) { \
|
||||
fn = (void const*)&verifyInline2<T, Uint, ReduceNil>; \
|
||||
@@ -1336,13 +1334,10 @@ hipError_t ncclVerifiableVerify(
|
||||
}
|
||||
#undef CASE_TY
|
||||
}
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#if SELF_TEST
|
||||
#include <iostream>
|
||||
|
||||
namespace {
|
||||
template<typename T, typename Op>
|
||||
__device__ void sweep2(int ty, char const *tyname, Op op, char const *opname, int rank_n) {
|
||||
//if(!std::is_same<T,half>::value) return;
|
||||
@@ -1397,16 +1392,16 @@ __global__ void __launch_bounds__(512, 1) sweep() {
|
||||
#if HAVE_ncclBfloat16
|
||||
sweep1<hip_bfloat16>(ncclBfloat16, "bfloat16");
|
||||
#endif
|
||||
#if HAVE_ncclfp8 && __HIP_DEVICE_COMPILE__
|
||||
#if HAVE_ncclfp8_DEVICE
|
||||
sweep1<rccl_float8>(ncclFloat8e4m3, "fp8_e4m3");
|
||||
sweep1<rccl_bfloat8>(ncclFloat8e5m2, "fp8_e5m2");
|
||||
#endif
|
||||
sweep1<float>(ncclFloat32, "float");
|
||||
sweep1<double>(ncclFloat64, "double");
|
||||
}
|
||||
}
|
||||
|
||||
void ncclVerifiableLaunchSelfTest() {
|
||||
sweep<<<1,512>>>();
|
||||
sweep<<<1,512>>>();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
@@ -63,4 +63,9 @@ hipError_t ncclVerifiableVerify(
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, cudaStream_t stream
|
||||
);
|
||||
|
||||
#ifdef NCCL_VERIFIABLE_SELF_TEST
|
||||
void ncclVerifiableLaunchSelfTest();
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2024 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (c) 2016-2025, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2025 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
|
||||
# We requires both of the following paths to be set upon including this makefile
|
||||
# We require both of the following paths to be set upon including this makefile
|
||||
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .so file>
|
||||
|
||||
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
TEST_VERIFIABLE_LIBS = $(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so
|
||||
|
||||
${HIPIFY_DIR}/verifiable.cu.cpp: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
@printf "Hipifying %-35s > %s\n" $< $@
|
||||
@@ -31,3 +32,9 @@ $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(HIPIFY_DIR)/verifiable.cu.cpp $(HIPI
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/libverifiable.so: $(TEST_VERIFIABLE_OBJS)
|
||||
@printf "Creating DSO %s\n" $@
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
$(CC) -shared -o $@.0 $^ -Wl,-soname,$(notdir $@).0
|
||||
ln -sf $(notdir $@).0 $@
|
||||
|
||||
Ссылка в новой задаче
Block a user