Merge branch 'develop'

[ROCm/rccl-tests commit: 56a5bb0486]
这个提交包含在:
akolliasAMD
2024-01-30 12:50:38 -05:00
当前提交 73f1f3cb3a
修改 25 个文件,包含 2115 行新增757 行删除
+33 -10
查看文件
@@ -2,7 +2,7 @@
# Copyright 2022 Advanced Micro Devices, Inc.
# ########################################################################
#Adding pthread flag for linking
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
if (MPI_MPICXX)
@@ -44,8 +44,6 @@ set(CMAKE_CXX_STANDARD 14)
# Get additional packages required
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
include(ROCMSetupVersion)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
@@ -55,12 +53,37 @@ include(ROCMClients)
# Build variables
option(NO_MPI "Build RCCL-tests without MPI support.")
option(MPI_PATH "Use MPI in the specified directory.")
## Get default GPU targets using rocm_check_target_ids
rocm_check_target_ids(
DEFAULT_AMDGPU_TARGETS
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
)
set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
# Default GPU architectures to build
#==================================================================================================
set(DEFAULT_GPUS
gfx803
gfx900:xnack-
gfx906:xnack-
gfx908:xnack-
gfx90a:xnack-
gfx90a:xnack+
gfx940
gfx941
gfx942
gfx1030
gfx1100
gfx1101
gfx1102)
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
## Determine which GPU architectures to build for
if (COMMAND rocm_check_target_ids)
message(STATUS "Checking for ROCm support for GPU targets:")
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
else()
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
message(STATUS "Compiling for ${GPU_TARGETS}")
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
if (NOT NO_MPI)
# CHECK for MPI Path first. User requested this directory explicitely
@@ -108,7 +131,7 @@ if (NOT NO_MPI)
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
endif()
# Check for Open MPI v2 SLES installation
if (NOT MPI_MPICXX)
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
+7 -4
查看文件
@@ -4,9 +4,12 @@
# See LICENCE.txt for license information
#
.PHONY : all clean
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
default : src.build
.PHONY: all clean
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
clean: ${TARGETS:%=%.clean}
%.build:
${MAKE} -C $* build
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
%.clean:
${MAKE} -C $* clean
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
+3 -3
查看文件
@@ -46,9 +46,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
@@ -66,7 +66,7 @@ All tests support the same set of arguments :
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
* Increments can be either fixed or a multiplication factor. Only one of those should be used
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
* RCCL operations arguments
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
+4 -10
查看文件
@@ -3,12 +3,11 @@
# ########################################################################
# Compile common object library
set_property(SOURCE common.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu)
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
target_link_libraries(rccl_common roc::rccl hip::device)
if(USE_MPI)
target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX)
else()
target_link_libraries(rccl_common roc::rccl)
target_link_libraries(rccl_common MPI::MPI_CXX)
endif()
function(add_relative_test test_name test_target)
@@ -38,11 +37,6 @@ function(add_rccl_test TEST)
PRIVATE
rccl_common
)
if (NOT WIN32)
foreach(amdgpu_target ${AMDGPU_TARGETS})
target_link_libraries(${TEST_TARGET} PRIVATE --amdgpu-target=${amdgpu_target})
endforeach()
endif()
set_target_properties(
${TEST_TARGET}
PROPERTIES
+15 -7
查看文件
@@ -1,6 +1,6 @@
#
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
@@ -20,11 +20,10 @@ LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
@@ -65,13 +64,22 @@ build: ${BIN_FILES}
clean:
rm -rf ${DST_DIR}
${DST_DIR}/%.o: %.cu common.h
TEST_VERIFIABLE_SRCDIR := ../verifiable
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
include ../verifiable/verifiable.mk
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
${DST_DIR}/timer.o: timer.cc timer.h
@printf "Compiling %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
@printf "Linking %-35s > %s\n" $< $@
@mkdir -p ${DST_DIR}
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
+12 -24
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,24 +8,15 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
#define ALIGN 4
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
*sendInplaceOffset = count/nranks;
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
*recvcount = base*nranks;
*sendInplaceOffset = base;
*recvInplaceOffset = 0;
*paramcount = *sendcount;
*paramcount = base;
}
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
+4 -19
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k = 0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
}
+7 -22
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = (count/nranks)*nranks;
@@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
int k=0;
for (int i=0; i<args->nGpus; i++) {
char* str = getenv("NCCL_TESTS_DEVICE");
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
size_t partcount = sendcount/nranks;
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
}
return testSuccess;
}
+41 -46
查看文件
@@ -10,18 +10,6 @@
#define USE_RCCL_GATHER_SCATTER
void print_header() {
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
}
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
if (count < nranks*nranks/2) {
*sendcount = 0;
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
char* str = getenv("NCCL_TESTS_DEVICE");
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
#if 0
int *dataHost = (int *)malloc(args->sendBytes);
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
printf("\n");
free(dataHost);
#endif
size_t rdisp = 0;
size_t data_count = sendcount*2/nranks;
size_t chunksize = data_count/nranks;
for (int j=0; j<nranks; j++) {
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
if ((j+rank)%nranks == 0)
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
size_t sdisp = 0;
for (int k=0; k<nranks; k++) {
scount = ((k+j)%nranks)*chunksize;
if ((k+j)%nranks == 0)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (k == rank)
break;
sdisp += scount;
}
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
rdisp += rcount;
size_t sdisp = 0;
for (int kk=0; kk<nranks; kk++) {
scount = ((kk+j)%nranks)*chunksize;
if ((kk+j)%nranks == 0)
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
if (kk == rank)
break;
sdisp += scount;
}
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
rdisp += rcount;
}
k++;
}
@@ -107,11 +93,16 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
NCCLCHECK(ncclCommCount(comm, &nranks));
int rank;
NCCLCHECK(ncclCommUserRank(comm, &rank));
#define MAX_ALLTOALLV_RANKS 256
static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
if (count == 0) return testSuccess;
if (nranks > MAX_ALLTOALLV_RANKS) {
printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS);
size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
printf("failed to allocate buffers for alltoallv\n");
return testNcclError;
}
@@ -121,10 +112,10 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
size_t scount = ((i+rank)%nranks)*chunksize;
if ((i+rank)%nranks == 0)
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount;
sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp;
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
disp += scount;
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type), sdispls[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type));
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
}
#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
@@ -132,23 +123,23 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
return testNcclError;
#else
#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, recvbuff, recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, type, comm, stream));
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
#else
NCCLCHECK(ncclGroupStart());
for (int r=0; r<nranks; r++) {
if (sendcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
if (sendcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclSend(
((char*)sendbuff) + sdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
sendcounts[r+rank*MAX_ALLTOALLV_RANKS],
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
sendcounts[r+rank*nranks],
type,
r,
comm,
stream));
}
if (recvcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
if (recvcounts[r+rank*nranks] != 0) {
NCCLCHECK(ncclRecv(
((char*)recvbuff) + rdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
recvcounts[r+rank*MAX_ALLTOALLV_RANKS],
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
recvcounts[r+rank*nranks],
type,
r,
comm,
@@ -157,8 +148,12 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
}
NCCLCHECK(ncclGroupEnd());
#endif
return testSuccess;
#endif
free(sendcounts);
free(recvcounts);
free(sdispls);
free(rdispls);
return testSuccess;
}
struct testColl alltoAllTest = {
+6 -21
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root));
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
@@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
文件差异内容过多而无法显示 加载差异
+37 -26
查看文件
@@ -1,13 +1,13 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef __COMMON_H__
#define __COMMON_H__
#include "rccl.h"
#include "rccl/rccl.h"
#include <stdio.h>
#include <cstdint>
#include <algorithm>
@@ -16,19 +16,38 @@
#endif
#include <pthread.h>
#include "nccl1_compat.h"
#include "timer.h"
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
// For nccl.h < 2.13 since we define a weak fallback
extern "C" char const* ncclGetLastError(ncclComm_t comm);
#define HIPCHECK(cmd) do { \
hipError_t e = cmd; \
if( e != hipSuccess ) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test HIP failure %s:%d '%s'\n", \
printf("%s: Test HIP failure %s:%d '%s'\n", \
hostname, \
__FILE__,__LINE__,hipGetErrorString(e)); \
__FILE__,__LINE__,hipGetErrorString(e)); \
return testCudaError; \
} \
} while(0)
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
char hostname[1024]; \
getHostName(hostname, 1024); \
printf("%s: Test NCCL failure %s:%d " \
"'%s / %s'\n", \
hostname,__FILE__,__LINE__, \
ncclGetErrorString(res), \
ncclGetLastError(NULL)); \
return testNcclError; \
} \
} while(0)
#else
#define NCCLCHECK(cmd) do { \
ncclResult_t res = cmd; \
if (res != ncclSuccess) { \
@@ -40,13 +59,15 @@
return testNcclError; \
} \
} while(0)
#endif
typedef enum {
testSuccess = 0,
testInternalError = 1,
testCudaError = 2,
testNcclError = 3,
testCuRandError = 4
testTimeout = 4,
testNumResults = 5
} testResult_t;
// Relay errors up and trace
@@ -96,14 +117,17 @@ struct threadArgs {
size_t stepbytes;
size_t stepfactor;
int totalProcs;
int nProcs;
int proc;
int nThreads;
int thread;
int nGpus;
int* gpus;
int localRank;
int localNumDevices;
int enable_multiranks;
int enable_out_of_place;
int nRanks;
void** sendbuffs;
size_t sendBytes;
@@ -116,14 +140,6 @@ struct threadArgs {
void** expected;
size_t expectedBytes;
volatile int* sync;
int sync_idx;
volatile int* barrier;
int barrier_idx;
volatile double* reduce;
int syncRank;
int syncNranks;
double* deltaHost;
int* errors;
double* bw;
int* bw_count;
@@ -141,19 +157,13 @@ struct testThread {
testResult_t ret;
};
#include <chrono>
// Provided by common.cu
extern void Barrier(struct threadArgs* args);
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
// Provided by each coll
extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
extern void print_header();
#include <unistd.h>
static void getHostName(char* hostname, int maxlen) {
@@ -233,7 +243,7 @@ static size_t wordSize(ncclDataType_t type) {
case ncclInt64:
case ncclUint64:
case ncclDouble:
//case ncclFloat64:
//case ncclFloat64:
return 8;
default: return 0;
}
@@ -290,6 +300,7 @@ static int ncclstringtomtype (char *str) {
return ncclCoarse;
}
extern int is_main_proc;
extern thread_local int is_main_thread;
#define PRINT if (is_main_thread) printf
+6 -21
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include "hip/hip_runtime.h"
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count/nranks;
*recvcount = (count/nranks)*nranks;
@@ -35,20 +23,17 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) {
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
}
}
k++;
@@ -125,7 +110,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
+14 -21
查看文件
@@ -1,5 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -9,18 +10,6 @@
#define ALIGN 4
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base;
@@ -37,18 +26,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
for (int j=0; j<nranks; j++) {
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
}
k++;
}
@@ -116,9 +102,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
run_typenames = test_typenames;
}
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
// Check if this is a power of 2
int nRanks = args->nProcs*args->nThreads*args->nGpus;
if (nRanks && !(nRanks & (nRanks - 1))) {
for (int i=0; i<type_count; i++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
}
} else {
printf("nRanks %d is not a power of 2, skipping\n", nRanks);
}
return testSuccess;
}
+4 -19
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root);
}
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,16 +23,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
k++;
+10 -22
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,24 +8,15 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
}
#define ALIGN 4
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
size_t base = (count/(ALIGN*nranks))*ALIGN;
*sendcount = base*nranks;
*recvcount = base;
*sendInplaceOffset = 0;
*recvInplaceOffset = count/nranks;
*paramcount = *recvcount;
*recvInplaceOffset = base;
*paramcount = base;
}
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
@@ -35,16 +26,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
k++;
+6 -21
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
}
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = (count/nranks)*nranks;
*recvcount = count/nranks;
@@ -34,17 +22,14 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root));
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
@@ -120,7 +105,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
for (int i=0; i<type_count; i++) {
for (int j=begin_root; j<=end_root; j++) {
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
}
}
return testSuccess;
+5 -20
查看文件
@@ -1,6 +1,6 @@
/*************************************************************************
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
@@ -8,18 +8,6 @@
#include <hip/hip_runtime.h>
#include "common.h"
void print_header() {
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
}
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
PRINT("%12li %12li %8s", size, count, typeName);
}
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
*sendcount = count;
*recvcount = count;
@@ -35,18 +23,15 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
int k=0;
for (int i=0; i<args->nGpus; i++) {
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
if (args->enable_multiranks)
gpuid = gpuid % args->localNumDevices;
HIPCHECK(hipSetDevice(gpuid));
HIPCHECK(hipSetDevice(args->gpus[i]));
for (int l=0; l<args->nRanks; l++) {
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
TESTCHECK(InitData(data, sendcount, type, rep, rank));
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
int peer = (rank-1+nranks)%nranks;
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer));
TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
k++;
}
HIPCHECK(hipDeviceSynchronize());
+28
查看文件
@@ -0,0 +1,28 @@
#include "timer.h"
// Make sure to compile this translation unit with the host compiler and not
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
#include <chrono>
namespace {
std::uint64_t now() {
using clock = std::chrono::steady_clock;
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
}
}
timer::timer() {
t0 = now();
}
double timer::elapsed() const {
std::uint64_t t1 = now();
return 1.e-9*(t1 - t0);
}
double timer::reset() {
std::uint64_t t1 = now();
double ans = 1.e-9*(t1 - t0);
t0 = t1;
return ans;
}
+15
查看文件
@@ -0,0 +1,15 @@
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
#define _408319ecdd5b47b28bf8f511c4fdf816
#include <cstdint>
// Can't include <chrono> because of bug with gcc 10.3.0
class timer {
std::uint64_t t0;
public:
timer();
double elapsed() const;
double reset();
};
#endif
+73
查看文件
@@ -0,0 +1,73 @@
#
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
#include ../../makefiles/common.mk
.PHONY: all clean
BUILDDIR := $(abspath ../../build)
DST_DIR := $(BUILDDIR)/test/verifiable
ROCM_PATH ?= /opt/rocm
MPI_HOME ?= /usr/lib/openmpi
PREFIX ?= /usr/local
VERBOSE ?= 0
DEBUG ?= 0
NCCL_HOME ?= ""
HIPCC = $(ROCM_PATH)/bin/hipcc
CXX = $(HIPCC)
HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
ifneq ($(NCCL_HOME), "")
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
endif
HIPCUFLAGS += -I$(ROCM_PATH)/include
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
ifeq ($(DEBUG), 0)
HIPCUFLAGS += -O3
else
HIPCUFLAGS += -O0 -g -ggdb3
endif
ifeq ($(VERBOSE), 0)
.SILENT:
endif
ifeq ($(MPI), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
else ifeq ($(MPICH), 1)
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
HIPLDFLAGS += -L/usr/lib -lmpich
endif
LIBRARIES += rccl
HIPLDFLAGS += $(LIBRARIES:%=-l%)
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
clean:
rm -rf $(DST_DIR)
TEST_VERIFIABLE_SRCDIR := .
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
include verifiable.mk
self_test: $(DST_DIR)/self_test
$(DST_DIR)/self_test: verifiable.cu verifiable.h
@printf "Linking %s\n" $@
@mkdir -p $(DST_DIR)
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
@@ -0,0 +1,195 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
/* Generate parameters for our error bound model of floating point average
* (sum of scaled values) by sampling sums of random sequences for each
* floating point type.
*
* The model has parameters "coef" and "power", where for two floats a & b,
* they are close enough if and only if:
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
*
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
*
* Compile with:
* nvcc -gencode=arch=compute_80,code=sm_80
*/
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdint>
#include <hip/hip_bfloat16.h>
#include <hip/hip_fp16.h>
using std::uint64_t;
using std::uint32_t;
using bfloat16 = hip_bfloat16;
template<typename T>
struct float_traits;
template<>
struct float_traits<float> {
static constexpr int mantissa_bits = 23;
static constexpr int exponent_bits = 8;
using uint_t = uint32_t;
__device__ static float make(double x) { return (float)x; }
__device__ static float make(uint64_t x) { return (float)x; }
__device__ static double todouble(float x) { return x; }
__device__ static float add(float a, float b) { return a+b; }
__device__ static float mul(float a, float b) { return a*b; }
};
template<>
struct float_traits<double> {
static constexpr int mantissa_bits = 52;
static constexpr int exponent_bits = 11;
using uint_t = uint64_t;
__device__ static double make(double x) { return x; }
__device__ static double make(uint64_t x) { return (double)x; }
__device__ static double todouble(double x) { return x; }
__device__ static double add(double a, double b) { return a+b; }
__device__ static double mul(double a, double b) { return a*b; }
};
template<>
struct float_traits<__half> {
static constexpr int mantissa_bits = 10;
static constexpr int exponent_bits = 5;
using uint_t = uint16_t;
__device__ static __half make(double x) { return __float2half((float)x); }
__device__ static __half make(uint64_t x) { return __int2half_rn(x); }
__device__ static double todouble(__half x) { return __half2float(x); }
__device__ static __half add(__half a, __half b) { return __hadd(a, b); }
__device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
};
template<>
struct float_traits<bfloat16> {
static constexpr int mantissa_bits = 7;
static constexpr int exponent_bits = 8;
using uint_t = uint16_t;
__device__ static bfloat16 make(double x) { return bfloat16(x); }
__device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
__device__ static double todouble(bfloat16 x) { return double(x); }
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
};
template<typename F>
__device__ int compare(F a, F b) {
union { typename float_traits<F>::uint_t ua; F fa; };
union { typename float_traits<F>::uint_t ub; F fb; };
ua=0; ub=0;
fa=a; fb=b;
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
return ua < ub ? ub-ua : ua-ub;
}
struct xoshiro256ss {
uint64_t s[4];
__device__ xoshiro256ss(int seed) {
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
for(int i=0; i < 4; i++)
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
}
__device__ uint64_t operator()() {
auto rol64 = [](uint64_t x, int k) {
return (x << k) | (x >> (64 - k));
};
uint64_t const result = rol64(s[1] * 5, 7) * 9;
uint64_t const t = s[1] << 17;
s[2] ^= s[0];
s[3] ^= s[1];
s[1] ^= s[2];
s[0] ^= s[3];
s[2] ^= t;
s[3] = rol64(s[3], 45);
return result;
}
};
static __device__ int __reduce_max_sync(unsigned int mask, int value)
{
//We ignore mask, since all bits are set when calling them in the
//test code below.
int width = warpSize;
for (unsigned int i = warpSize; i; i >>= 1) {
value = max(__shfl_down(value, i, width), value);
}
return value;
}
template<typename F>
__global__ void kernel() {
using traits = float_traits<F>;
constexpr int samps = 4<<10;
__shared__ F accf[samps];
__shared__ double accd[samps];
xoshiro256ss rng(threadIdx.x);
float expo_avg = 1;
for(int pass=0; pass < 2; pass++) {
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
int err_max = 0;
float coef = 0;
double expo_sum = 0;
int expo_n = 0;
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
//for(int round=0; round < 2; round++) {
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
accf[i] = (F)0;
accd[i] = 0;
}
__syncthreads();
for(int r=0; r < max_ranks; r++) {
int err = 0;
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
F f = traits::make(d);
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
accd[i] += traits::todouble(f);
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
err = err > e ? err : e;
}
err = __reduce_max_sync(-1u, err);
err_max = err_max > err ? err_max : err;
if (r >= 2) {
// err = 1 + coef*pow(r,expo)
float c = float(err-1)/powf(float(r), expo_avg);
coef = coef > c ? coef : c;
}
if (r >= 2) {
double expo = log2f(1+err_max)/log2f(r);
expo_sum += expo;
expo_n++;
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
}
}
}
if(pass==0)
expo_avg = expo_sum/expo_n;
else if(threadIdx.x == 0)
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
}
}
int main() {
std::printf("type=float:\n");
kernel<float><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=half:\n");
kernel<half><<<1,32>>>();
hipDeviceSynchronize();
std::printf("\ntype=bfloat16:\n");
kernel<bfloat16><<<1,32>>>();
hipDeviceSynchronize();
return 0;
}
文件差异内容过多而无法显示 加载差异
@@ -0,0 +1,66 @@
/*************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
#ifndef _d41d8cd98f00b204e9800998ecf8427e
#define _d41d8cd98f00b204e9800998ecf8427e
#include <hip/hip_runtime.h>
#include <stdint.h>
/* Routines for launching kernels that verify reduction results. A significant
* feature of these routines is they carefully craft floating point input
* to produce exactly predictable output.
*
* int elt_ty: actually just a ncclDataType_t
*
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
* created, these are encoded as the value ncclNumOps and their scalar is
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
*
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
*
* intptr_t elt_ix0: index of first element pointed to by elts when generating
* random values. This makes it possible to generate subsequences independently
* as well as in aggregate.
*
* int rank_n: Number of contributions into the reduction. Non-reduction
* collectives like broadcast, gather, etc will always set this to one.
*
* int rank_me: Index of this contribution
*/
// Use this as the local scalar for PreMulSum ops
template<typename T>
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
}
// Enqueue kernel to generate data which is to be reduced.
void ncclVerifiablePrepareInput(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to generate expected results of reduction.
void ncclVerifiablePrepareExpected(
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
);
// Enqueue kernel to verify reduced data matches expectation. The number of
// failed elements is written to bad_elt_n which must be in cudaHost memory.
// If `expected == nullptr` then the expected results are generated on-the-fly
// which can be costly. Thus if you plan to run the same reduction multiple
// times it is advantageous to precompute the expected values with
// ncclVerifiablePrepareExpected and pass them as `expected` here.
void ncclVerifiableVerify(
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
int64_t *bad_elt_n, hipStream_t stream
);
#endif
@@ -0,0 +1,18 @@
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
# We requires both of the following paths to be set upon including this makefile
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
@printf "Compiling %s\n" $@
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu