@@ -2,7 +2,7 @@
|
||||
# Copyright 2022 Advanced Micro Devices, Inc.
|
||||
# ########################################################################
|
||||
#Adding pthread flag for linking
|
||||
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
macro(check_mpi mpi_compiler mpi_lib_a mpi_lib_so mpi_bin_dir mpi_base_lib_dir mpi_inc_dir)
|
||||
find_program(MPI_MPICXX ${mpi_compiler} PATHS ${mpi_bin_dir} NO_DEFAULT_PATH)
|
||||
if (MPI_MPICXX)
|
||||
@@ -44,8 +44,6 @@ set(CMAKE_CXX_STANDARD 14)
|
||||
|
||||
# Get additional packages required
|
||||
find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS "${ROCM_PATH}")
|
||||
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
|
||||
|
||||
include(ROCMSetupVersion)
|
||||
include(ROCMCreatePackage)
|
||||
include(ROCMInstallTargets)
|
||||
@@ -55,12 +53,37 @@ include(ROCMClients)
|
||||
# Build variables
|
||||
option(NO_MPI "Build RCCL-tests without MPI support.")
|
||||
option(MPI_PATH "Use MPI in the specified directory.")
|
||||
## Get default GPU targets using rocm_check_target_ids
|
||||
rocm_check_target_ids(
|
||||
DEFAULT_AMDGPU_TARGETS
|
||||
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx1030"
|
||||
)
|
||||
set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for these tests to target.")
|
||||
|
||||
# Default GPU architectures to build
|
||||
#==================================================================================================
|
||||
set(DEFAULT_GPUS
|
||||
gfx803
|
||||
gfx900:xnack-
|
||||
gfx906:xnack-
|
||||
gfx908:xnack-
|
||||
gfx90a:xnack-
|
||||
gfx90a:xnack+
|
||||
gfx940
|
||||
gfx941
|
||||
gfx942
|
||||
gfx1030
|
||||
gfx1100
|
||||
gfx1101
|
||||
gfx1102)
|
||||
|
||||
set(AMDGPU_TARGETS ${DEFAULT_GPUS} CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined.")
|
||||
## Determine which GPU architectures to build for
|
||||
if (COMMAND rocm_check_target_ids)
|
||||
message(STATUS "Checking for ROCm support for GPU targets:")
|
||||
rocm_check_target_ids(SUPPORTED_GPUS TARGETS "${AMDGPU_TARGETS}")
|
||||
else()
|
||||
message(WARNING "Unable to check for supported GPU targets. Falling back to default GPUs")
|
||||
set(SUPPORTED_GPUS ${DEFAULT_GPUS})
|
||||
endif()
|
||||
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "List of specific GPU architectures to build for.")
|
||||
message(STATUS "Compiling for ${GPU_TARGETS}")
|
||||
|
||||
find_package(RCCL HINTS CONFIG REQUIRED PATHS "${ROCM_PATH}")
|
||||
|
||||
if (NOT NO_MPI)
|
||||
# CHECK for MPI Path first. User requested this directory explicitely
|
||||
@@ -108,7 +131,7 @@ if (NOT NO_MPI)
|
||||
if (NOT MPI_MPICXX)
|
||||
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi3/bin /usr/lib64/mpi/gcc/openmpi3 /usr/lib64/mpi/gcc/openmpi3/include)
|
||||
endif()
|
||||
|
||||
|
||||
# Check for Open MPI v2 SLES installation
|
||||
if (NOT MPI_MPICXX)
|
||||
check_mpi(mpicxx libmpi.a libmpi.so /usr/lib64/mpi/gcc/openmpi2/bin /usr/lib64/mpi/gcc/openmpi2 /usr/lib64/mpi/gcc/openmpi2/include)
|
||||
|
||||
@@ -4,9 +4,12 @@
|
||||
# See LICENCE.txt for license information
|
||||
#
|
||||
|
||||
.PHONY : all clean
|
||||
BUILDDIR ?= build
|
||||
override BUILDDIR := $(abspath $(BUILDDIR))
|
||||
|
||||
default : src.build
|
||||
.PHONY: all clean
|
||||
|
||||
default: src.build
|
||||
|
||||
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
|
||||
|
||||
@@ -14,7 +17,7 @@ all: ${TARGETS:%=%.build}
|
||||
clean: ${TARGETS:%=%.clean}
|
||||
|
||||
%.build:
|
||||
${MAKE} -C $* build
|
||||
${MAKE} -C $* build BUILDDIR=${BUILDDIR}
|
||||
|
||||
%.clean:
|
||||
${MAKE} -C $* clean
|
||||
${MAKE} -C $* clean BUILDDIR=${BUILDDIR}
|
||||
|
||||
@@ -46,9 +46,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
|
||||
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
|
||||
```
|
||||
|
||||
Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
|
||||
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
|
||||
```shell
|
||||
$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
|
||||
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
|
||||
```
|
||||
|
||||
### Performance
|
||||
@@ -66,7 +66,7 @@ All tests support the same set of arguments :
|
||||
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
|
||||
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
|
||||
* Increments can be either fixed or a multiplication factor. Only one of those should be used
|
||||
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
|
||||
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
|
||||
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
|
||||
* RCCL operations arguments
|
||||
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
|
||||
|
||||
@@ -3,12 +3,11 @@
|
||||
# ########################################################################
|
||||
|
||||
# Compile common object library
|
||||
set_property(SOURCE common.cu PROPERTY LANGUAGE CXX)
|
||||
add_library(rccl_common OBJECT common.cu)
|
||||
set_property(SOURCE common.cu timer.cc ../verifiable/verifiable.cu PROPERTY LANGUAGE CXX)
|
||||
add_library(rccl_common OBJECT common.cu timer.cc ../verifiable/verifiable.cu)
|
||||
target_link_libraries(rccl_common roc::rccl hip::device)
|
||||
if(USE_MPI)
|
||||
target_link_libraries(rccl_common roc::rccl MPI::MPI_CXX)
|
||||
else()
|
||||
target_link_libraries(rccl_common roc::rccl)
|
||||
target_link_libraries(rccl_common MPI::MPI_CXX)
|
||||
endif()
|
||||
|
||||
function(add_relative_test test_name test_target)
|
||||
@@ -38,11 +37,6 @@ function(add_rccl_test TEST)
|
||||
PRIVATE
|
||||
rccl_common
|
||||
)
|
||||
if (NOT WIN32)
|
||||
foreach(amdgpu_target ${AMDGPU_TARGETS})
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE --amdgpu-target=${amdgpu_target})
|
||||
endforeach()
|
||||
endif()
|
||||
set_target_properties(
|
||||
${TEST_TARGET}
|
||||
PROPERTIES
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
#
|
||||
# Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
@@ -20,11 +20,10 @@ LDFLAGS :=
|
||||
HIPLDFLAGS :=
|
||||
|
||||
ifneq ($(NCCL_HOME), "")
|
||||
HIPCUFLAGS += -I$(NCCL_HOME) -I$(NCCL_HOME)/rccl/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
|
||||
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME) -L$(NCCL_HOME)/lib
|
||||
endif
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/rccl
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt -pthread
|
||||
@@ -65,13 +64,22 @@ build: ${BIN_FILES}
|
||||
clean:
|
||||
rm -rf ${DST_DIR}
|
||||
|
||||
${DST_DIR}/%.o: %.cu common.h
|
||||
TEST_VERIFIABLE_SRCDIR := ../verifiable
|
||||
TEST_VERIFIABLE_BUILDDIR := $(BUILDDIR)/verifiable
|
||||
include ../verifiable/verifiable.mk
|
||||
|
||||
${DST_DIR}/%.o: %.cu common.h $(TEST_VERIFIABLE_HDRS)
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $<
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o
|
||||
${DST_DIR}/timer.o: timer.cc timer.h
|
||||
@printf "Compiling %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
$(CXX) $(CXXFLAGS) -o $@ -c timer.cc
|
||||
|
||||
${DST_DIR}/%_perf:${DST_DIR}/%.o ${DST_DIR}/common.o ${DST_DIR}/timer.o $(TEST_VERIFIABLE_OBJS)
|
||||
@printf "Linking %-35s > %s\n" $< $@
|
||||
@mkdir -p ${DST_DIR}
|
||||
echo "$(HIPCC) -o $@ $(HIPCUFLAGS) $^ ${HIPLDFLAGS}"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,24 +8,15 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
#define ALIGN 4
|
||||
|
||||
void AllGatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count/nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
*sendInplaceOffset = count/nranks;
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base;
|
||||
*recvcount = base*nranks;
|
||||
*sendInplaceOffset = base;
|
||||
*recvInplaceOffset = 0;
|
||||
*paramcount = *sendcount;
|
||||
*paramcount = base;
|
||||
}
|
||||
|
||||
testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -35,18 +26,15 @@ testResult_t AllGatherInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -98,7 +86,7 @@ testResult_t AllGatherRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AllReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,16 +23,13 @@ testResult_t AllReduceInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k = 0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
|
||||
k++;
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AlltoAllGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
@@ -35,19 +23,16 @@ testResult_t AlltoAllInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
char* str = getenv("NCCL_TESTS_DEVICE");
|
||||
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes/nranks*j, sendcount/nranks, type, rep+rank*sendcount/nranks, j));
|
||||
size_t partcount = sendcount/nranks;
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+ j*partcount*wordSize(type), partcount, rank*partcount, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -101,7 +86,7 @@ testResult_t AlltoAllRunTest(struct threadArgs* args, int root, ncclDataType_t t
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", -1));
|
||||
}
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
@@ -10,18 +10,6 @@
|
||||
|
||||
#define USE_RCCL_GATHER_SCATTER
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %6s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %6s %6s", size, count, typeName, opName);
|
||||
}
|
||||
|
||||
void AlltoAllvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
if (count < nranks*nranks/2) {
|
||||
*sendcount = 0;
|
||||
@@ -45,17 +33,14 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
char* str = getenv("NCCL_TESTS_DEVICE");
|
||||
int gpuid = str ? atoi(str) : args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep+rank, 1, 0));
|
||||
|
||||
#if 0
|
||||
int *dataHost = (int *)malloc(args->sendBytes);
|
||||
hipMemcpy(dataHost, data, args->sendBytes, hipMemcpyDeviceToHost);
|
||||
@@ -66,24 +51,25 @@ testResult_t AlltoAllvInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
printf("\n");
|
||||
free(dataHost);
|
||||
#endif
|
||||
|
||||
size_t rdisp = 0;
|
||||
size_t data_count = sendcount*2/nranks;
|
||||
size_t chunksize = data_count/nranks;
|
||||
for (int j=0; j<nranks; j++) {
|
||||
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
|
||||
if ((j+rank)%nranks == 0)
|
||||
size_t scount = 0, rcount = ((j+rank)%nranks)*chunksize;
|
||||
if ((j+rank)%nranks == 0)
|
||||
rcount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
size_t sdisp = 0;
|
||||
for (int k=0; k<nranks; k++) {
|
||||
scount = ((k+j)%nranks)*chunksize;
|
||||
if ((k+j)%nranks == 0)
|
||||
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
if (k == rank)
|
||||
break;
|
||||
sdisp += scount;
|
||||
}
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, type, rep+sdisp, j));
|
||||
rdisp += rcount;
|
||||
size_t sdisp = 0;
|
||||
for (int kk=0; kk<nranks; kk++) {
|
||||
scount = ((kk+j)%nranks)*chunksize;
|
||||
if ((kk+j)%nranks == 0)
|
||||
scount += (sendcount-chunksize*(nranks-1)*nranks/2);
|
||||
if (kk == rank)
|
||||
break;
|
||||
sdisp += scount;
|
||||
}
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+rdisp*wordSize(type), rcount, sdisp, type, ncclSum, 33*rep+j, 1, 0));
|
||||
rdisp += rcount;
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -107,11 +93,16 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
|
||||
NCCLCHECK(ncclCommCount(comm, &nranks));
|
||||
int rank;
|
||||
NCCLCHECK(ncclCommUserRank(comm, &rank));
|
||||
#define MAX_ALLTOALLV_RANKS 256
|
||||
static size_t sendcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], recvcounts[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], sdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS], rdispls[MAX_ALLTOALLV_RANKS*MAX_ALLTOALLV_RANKS];
|
||||
|
||||
if (count == 0) return testSuccess;
|
||||
if (nranks > MAX_ALLTOALLV_RANKS) {
|
||||
printf("Number of ranks %d exceeds limit %d\n", nranks, MAX_ALLTOALLV_RANKS);
|
||||
|
||||
size_t *sendcounts, *recvcounts, *sdispls, *rdispls;
|
||||
sendcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
|
||||
recvcounts = (size_t *)malloc(nranks*nranks*sizeof(size_t));
|
||||
sdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
|
||||
rdispls = (size_t *)malloc(nranks*nranks*sizeof(size_t));
|
||||
if (sendcounts == nullptr || recvcounts == nullptr || sdispls == nullptr || rdispls == nullptr) {
|
||||
printf("failed to allocate buffers for alltoallv\n");
|
||||
return testNcclError;
|
||||
}
|
||||
|
||||
@@ -121,10 +112,10 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
|
||||
size_t scount = ((i+rank)%nranks)*chunksize;
|
||||
if ((i+rank)%nranks == 0)
|
||||
scount += (count*nranks-chunksize*(nranks-1)*nranks/2);
|
||||
sendcounts[i+rank*MAX_ALLTOALLV_RANKS] = recvcounts[i+rank*MAX_ALLTOALLV_RANKS] = scount;
|
||||
sdispls[i+rank*MAX_ALLTOALLV_RANKS] = rdispls[i+rank*MAX_ALLTOALLV_RANKS] = disp;
|
||||
sendcounts[i+rank*nranks] = recvcounts[i+rank*nranks] = scount;
|
||||
sdispls[i+rank*nranks] = rdispls[i+rank*nranks] = disp;
|
||||
disp += scount;
|
||||
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type), sdispls[i+rank*MAX_ALLTOALLV_RANKS]*wordSize(type));
|
||||
//printf("%d->%d: sendcounts/recvcounts %lx sdispls/rdispls %lx\n", rank, i, sendcounts[i+rank*nranks]*wordSize(type), sdispls[i+rank*nranks]*wordSize(type));
|
||||
}
|
||||
|
||||
#if NCCL_MAJOR < 2 || NCCL_MINOR < 7
|
||||
@@ -132,23 +123,23 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
|
||||
return testNcclError;
|
||||
#else
|
||||
#if defined(RCCL_ALLTOALLV) && defined(USE_RCCL_GATHER_SCATTER)
|
||||
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*MAX_ALLTOALLV_RANKS, sdispls+rank*MAX_ALLTOALLV_RANKS, recvbuff, recvcounts+rank*MAX_ALLTOALLV_RANKS, rdispls+rank*MAX_ALLTOALLV_RANKS, type, comm, stream));
|
||||
NCCLCHECK(ncclAllToAllv(sendbuff, sendcounts+rank*nranks, sdispls+rank*nranks, recvbuff, recvcounts+rank*nranks, rdispls+rank*nranks, type, comm, stream));
|
||||
#else
|
||||
NCCLCHECK(ncclGroupStart());
|
||||
for (int r=0; r<nranks; r++) {
|
||||
if (sendcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
|
||||
if (sendcounts[r+rank*nranks] != 0) {
|
||||
NCCLCHECK(ncclSend(
|
||||
((char*)sendbuff) + sdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
|
||||
sendcounts[r+rank*MAX_ALLTOALLV_RANKS],
|
||||
((char*)sendbuff) + sdispls[r+rank*nranks] * wordSize(type),
|
||||
sendcounts[r+rank*nranks],
|
||||
type,
|
||||
r,
|
||||
comm,
|
||||
stream));
|
||||
}
|
||||
if (recvcounts[r+rank*MAX_ALLTOALLV_RANKS] != 0) {
|
||||
if (recvcounts[r+rank*nranks] != 0) {
|
||||
NCCLCHECK(ncclRecv(
|
||||
((char*)recvbuff) + rdispls[r+rank*MAX_ALLTOALLV_RANKS] * wordSize(type),
|
||||
recvcounts[r+rank*MAX_ALLTOALLV_RANKS],
|
||||
((char*)recvbuff) + rdispls[r+rank*nranks] * wordSize(type),
|
||||
recvcounts[r+rank*nranks],
|
||||
type,
|
||||
r,
|
||||
comm,
|
||||
@@ -157,8 +148,12 @@ testResult_t AlltoAllvRunColl(void* sendbuff, void* recvbuff, size_t count, nccl
|
||||
}
|
||||
NCCLCHECK(ncclGroupEnd());
|
||||
#endif
|
||||
return testSuccess;
|
||||
#endif
|
||||
free(sendcounts);
|
||||
free(recvcounts);
|
||||
free(sdispls);
|
||||
free(rdispls);
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
struct testColl alltoAllTest = {
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2015-2016, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void BroadcastGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -34,17 +22,14 @@ testResult_t BroadcastInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, root));
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, 0, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
@@ -114,7 +99,7 @@ testResult_t BroadcastRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
+311
-441
文件差异内容过多而无法显示
加载差异
+37
-26
@@ -1,13 +1,13 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
#ifndef __COMMON_H__
|
||||
#define __COMMON_H__
|
||||
|
||||
#include "rccl.h"
|
||||
#include "rccl/rccl.h"
|
||||
#include <stdio.h>
|
||||
#include <cstdint>
|
||||
#include <algorithm>
|
||||
@@ -16,19 +16,38 @@
|
||||
#endif
|
||||
#include <pthread.h>
|
||||
#include "nccl1_compat.h"
|
||||
#include "timer.h"
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
// For nccl.h < 2.13 since we define a weak fallback
|
||||
extern "C" char const* ncclGetLastError(ncclComm_t comm);
|
||||
|
||||
#define HIPCHECK(cmd) do { \
|
||||
hipError_t e = cmd; \
|
||||
if( e != hipSuccess ) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
||||
printf("%s: Test HIP failure %s:%d '%s'\n", \
|
||||
hostname, \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
__FILE__,__LINE__,hipGetErrorString(e)); \
|
||||
return testCudaError; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#if NCCL_VERSION_CODE >= NCCL_VERSION(2,13,0)
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t res = cmd; \
|
||||
if (res != ncclSuccess) { \
|
||||
char hostname[1024]; \
|
||||
getHostName(hostname, 1024); \
|
||||
printf("%s: Test NCCL failure %s:%d " \
|
||||
"'%s / %s'\n", \
|
||||
hostname,__FILE__,__LINE__, \
|
||||
ncclGetErrorString(res), \
|
||||
ncclGetLastError(NULL)); \
|
||||
return testNcclError; \
|
||||
} \
|
||||
} while(0)
|
||||
#else
|
||||
#define NCCLCHECK(cmd) do { \
|
||||
ncclResult_t res = cmd; \
|
||||
if (res != ncclSuccess) { \
|
||||
@@ -40,13 +59,15 @@
|
||||
return testNcclError; \
|
||||
} \
|
||||
} while(0)
|
||||
#endif
|
||||
|
||||
typedef enum {
|
||||
testSuccess = 0,
|
||||
testInternalError = 1,
|
||||
testCudaError = 2,
|
||||
testNcclError = 3,
|
||||
testCuRandError = 4
|
||||
testTimeout = 4,
|
||||
testNumResults = 5
|
||||
} testResult_t;
|
||||
|
||||
// Relay errors up and trace
|
||||
@@ -96,14 +117,17 @@ struct threadArgs {
|
||||
size_t stepbytes;
|
||||
size_t stepfactor;
|
||||
|
||||
int totalProcs;
|
||||
int nProcs;
|
||||
int proc;
|
||||
int nThreads;
|
||||
int thread;
|
||||
int nGpus;
|
||||
int* gpus;
|
||||
int localRank;
|
||||
int localNumDevices;
|
||||
int enable_multiranks;
|
||||
int enable_out_of_place;
|
||||
int nRanks;
|
||||
void** sendbuffs;
|
||||
size_t sendBytes;
|
||||
@@ -116,14 +140,6 @@ struct threadArgs {
|
||||
|
||||
void** expected;
|
||||
size_t expectedBytes;
|
||||
volatile int* sync;
|
||||
int sync_idx;
|
||||
volatile int* barrier;
|
||||
int barrier_idx;
|
||||
volatile double* reduce;
|
||||
int syncRank;
|
||||
int syncNranks;
|
||||
double* deltaHost;
|
||||
int* errors;
|
||||
double* bw;
|
||||
int* bw_count;
|
||||
@@ -141,19 +157,13 @@ struct testThread {
|
||||
testResult_t ret;
|
||||
};
|
||||
|
||||
#include <chrono>
|
||||
|
||||
// Provided by common.cu
|
||||
extern void Barrier(struct threadArgs* args);
|
||||
extern testResult_t TimeTest(struct threadArgs* args, ncclDataType_t type, const char* typeName, ncclRedOp_t op, const char* opName, int root);
|
||||
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const int rep, const int nranks);
|
||||
extern testResult_t InitData(void* data, const size_t count, ncclDataType_t type, const int rep, const int rank);
|
||||
extern testResult_t InitDataReduce(void* data, const size_t count, const size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks);
|
||||
extern testResult_t InitData(void* data, const size_t count, size_t offset, ncclDataType_t type, ncclRedOp_t op, const uint64_t seed, const int nranks, const int rank);
|
||||
extern void AllocateBuffs(void **sendbuff, void **recvbuff, void **expected, void **expectedHost, size_t nbytes, int nranks);
|
||||
|
||||
// Provided by each coll
|
||||
extern void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root);
|
||||
extern void print_header();
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
static void getHostName(char* hostname, int maxlen) {
|
||||
@@ -233,7 +243,7 @@ static size_t wordSize(ncclDataType_t type) {
|
||||
case ncclInt64:
|
||||
case ncclUint64:
|
||||
case ncclDouble:
|
||||
//case ncclFloat64:
|
||||
//case ncclFloat64:
|
||||
return 8;
|
||||
default: return 0;
|
||||
}
|
||||
@@ -290,6 +300,7 @@ static int ncclstringtomtype (char *str) {
|
||||
return ncclCoarse;
|
||||
}
|
||||
|
||||
extern int is_main_proc;
|
||||
extern thread_local int is_main_thread;
|
||||
#define PRINT if (is_main_thread) printf
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void GatherGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count/nranks;
|
||||
*recvcount = (count/nranks)*nranks;
|
||||
@@ -35,20 +23,17 @@ testResult_t GatherInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
if (rank == root) {
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k]), nranks*sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
}
|
||||
}
|
||||
k++;
|
||||
@@ -125,7 +110,7 @@ testResult_t GatherRunTest(struct threadArgs* args, int root, ncclDataType_t typ
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -9,18 +10,6 @@
|
||||
|
||||
#define ALIGN 4
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void HyperCubeGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base;
|
||||
@@ -37,18 +26,15 @@ testResult_t HyperCubeInitData(struct threadArgs* args, ncclDataType_t type, ncc
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? ((char*)args->recvbuffs[k])+rank*args->sendBytes : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, 33*rep + rank, 1, 0));
|
||||
for (int j=0; j<nranks; j++) {
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, type, rep, j));
|
||||
TESTCHECK(InitData(((char*)args->expected[k])+args->sendBytes*j, sendcount, 0, type, ncclSum, 33*rep + j, 1, 0));
|
||||
}
|
||||
k++;
|
||||
}
|
||||
@@ -116,9 +102,16 @@ testResult_t HyperCubeRunTest(struct threadArgs* args, int root, ncclDataType_t
|
||||
run_typenames = test_typenames;
|
||||
}
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
// Check if this is a power of 2
|
||||
int nRanks = args->nProcs*args->nThreads*args->nGpus;
|
||||
if (nRanks && !(nRanks & (nRanks - 1))) {
|
||||
for (int i=0; i<type_count; i++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", -1));
|
||||
}
|
||||
} else {
|
||||
printf("nRanks %d is not a power of 2, skipping\n", nRanks);
|
||||
}
|
||||
|
||||
return testSuccess;
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s %6i", size, count, typeName, opName, root);
|
||||
}
|
||||
|
||||
void ReduceGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,16 +23,13 @@ testResult_t ReduceInitData(struct threadArgs* args, ncclDataType_t type, ncclRe
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
if (rank == root) TESTCHECK(InitDataReduce(args->expected[k], recvcount, 0, type, op, rep, nranks));
|
||||
k++;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,24 +8,15 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "redop",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6s", size, count, typeName, opName);
|
||||
}
|
||||
#define ALIGN 4
|
||||
|
||||
void ReduceScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
size_t base = (count/(ALIGN*nranks))*ALIGN;
|
||||
*sendcount = base*nranks;
|
||||
*recvcount = base;
|
||||
*sendInplaceOffset = 0;
|
||||
*recvInplaceOffset = count/nranks;
|
||||
*paramcount = *recvcount;
|
||||
*recvInplaceOffset = base;
|
||||
*paramcount = base;
|
||||
}
|
||||
|
||||
testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t op, int root, int rep, int in_place) {
|
||||
@@ -35,16 +26,13 @@ testResult_t ReduceScatterInitData(struct threadArgs* args, ncclDataType_t type,
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, 0, type, op, rep, nranks, rank));
|
||||
HIPCHECK(hipMemcpy(args->expected[k], args->recvbuffs[k], args->expectedBytes, hipMemcpyDefault));
|
||||
TESTCHECK(InitDataReduce(args->expected[k], recvcount, rank*recvcount, type, op, rep, nranks));
|
||||
k++;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s %6s out-of-place in-place \n", "", "", "", "");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type", "root",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %6s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s %6i", size, count, typeName, root);
|
||||
}
|
||||
|
||||
void ScatterGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = (count/nranks)*nranks;
|
||||
*recvcount = count/nranks;
|
||||
@@ -34,17 +22,14 @@ testResult_t ScatterInitData(struct threadArgs* args, ncclDataType_t type, ncclR
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep+rank*recvcount, root));
|
||||
if (rank == root) TESTCHECK(InitData(data, sendcount, 0, type, ncclSum, rep, 1, 0));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, rank*recvcount, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
|
||||
}
|
||||
@@ -120,7 +105,7 @@ testResult_t ScatterRunTest(struct threadArgs* args, int root, ncclDataType_t ty
|
||||
|
||||
for (int i=0; i<type_count; i++) {
|
||||
for (int j=begin_root; j<=end_root; j++) {
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "", j));
|
||||
TESTCHECK(TimeTest(args, run_types[i], run_typenames[i], (ncclRedOp_t)0, "none", j));
|
||||
}
|
||||
}
|
||||
return testSuccess;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2021, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2021 Advanced Micro Devices, Inc. All rights reserved.
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
@@ -8,18 +8,6 @@
|
||||
#include <hip/hip_runtime.h>
|
||||
#include "common.h"
|
||||
|
||||
void print_header() {
|
||||
PRINT("# %10s %12s %8s out-of-place in-place \n", "", "", "");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "size", "count", "type",
|
||||
"time", "algbw", "busbw", "error", "time", "algbw", "busbw", "error");
|
||||
PRINT("# %10s %12s %8s %7s %6s %6s %5s %7s %6s %6s %5s\n", "(B)", "(elements)", "",
|
||||
"(us)", "(GB/s)", "(GB/s)", "", "(us)", "(GB/s)", "(GB/s)", "");
|
||||
}
|
||||
|
||||
void print_line_header (size_t size, size_t count, const char *typeName, const char *opName, int root) {
|
||||
PRINT("%12li %12li %8s", size, count, typeName);
|
||||
}
|
||||
|
||||
void SendRecvGetCollByteCount(size_t *sendcount, size_t *recvcount, size_t *paramcount, size_t *sendInplaceOffset, size_t *recvInplaceOffset, size_t count, int nranks) {
|
||||
*sendcount = count;
|
||||
*recvcount = count;
|
||||
@@ -35,18 +23,15 @@ testResult_t SendRecvInitData(struct threadArgs* args, ncclDataType_t type, nccl
|
||||
|
||||
int k=0;
|
||||
for (int i=0; i<args->nGpus; i++) {
|
||||
int gpuid = args->localRank*args->nThreads*args->nGpus + args->thread*args->nGpus + i;
|
||||
if (args->enable_multiranks)
|
||||
gpuid = gpuid % args->localNumDevices;
|
||||
HIPCHECK(hipSetDevice(gpuid));
|
||||
HIPCHECK(hipSetDevice(args->gpus[i]));
|
||||
|
||||
for (int l=0; l<args->nRanks; l++) {
|
||||
int rank = ((args->proc*args->nThreads + args->thread)*args->nGpus*args->nRanks + i*args->nRanks + l);
|
||||
HIPCHECK(hipMemset(args->recvbuffs[k], 0, args->expectedBytes));
|
||||
void* data = in_place ? args->recvbuffs[k] : args->sendbuffs[k];
|
||||
TESTCHECK(InitData(data, sendcount, type, rep, rank));
|
||||
TESTCHECK(InitData(data, sendcount, rank*sendcount, type, ncclSum, rep, 1, 0));
|
||||
int peer = (rank-1+nranks)%nranks;
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, type, rep, peer));
|
||||
TESTCHECK(InitData(args->expected[k], recvcount, peer*recvcount, type, ncclSum, rep, 1, 0));
|
||||
k++;
|
||||
}
|
||||
HIPCHECK(hipDeviceSynchronize());
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
#include "timer.h"
|
||||
|
||||
// Make sure to compile this translation unit with the host compiler and not
|
||||
// nvcc, lest you hit an internal compiler error (ICE) with GCC 10.3.0
|
||||
#include <chrono>
|
||||
|
||||
namespace {
|
||||
std::uint64_t now() {
|
||||
using clock = std::chrono::steady_clock;
|
||||
return std::chrono::duration_cast<std::chrono::nanoseconds>(clock::now().time_since_epoch()).count();
|
||||
}
|
||||
}
|
||||
|
||||
timer::timer() {
|
||||
t0 = now();
|
||||
}
|
||||
|
||||
double timer::elapsed() const {
|
||||
std::uint64_t t1 = now();
|
||||
return 1.e-9*(t1 - t0);
|
||||
}
|
||||
|
||||
double timer::reset() {
|
||||
std::uint64_t t1 = now();
|
||||
double ans = 1.e-9*(t1 - t0);
|
||||
t0 = t1;
|
||||
return ans;
|
||||
}
|
||||
@@ -0,0 +1,15 @@
|
||||
#ifndef _408319ecdd5b47b28bf8f511c4fdf816
|
||||
#define _408319ecdd5b47b28bf8f511c4fdf816
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Can't include <chrono> because of bug with gcc 10.3.0
|
||||
class timer {
|
||||
std::uint64_t t0;
|
||||
public:
|
||||
timer();
|
||||
double elapsed() const;
|
||||
double reset();
|
||||
};
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,73 @@
|
||||
#
|
||||
# Copyright (c) 2015-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications are Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
#
|
||||
|
||||
#include ../../makefiles/common.mk
|
||||
|
||||
.PHONY: all clean
|
||||
|
||||
BUILDDIR := $(abspath ../../build)
|
||||
DST_DIR := $(BUILDDIR)/test/verifiable
|
||||
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
MPI_HOME ?= /usr/lib/openmpi
|
||||
PREFIX ?= /usr/local
|
||||
VERBOSE ?= 0
|
||||
DEBUG ?= 0
|
||||
NCCL_HOME ?= ""
|
||||
|
||||
HIPCC = $(ROCM_PATH)/bin/hipcc
|
||||
CXX = $(HIPCC)
|
||||
|
||||
HIPCUFLAGS := -std=c++14
|
||||
LDFLAGS :=
|
||||
HIPLDFLAGS :=
|
||||
|
||||
ifneq ($(NCCL_HOME), "")
|
||||
HIPCUFLAGS += -I$(NCCL_HOME)/ -I$(NCCL_HOME)/include
|
||||
HIPLDFLAGS += -Wl,-rpath,$(NCCL_HOME) -L$(NCCL_HOME)
|
||||
endif
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include
|
||||
HIPCUFLAGS += -I$(ROCM_PATH)/include/hip
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
HIPLDFLAGS += $(CUSTOM_RCCL_LIB) -L$(ROCM_PATH)/lib -lhsa-runtime64 -lrt
|
||||
|
||||
ifeq ($(DEBUG), 0)
|
||||
HIPCUFLAGS += -O3
|
||||
else
|
||||
HIPCUFLAGS += -O0 -g -ggdb3
|
||||
endif
|
||||
|
||||
ifeq ($(VERBOSE), 0)
|
||||
.SILENT:
|
||||
endif
|
||||
|
||||
ifeq ($(MPI), 1)
|
||||
HIPCUFLAGS += -DMPI_SUPPORT -I${MPI_HOME}/include -I${MPI_HOME}/include/mpi
|
||||
HIPLDFLAGS += -L${MPI_HOME}/lib -lmpi
|
||||
else ifeq ($(MPICH), 1)
|
||||
HIPCUFLAGS += -DMPI_SUPPORT -I/usr/include/mpich -I/usr/include/x86_64-linux-gnu/mpich
|
||||
HIPLDFLAGS += -L/usr/lib -lmpich
|
||||
endif
|
||||
|
||||
LIBRARIES += rccl
|
||||
HIPLDFLAGS += $(LIBRARIES:%=-l%)
|
||||
|
||||
all: $(DST_DIR)/verifiable.o $(DST_DIR)/self_test
|
||||
|
||||
clean:
|
||||
rm -rf $(DST_DIR)
|
||||
|
||||
TEST_VERIFIABLE_SRCDIR := .
|
||||
TEST_VERIFIABLE_BUILDDIR := $(DST_DIR)
|
||||
include verifiable.mk
|
||||
|
||||
self_test: $(DST_DIR)/self_test
|
||||
|
||||
$(DST_DIR)/self_test: verifiable.cu verifiable.h
|
||||
@printf "Linking %s\n" $@
|
||||
@mkdir -p $(DST_DIR)
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -DSELF_TEST=1 verifiable.cu $(HIPLDFLAGS)
|
||||
@@ -0,0 +1,195 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
/* Generate parameters for our error bound model of floating point average
|
||||
* (sum of scaled values) by sampling sums of random sequences for each
|
||||
* floating point type.
|
||||
*
|
||||
* The model has parameters "coef" and "power", where for two floats a & b,
|
||||
* they are close enough if and only if:
|
||||
* abs(intBits(a) - intBits(b)) <= 1 + coef*pow(rank_n, power);
|
||||
*
|
||||
* Where intBits(x) is the reinterpretation of the float bitpattern as an integer.
|
||||
*
|
||||
* Compile with:
|
||||
* nvcc -gencode=arch=compute_80,code=sm_80
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <hip/hip_bfloat16.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
|
||||
using std::uint64_t;
|
||||
using std::uint32_t;
|
||||
using bfloat16 = hip_bfloat16;
|
||||
|
||||
template<typename T>
|
||||
struct float_traits;
|
||||
|
||||
template<>
|
||||
struct float_traits<float> {
|
||||
static constexpr int mantissa_bits = 23;
|
||||
static constexpr int exponent_bits = 8;
|
||||
using uint_t = uint32_t;
|
||||
__device__ static float make(double x) { return (float)x; }
|
||||
__device__ static float make(uint64_t x) { return (float)x; }
|
||||
__device__ static double todouble(float x) { return x; }
|
||||
__device__ static float add(float a, float b) { return a+b; }
|
||||
__device__ static float mul(float a, float b) { return a*b; }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<double> {
|
||||
static constexpr int mantissa_bits = 52;
|
||||
static constexpr int exponent_bits = 11;
|
||||
using uint_t = uint64_t;
|
||||
__device__ static double make(double x) { return x; }
|
||||
__device__ static double make(uint64_t x) { return (double)x; }
|
||||
__device__ static double todouble(double x) { return x; }
|
||||
__device__ static double add(double a, double b) { return a+b; }
|
||||
__device__ static double mul(double a, double b) { return a*b; }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<__half> {
|
||||
static constexpr int mantissa_bits = 10;
|
||||
static constexpr int exponent_bits = 5;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static __half make(double x) { return __float2half((float)x); }
|
||||
__device__ static __half make(uint64_t x) { return __int2half_rn(x); }
|
||||
__device__ static double todouble(__half x) { return __half2float(x); }
|
||||
__device__ static __half add(__half a, __half b) { return __hadd(a, b); }
|
||||
__device__ static __half mul(__half a, __half b) { return __hmul(a, b); }
|
||||
};
|
||||
template<>
|
||||
struct float_traits<bfloat16> {
|
||||
static constexpr int mantissa_bits = 7;
|
||||
static constexpr int exponent_bits = 8;
|
||||
using uint_t = uint16_t;
|
||||
__device__ static bfloat16 make(double x) { return bfloat16(x); }
|
||||
__device__ static bfloat16 make(uint64_t x) { return bfloat16(x); }
|
||||
__device__ static double todouble(bfloat16 x) { return double(x); }
|
||||
__device__ static bfloat16 add(bfloat16 a, bfloat16 b) { return bfloat16(__hadd((float)a, (float)b)); }
|
||||
__device__ static bfloat16 mul(bfloat16 a, bfloat16 b) { return bfloat16(__hmul((float)a, (float)b)); }
|
||||
};
|
||||
|
||||
template<typename F>
|
||||
__device__ int compare(F a, F b) {
|
||||
union { typename float_traits<F>::uint_t ua; F fa; };
|
||||
union { typename float_traits<F>::uint_t ub; F fb; };
|
||||
ua=0; ub=0;
|
||||
fa=a; fb=b;
|
||||
//std::printf("bits(%1.10f)=%x bits(%1.10f)=%x\n", fa, ua, fb, ub);
|
||||
return ua < ub ? ub-ua : ua-ub;
|
||||
}
|
||||
|
||||
struct xoshiro256ss {
|
||||
uint64_t s[4];
|
||||
__device__ xoshiro256ss(int seed) {
|
||||
constexpr uint64_t src[4] = {0xbb99e851d1f545cc, 0xbfc4022389ca40cb, 0xe84aff5cb1914af5, 0x845999858284de77};
|
||||
for(int i=0; i < 4; i++)
|
||||
s[i] = src[i] + (seed + i)*0xb45de8a52fdb65d3;
|
||||
}
|
||||
__device__ uint64_t operator()() {
|
||||
auto rol64 = [](uint64_t x, int k) {
|
||||
return (x << k) | (x >> (64 - k));
|
||||
};
|
||||
uint64_t const result = rol64(s[1] * 5, 7) * 9;
|
||||
uint64_t const t = s[1] << 17;
|
||||
s[2] ^= s[0];
|
||||
s[3] ^= s[1];
|
||||
s[1] ^= s[2];
|
||||
s[0] ^= s[3];
|
||||
s[2] ^= t;
|
||||
s[3] = rol64(s[3], 45);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
static __device__ int __reduce_max_sync(unsigned int mask, int value)
|
||||
{
|
||||
//We ignore mask, since all bits are set when calling them in the
|
||||
//test code below.
|
||||
int width = warpSize;
|
||||
for (unsigned int i = warpSize; i; i >>= 1) {
|
||||
value = max(__shfl_down(value, i, width), value);
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
template<typename F>
|
||||
__global__ void kernel() {
|
||||
using traits = float_traits<F>;
|
||||
constexpr int samps = 4<<10;
|
||||
__shared__ F accf[samps];
|
||||
__shared__ double accd[samps];
|
||||
|
||||
xoshiro256ss rng(threadIdx.x);
|
||||
float expo_avg = 1;
|
||||
for(int pass=0; pass < 2; pass++) {
|
||||
F scalar = traits::make(1.0/(3.14159 + .5*threadIdx.x));
|
||||
int err_max = 0;
|
||||
float coef = 0;
|
||||
double expo_sum = 0;
|
||||
int expo_n = 0;
|
||||
int max_ranks = std::is_same<F,float>::value ? 16<<10 : 1<<traits::mantissa_bits;
|
||||
for(int round=0; round < 1 + (16<<10)/max_ranks; round++) {
|
||||
//for(int round=0; round < 2; round++) {
|
||||
for(int i=threadIdx.x; i < samps; i += blockDim.x) {
|
||||
accf[i] = (F)0;
|
||||
accd[i] = 0;
|
||||
}
|
||||
__syncthreads();
|
||||
for(int r=0; r < max_ranks; r++) {
|
||||
int err = 0;
|
||||
for(int i=threadIdx.x; i < samps; i+=blockDim.x) {
|
||||
constexpr uint64_t m = (1ll<<traits::mantissa_bits)-1;
|
||||
double d = std::is_same<F,float>::value ? double(rng() & m) : 1.0;
|
||||
F f = traits::make(d);
|
||||
accf[i] = traits::add(accf[i], traits::mul(scalar, f));
|
||||
accd[i] += traits::todouble(f);
|
||||
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d f=%f\n", r, traits::todouble(accf[i]));
|
||||
int e = compare(accf[i], traits::mul(scalar, traits::make(accd[i])));
|
||||
err = err > e ? err : e;
|
||||
}
|
||||
err = __reduce_max_sync(-1u, err);
|
||||
err_max = err_max > err ? err_max : err;
|
||||
if (r >= 2) {
|
||||
// err = 1 + coef*pow(r,expo)
|
||||
float c = float(err-1)/powf(float(r), expo_avg);
|
||||
coef = coef > c ? coef : c;
|
||||
}
|
||||
if (r >= 2) {
|
||||
double expo = log2f(1+err_max)/log2f(r);
|
||||
expo_sum += expo;
|
||||
expo_n++;
|
||||
//if(threadIdx.x==0 && std::is_same<F,half>::value) std::printf(" r=%d err=%d errmax=%d expo=%f sum=%f n=%d\n", r, err, err_max, expo, expo_sum, expo_n);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(pass==0)
|
||||
expo_avg = expo_sum/expo_n;
|
||||
else if(threadIdx.x == 0)
|
||||
printf(" coef=%1.10f expo=%1.10f\n", coef, expo_avg);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
std::printf("type=float:\n");
|
||||
kernel<float><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=half:\n");
|
||||
kernel<half><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
|
||||
std::printf("\ntype=bfloat16:\n");
|
||||
kernel<bfloat16><<<1,32>>>();
|
||||
hipDeviceSynchronize();
|
||||
return 0;
|
||||
}
|
||||
文件差异内容过多而无法显示
加载差异
@@ -0,0 +1,66 @@
|
||||
/*************************************************************************
|
||||
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
* Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
*
|
||||
* See LICENSE.txt for license information
|
||||
************************************************************************/
|
||||
|
||||
#ifndef _d41d8cd98f00b204e9800998ecf8427e
|
||||
#define _d41d8cd98f00b204e9800998ecf8427e
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Routines for launching kernels that verify reduction results. A significant
|
||||
* feature of these routines is they carefully craft floating point input
|
||||
* to produce exactly predictable output.
|
||||
*
|
||||
* int elt_ty: actually just a ncclDataType_t
|
||||
*
|
||||
* int red_op: mostly just a ncclRedOp_t. Since PreMulSum ops are dynamically
|
||||
* created, these are encoded as the value ncclNumOps and their scalar is
|
||||
* assumed to be `ncclVerifiablePremulScalar(rank_me)`
|
||||
*
|
||||
* uint64_t seed: arbitrary 64-bits to use in seeding the random values
|
||||
*
|
||||
* intptr_t elt_ix0: index of first element pointed to by elts when generating
|
||||
* random values. This makes it possible to generate subsequences independently
|
||||
* as well as in aggregate.
|
||||
*
|
||||
* int rank_n: Number of contributions into the reduction. Non-reduction
|
||||
* collectives like broadcast, gather, etc will always set this to one.
|
||||
*
|
||||
* int rank_me: Index of this contribution
|
||||
*/
|
||||
|
||||
// Use this as the local scalar for PreMulSum ops
|
||||
template<typename T>
|
||||
__host__ __device__ T ncclVerifiablePremulScalar(int rank_me) {
|
||||
return T(rank_me%2 == 0 ? 1.0f : 2.0f);
|
||||
}
|
||||
|
||||
// Enqueue kernel to generate data which is to be reduced.
|
||||
void ncclVerifiablePrepareInput(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n, int rank_me,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to generate expected results of reduction.
|
||||
void ncclVerifiablePrepareExpected(
|
||||
void *elts, intptr_t elt_n, int elt_ty, int red_op, int rank_n,
|
||||
uint64_t seed, intptr_t elt_ix0, hipStream_t stream
|
||||
);
|
||||
|
||||
// Enqueue kernel to verify reduced data matches expectation. The number of
|
||||
// failed elements is written to bad_elt_n which must be in cudaHost memory.
|
||||
// If `expected == nullptr` then the expected results are generated on-the-fly
|
||||
// which can be costly. Thus if you plan to run the same reduction multiple
|
||||
// times it is advantageous to precompute the expected values with
|
||||
// ncclVerifiablePrepareExpected and pass them as `expected` here.
|
||||
void ncclVerifiableVerify(
|
||||
void const *results, void const *expected, intptr_t elt_n, int elt_ty,
|
||||
int red_op, int rank_n, uint64_t seed, intptr_t elt_ix0,
|
||||
int64_t *bad_elt_n, hipStream_t stream
|
||||
);
|
||||
#endif
|
||||
@@ -0,0 +1,18 @@
|
||||
# Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
# Modifications Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved.
|
||||
#
|
||||
# See LICENSE.txt for license information
|
||||
|
||||
|
||||
# We requires both of the following paths to be set upon including this makefile
|
||||
# TEST_VERIFIABLE_SRCDIR = <points to this directory>
|
||||
# TEST_VERIFIABLE_BUILDDIR = <points to destination of .o file>
|
||||
|
||||
TEST_VERIFIABLE_HDRS = $(TEST_VERIFIABLE_SRCDIR)/verifiable.h
|
||||
TEST_VERIFIABLE_OBJS = $(TEST_VERIFIABLE_BUILDDIR)/verifiable.o
|
||||
|
||||
$(TEST_VERIFIABLE_BUILDDIR)/verifiable.o: $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu $(TEST_VERIFY_REDUCE_HDRS)
|
||||
@printf "Compiling %s\n" $@
|
||||
@mkdir -p $(TEST_VERIFIABLE_BUILDDIR)
|
||||
echo " $(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu"
|
||||
$(HIPCC) -o $@ $(HIPCUFLAGS) -c $(TEST_VERIFIABLE_SRCDIR)/verifiable.cu
|
||||
在新工单中引用
屏蔽一个用户