From c0e3f4d443a3b3be6a89e1f79792d447bef6d4b1 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Wed, 12 Oct 2022 01:23:46 -0700 Subject: [PATCH 1/9] Fix build on RHEL7 with GCC 4.8 Add -std=c++11 to CXXFLAGS. Fixes #116. [ROCm/rccl-tests commit: 365b92a1ead1b80077fac0929e2bbfbd25cdcdd0] --- projects/rccl-tests/src/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile index 6d8b1ef40f..6ea07303b5 100644 --- a/projects/rccl-tests/src/Makefile +++ b/projects/rccl-tests/src/Makefile @@ -35,6 +35,7 @@ NVCC_GENCODE ?= -gencode=arch=compute_35,code=sm_35 \ endif NVCUFLAGS := -ccbin $(CXX) $(NVCC_GENCODE) -std=c++11 +CXXFLAGS := -std=c++11 LDFLAGS := -L${CUDA_LIB} -lcudart -lrt NVLDFLAGS := -L${CUDA_LIB} -l${CUDARTLIB} -lrt From 04b5c40b1c62681c0cc03ec0de38306e51fbe4b1 Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 22 Nov 2022 11:16:47 -0800 Subject: [PATCH 2/9] Add fflush(stdout) before perf output [ROCm/rccl-tests commit: 3bd2bd292bd3b249892b63f6342d1ca559c37391] --- projects/rccl-tests/src/common.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 5837ed1bcd..41d747905b 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -969,6 +969,8 @@ testResult_t run() { errors[t] = bw_count[t] = 0; } + fflush(stdout); + const char* timeStr = report_cputime ? 
"cputime" : "time"; PRINT("#\n"); PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", ""); From 6313530fccbc112cec7ddae1b5a828b17081cc8d Mon Sep 17 00:00:00 2001 From: David Addison Date: Tue, 22 Nov 2022 11:18:37 -0800 Subject: [PATCH 3/9] Call cudaFreeHost() on wrongPerGpu not cudaFree() [ROCm/rccl-tests commit: 24fcf64ed19bb178aa867b14c1d7f13493656e74] --- projects/rccl-tests/src/common.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 41d747905b..48a629ce10 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -262,7 +262,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t *wrongElts = 0; for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i]; - cudaFree(wrongPerGpu); + cudaFreeHost(wrongPerGpu); if (args->reportErrors && *wrongElts) args->errors[0]++; return testSuccess; From 5ba670d55150f7c9dcf590e5139dcc9945f65c53 Mon Sep 17 00:00:00 2001 From: Jithin Jose Date: Fri, 18 Dec 2020 10:12:54 -0800 Subject: [PATCH 4/9] Use DJB2a hash algorithm in getHostHash() [ROCm/rccl-tests commit: 0aeba157db77cc9e99186639bf71368b74fb90e2] --- projects/rccl-tests/src/common.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rccl-tests/src/common.h b/projects/rccl-tests/src/common.h index b69d071606..6fc6bfdd69 100644 --- a/projects/rccl-tests/src/common.h +++ b/projects/rccl-tests/src/common.h @@ -174,10 +174,10 @@ static void getHostName(char* hostname, int maxlen) { #include static uint64_t getHostHash(const char* string) { - // Based on DJB2, result = result * 33 + char + // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; string[c] != '\0'; c++){ - result = ((result << 5) + result) + string[c]; + result = ((result << 5) + result) ^ string[c]; } return result; } From 129a1b4b7800433e02345c25822cdb1129a40d2a Mon Sep 
17 00:00:00 2001 From: David Addison Date: Mon, 4 Jan 2021 11:37:32 -0800 Subject: [PATCH 5/9] Add boot_id to the hostname hash due to collisions on Azure Fixes #60 [ROCm/rccl-tests commit: 0b4c4cb99fb1381edec1f78c37230688ea1ceb26] --- projects/rccl-tests/src/common.h | 35 ++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/projects/rccl-tests/src/common.h b/projects/rccl-tests/src/common.h index 6fc6bfdd69..20fa4612db 100644 --- a/projects/rccl-tests/src/common.h +++ b/projects/rccl-tests/src/common.h @@ -173,15 +173,46 @@ static void getHostName(char* hostname, int maxlen) { #include <stdint.h> -static uint64_t getHostHash(const char* string) { +static uint64_t getHash(const char* string, size_t n) { // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; - for (int c = 0; string[c] != '\0'; c++){ + for (size_t c = 0; c < n; c++) { result = ((result << 5) + result) ^ string[c]; } return result; } +/* Generate a hash of the unique identifying string for this host + * that will be unique for both bare-metal and container instances + * Equivalent of a hash of; + * + * $(hostname)$(cat /proc/sys/kernel/random/boot_id) + * + */ +#define HOSTID_FILE "/proc/sys/kernel/random/boot_id" +static uint64_t getHostHash(const char* hostname) { + char hostHash[1024]; + + // Fall back is the hostname if something fails + (void) strncpy(hostHash, hostname, sizeof(hostHash)); + int offset = strlen(hostHash); + + FILE *file = fopen(HOSTID_FILE, "r"); + if (file != NULL) { + char *p; + if (fscanf(file, "%ms", &p) == 1) { + strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1); + free(p); + } + } + fclose(file); + + // Make sure the string is terminated + hostHash[sizeof(hostHash)-1]='\0'; + + return getHash(hostHash, strlen(hostHash)); +} + static size_t wordSize(ncclDataType_t type) { switch(type) { case ncclChar: From b70cac2b3356343ab90385d5fb3afe4c13ca82c5 Mon Sep 17 00:00:00 2001 From: Sylvain Jeaugey Date: Tue, 3 Jan 2023 08:47:43 
+0100 Subject: [PATCH 6/9] Update README.md Improve MPI example to avoid confusion of number of processes / total number of GPUs. https://github.com/NVIDIA/nccl-tests/issues/54#issuecomment-1212023369 [ROCm/rccl-tests commit: 2cbb968101e2bfc7d3a7f0f1826c0189355de6fe] --- projects/rccl-tests/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index bff6433b89..12be254542 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -29,9 +29,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes : $ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8 ``` -Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each : +Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs: ```shell -$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 +$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4 ``` ### Performance From b3db782c3f430e2e38a9b8d61810186539b3e19f Mon Sep 17 00:00:00 2001 From: Felix Abecassis Date: Thu, 23 Mar 2023 09:05:41 -0700 Subject: [PATCH 7/9] Update README.md [ROCm/rccl-tests commit: 17d0a42d5a4328e0e0e0d68440d8821224826d2f] --- projects/rccl-tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rccl-tests/README.md b/projects/rccl-tests/README.md index 12be254542..580996b28d 100644 --- a/projects/rccl-tests/README.md +++ b/projects/rccl-tests/README.md @@ -49,7 +49,7 @@ All tests support the same set of arguments : * `-b,--minbytes ` minimum size to start with. Default : 32M. * `-e,--maxbytes ` maximum size to end at. Default : 32M. * Increments can be either fixed or a multiplication factor. Only one of those should be used - * `-i,--stepbytes ` fixed increment between sizes. Default : (max-min)/10. + * `-i,--stepbytes ` fixed increment between sizes. Default : 1M. 
* `-f,--stepfactor ` multiplication factor between sizes. Default : disabled. * NCCL operations arguments * `-o,--op ` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum. From 4fd5ceeba858ce26d080a7a227db6f958d28a1a1 Mon Sep 17 00:00:00 2001 From: "alan.souza" Date: Sat, 25 Mar 2023 16:56:16 -0300 Subject: [PATCH 8/9] fix handling of variable NVCC. Permit overriding the variable using environment variables [ROCm/rccl-tests commit: 7ccda3c97baf6924ff38411e364c0442096fc4be] --- projects/rccl-tests/src/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/rccl-tests/src/Makefile b/projects/rccl-tests/src/Makefile index 6ea07303b5..393de8e41b 100644 --- a/projects/rccl-tests/src/Makefile +++ b/projects/rccl-tests/src/Makefile @@ -11,7 +11,7 @@ DEBUG ?= 0 CUDA_LIB ?= $(CUDA_HOME)/lib64 CUDA_INC ?= $(CUDA_HOME)/include -NVCC = $(CUDA_HOME)/bin/nvcc +NVCC ?= $(CUDA_HOME)/bin/nvcc CUDARTLIB ?= cudart CUDA_VERSION = $(strip $(shell which $(NVCC) >/dev/null && $(NVCC) --version | grep release | sed 's/.*release //' | sed 's/\,.*//')) From cba8bfd093f9f78b73ef5923160b2d5bcdfe762e Mon Sep 17 00:00:00 2001 From: yangxingwu Date: Tue, 6 Jun 2023 09:47:50 +0000 Subject: [PATCH 9/9] makefile: remove extra space [ROCm/rccl-tests commit: 52ea1b214802fc37ef4baa29eb19942dcbf0a187] --- projects/rccl-tests/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/rccl-tests/Makefile b/projects/rccl-tests/Makefile index 43729f897a..f652b78a99 100644 --- a/projects/rccl-tests/Makefile +++ b/projects/rccl-tests/Makefile @@ -7,9 +7,9 @@ BUILDDIR ?= build override BUILDDIR := $(abspath $(BUILDDIR)) -.PHONY : all clean +.PHONY: all clean -default : src.build +default: src.build TARGETS=src