Merge remote-tracking branch 'nccl/master' into develop

[ROCm/rccl-tests commit: bb0f15d407]
Tá an tiomantas seo le fáil i:
Wenkai Du
2023-06-14 08:20:29 -07:00
tuismitheoir e07d7ec1b5 876c86fcdb
tiomantas a2e8a44adb
D'athraigh 4 comhad le 43 breiseanna agus 10 scriosta
+2 -2
Féach ar an gComhad
@@ -7,9 +7,9 @@
BUILDDIR ?= build
override BUILDDIR := $(abspath $(BUILDDIR))
.PHONY : all clean
.PHONY: all clean
default : src.build
default: src.build
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
+3 -3
Féach ar an gComhad
@@ -46,9 +46,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
```
Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
```shell
$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
```
### Performance
@@ -66,7 +66,7 @@ All tests support the same set of arguments :
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
* Increments can be either fixed or a multiplication factor. Only one of those should be used
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
* RCCL operations arguments
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
+3 -1
Féach ar an gComhad
@@ -310,7 +310,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
*wrongElts = 0;
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
hipFree(wrongPerGpu);
hipHostFree(wrongPerGpu);
if (args->reportErrors && *wrongElts) args->errors[0]++;
return testSuccess;
@@ -1169,6 +1169,8 @@ testResult_t run() {
errors[t] = bw_count[t] = 0;
}
fflush(stdout);
const char* timeStr = report_cputime ? "cputime" : "time";
PRINT("#\n");
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
+35 -4
Féach ar an gComhad
@@ -177,15 +177,46 @@ static void getHostName(char* hostname, int maxlen) {
#include <stdint.h>
static uint64_t getHostHash(const char* string) {
// Based on DJB2, result = result * 33 + char
static uint64_t getHash(const char* string, size_t n) {
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (int c = 0; string[c] != '\0'; c++){
result = ((result << 5) + result) + string[c];
for (size_t c = 0; c < n; c++) {
result = ((result << 5) + result) ^ string[c];
}
return result;
}
/* Generate a hash of the unique identifying string for this host
 * that will be unique for both bare-metal and container instances.
 * Equivalent of a hash of:
 *
 * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
 *
 * hostname must be a NUL-terminated string. If HOSTID_FILE cannot be
 * opened or yields no token, the hash covers the hostname alone. The
 * concatenated string is truncated to fit the local 1024-byte buffer.
 * Returns getHash() of the resulting string.
 */
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static uint64_t getHostHash(const char* hostname) {
  char hostHash[1024];

  // Fall back is the hostname if something fails.
  // strncpy does not terminate on truncation, so copy at most size-1 bytes
  // and terminate explicitly before strlen() walks the buffer.
  (void) strncpy(hostHash, hostname, sizeof(hostHash)-1);
  hostHash[sizeof(hostHash)-1] = '\0';
  size_t offset = strlen(hostHash);  // always <= sizeof(hostHash)-1

  FILE *file = fopen(HOSTID_FILE, "r");
  if (file != NULL) {
    char *p;
    // %ms makes fscanf allocate the token; it must be released with free()
    if (fscanf(file, "%ms", &p) == 1) {
      // offset <= sizeof-1, so the remaining length cannot wrap around
      strncpy(hostHash+offset, p, sizeof(hostHash)-offset-1);
      free(p);
    }
    // Only close a stream that was actually opened; fclose(NULL) is undefined
    fclose(file);
  }

  // Make sure the string is terminated
  hostHash[sizeof(hostHash)-1]='\0';

  return getHash(hostHash, strlen(hostHash));
}
static size_t wordSize(ncclDataType_t type) {
switch(type) {
case ncclChar: