Merge remote-tracking branch 'nccl/master' into develop
[ROCm/rccl-tests commit: bb0f15d407]
This commit is contained in:
@@ -7,9 +7,9 @@
|
|||||||
BUILDDIR ?= build
|
BUILDDIR ?= build
|
||||||
override BUILDDIR := $(abspath $(BUILDDIR))
|
override BUILDDIR := $(abspath $(BUILDDIR))
|
||||||
|
|
||||||
.PHONY : all clean
|
.PHONY: all clean
|
||||||
|
|
||||||
default : src.build
|
default: src.build
|
||||||
|
|
||||||
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
|
TARGETS=$(filter-out src/hypercube.cu, $(wildcard src/*))
|
||||||
|
|
||||||
|
|||||||
@@ -46,9 +46,9 @@ Run on 8 GPUs (`-g 8`), scanning from 8 Bytes to 128MBytes :
|
|||||||
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
|
$ ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 8
|
||||||
```
|
```
|
||||||
|
|
||||||
Run with MPI on 40 processes (potentially on multiple nodes) with 4 GPUs each :
|
Run with MPI on 10 processes (potentially on multiple nodes) with 4 GPUs each, for a total of 40 GPUs:
|
||||||
```shell
|
```shell
|
||||||
$ mpirun -np 40 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
|
$ mpirun -np 10 ./build/all_reduce_perf -b 8 -e 128M -f 2 -g 4
|
||||||
```
|
```
|
||||||
|
|
||||||
### Performance
|
### Performance
|
||||||
@@ -66,7 +66,7 @@ All tests support the same set of arguments :
|
|||||||
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
|
* `-b,--minbytes <min size in bytes>` minimum size to start with. Default : 32M.
|
||||||
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
|
* `-e,--maxbytes <max size in bytes>` maximum size to end at. Default : 32M.
|
||||||
* Increments can be either fixed or a multiplication factor. Only one of those should be used
|
* Increments can be either fixed or a multiplication factor. Only one of those should be used
|
||||||
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : (max-min)/10.
|
* `-i,--stepbytes <increment size>` fixed increment between sizes. Default : 1M.
|
||||||
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
|
* `-f,--stepfactor <increment factor>` multiplication factor between sizes. Default : disabled.
|
||||||
* RCCL operations arguments
|
* RCCL operations arguments
|
||||||
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
|
* `-o,--op <sum/prod/min/max/avg/all>` Specify which reduction operation to perform. Only relevant for reduction operations like Allreduce, Reduce or ReduceScatter. Default : Sum.
|
||||||
|
|||||||
@@ -310,7 +310,7 @@ testResult_t CheckData(struct threadArgs* args, ncclDataType_t type, ncclRedOp_t
|
|||||||
|
|
||||||
*wrongElts = 0;
|
*wrongElts = 0;
|
||||||
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
|
for (int i=0; i < args->nGpus; i++) *wrongElts += wrongPerGpu[i];
|
||||||
hipFree(wrongPerGpu);
|
hipHostFree(wrongPerGpu);
|
||||||
|
|
||||||
if (args->reportErrors && *wrongElts) args->errors[0]++;
|
if (args->reportErrors && *wrongElts) args->errors[0]++;
|
||||||
return testSuccess;
|
return testSuccess;
|
||||||
@@ -1169,6 +1169,8 @@ testResult_t run() {
|
|||||||
errors[t] = bw_count[t] = 0;
|
errors[t] = bw_count[t] = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
const char* timeStr = report_cputime ? "cputime" : "time";
|
const char* timeStr = report_cputime ? "cputime" : "time";
|
||||||
PRINT("#\n");
|
PRINT("#\n");
|
||||||
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
|
PRINT("# %10s %12s %8s %6s %6s out-of-place in-place \n", "", "", "", "", "");
|
||||||
|
|||||||
@@ -177,15 +177,46 @@ static void getHostName(char* hostname, int maxlen) {
|
|||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
static uint64_t getHostHash(const char* string) {
|
static uint64_t getHash(const char* string, size_t n) {
|
||||||
// Based on DJB2, result = result * 33 + char
|
// Based on DJB2a, result = result * 33 ^ char
|
||||||
uint64_t result = 5381;
|
uint64_t result = 5381;
|
||||||
for (int c = 0; string[c] != '\0'; c++){
|
for (size_t c = 0; c < n; c++) {
|
||||||
result = ((result << 5) + result) + string[c];
|
result = ((result << 5) + result) ^ string[c];
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Generate a hash of the unique identifying string for this host
 * that will be unique for both bare-metal and container instances.
 * Equivalent of a hash of:
 *
 * $(hostname)$(cat /proc/sys/kernel/random/boot_id)
 *
 * hostname must be a NUL-terminated string. If the boot id file cannot
 * be opened or read, only the hostname is hashed. The combined string is
 * truncated to fit the local 1024-byte buffer (including the terminator).
 */
#define HOSTID_FILE "/proc/sys/kernel/random/boot_id"
static uint64_t getHostHash(const char* hostname) {
  char hostHash[1024];

  // Fall back is the hostname if something fails.
  // snprintf always NUL-terminates (strncpy does not when the source
  // fills the buffer), so the strlen below can never run off the end.
  (void) snprintf(hostHash, sizeof(hostHash), "%s", hostname);
  size_t offset = strlen(hostHash);

  FILE *file = fopen(HOSTID_FILE, "r");
  if (file != NULL) {
    char *p = NULL;
    // "%ms" makes fscanf allocate the buffer for the token; we own it
    // and must free it.
    if (fscanf(file, "%ms", &p) == 1) {
      // Append the boot id right after the hostname, truncating if needed.
      (void) snprintf(hostHash + offset, sizeof(hostHash) - offset, "%s", p);
      free(p);
    }
    // Only close a stream that was actually opened: fclose(NULL) is
    // undefined behaviour.
    fclose(file);
  }

  return getHash(hostHash, strlen(hostHash));
}
|
||||||
|
|
||||||
static size_t wordSize(ncclDataType_t type) {
|
static size_t wordSize(ncclDataType_t type) {
|
||||||
switch(type) {
|
switch(type) {
|
||||||
case ncclChar:
|
case ncclChar:
|
||||||
|
|||||||
Referens i nytt ärende
Block a user