Merge pull request #147 from nileshnegi/sync/nccl-tests_v2.16.7

[SYNC] NCCL-Tests v2.16.7
This commit is contained in:
nileshnegi
2025-08-18 15:28:34 -04:00
bovenliggende a7809b3243 6f1b11ad49
commit 690f97c119
2 gewijzigde bestanden met toevoegingen van 17 en 4 verwijderingen
+6 -3
Bestand weergeven
@@ -22,9 +22,12 @@ HIPCUFLAGS := -std=c++14
LDFLAGS :=
HIPLDFLAGS :=
MPI ?= 0 # Set to 1 to enable MPI support (multi-process/multi-node)
NAME_SUFFIX ?= # e.g. _mpi when using MPI=1
DSO ?= 0 # Set to 1 to create and use libverifiable.so to reduce binary size
# Set to 1 to enable MPI support (multi-process/multi-node)
MPI ?= 0
# e.g. Set to _mpi when using MPI=1
NAME_SUFFIX ?=
# Set to 1 to create and use libverifiable.so to reduce binary size
DSO ?= 0
HIP_VERSION = $(strip $(shell which $(HIPCONFIG) >/dev/null && $(HIPCONFIG) --version))
HIP_MAJOR = $(shell echo $(HIP_VERSION) | cut -d "." -f 1)
+11 -1
Bestand weergeven
@@ -21,12 +21,16 @@
#include "cuda.h"
#include <vector>
#include <utility>
#include <errno.h> /* program_invocation_short_name */
//#define DEBUG_PRINT
#include "verifiable.h"
#include "git_version.h"
// Ceiling division: x / y rounded up, for non-negative integer x and positive integer y.
// Both arguments are parenthesized in the expansion; y is expanded twice, so avoid
// passing an expression with side effects.
#define DIVUP(x, y) (((x)+(y)-1)/(y))
int test_ncclVersion = 0; // starts at 0; init'd with ncclGetVersion()
int32_t gpu_block3; // NOTE(review): purpose is not visible in this excerpt -- confirm and document
size_t cache_bytes = 192 * 1024 * 1024; // 192 MiB, i.e. 192 * 1024 * 1024 bytes
@@ -1446,6 +1450,7 @@ testResult_t run() {
#endif
is_main_thread = is_main_proc = (proc == 0) ? 1 : 0;
PRINT("# Collective test starting: %s\n", program_invocation_short_name);
PRINT("# nThread %d nGpus %d minBytes %ld maxBytes %ld step: %ld(%s) warmup iters: %d iters: %d agg iters: %d validation: %d graph: %d\n",
nThreads, nGpus, minBytes, maxBytes,
(stepFactor > 1)?stepFactor:stepBytes, (stepFactor > 1)?"factor":"bytes",
@@ -1488,10 +1493,14 @@ testResult_t run() {
PRINT("%s", line);
#endif
// Reserve 1GiB of memory for each 16GiB installed, but limit to a max of 4GiB
const size_t GB = (1ULL << 30);
size_t reserveMem = std::min(DIVUP(maxMem, 16*GB) * 1*GB, 4*GB);
// We need sendbuff, recvbuff, expected (when datacheck enabled), plus 1G for the rest.
size_t memMaxBytes = (maxMem - (1<<30)) / (datacheck ? 3 : 2);
size_t memMaxBytes = (maxMem - reserveMem - 1*GB) / (datacheck ? 3 : 2);
if (maxBytes > memMaxBytes) {
maxBytes = memMaxBytes;
if (minBytes > maxBytes) minBytes = maxBytes;
if (proc == 0) printf("#\n# Reducing maxBytes to %ld due to memory limitation\n", maxBytes);
}
@@ -1723,6 +1732,7 @@ testResult_t run() {
PRINT("# Out of bounds values : %d %s\n", errors[0], errors[0] ? "FAILED" : "OK");
PRINT("# Avg bus bandwidth : %g %s\n", bw[0], check_avg_bw == -1 ? "" : (bw[0] < check_avg_bw*(0.9) ? "FAILED" : "OK"));
PRINT("#\n");
PRINT("# Collective test concluded: %s\n", program_invocation_short_name);
#ifdef MPI_SUPPORT
MPI_Comm_free(&mpi_comm);
MPI_Finalize();