From e2a9cbb3620b594e23f373fccbcaa4db40e9d105 Mon Sep 17 00:00:00 2001 From: Junyu Ma Date: Thu, 23 Jan 2025 11:09:09 -0800 Subject: [PATCH] Perftests: Introduce NCCL_TESTS_SPLIT env `NCCL_TESTS_SPLIT` serves as a new way of computing the color for splitting communicators. It is overridden by `NCCL_TESTS_SPLIT_MASK` when both are set. Examples: NCCL_TESTS_SPLIT_MASK="0x7" # color = rank & 0x7. What we do today to run on a DGX with one GPU per node. NCCL_TESTS_SPLIT="AND 0x7" # color = rank & 0x7. New way to run on one GPU per node on a DGX, equivalent to NCCL_TESTS_SPLIT_MASK=0x7 NCCL_TESTS_SPLIT="MOD 72" # color = rank % 72. One GPU per NVLink domain on an NVL72 system. NCCL_TESTS_SPLIT="DIV 72" # color = rank / 72. Intra NVLink domain on NVL72. You can also use: "%" "&" "|" "/" for short. Extra spaces in the middle will be automatically ignored. Not case sensitive. The following are all equivalent: NCCL_TESTS_SPLIT="&0x7" NCCL_TESTS_SPLIT="&0b111" NCCL_TESTS_SPLIT="AND 7" NCCL_TESTS_SPLIT="and 0x7" [ROCm/rccl-tests commit: a89cf07fe879e1c0187a4f617f873ae47d69af6b] --- projects/rccl-tests/src/common.cu | 51 +++++++++++++++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/projects/rccl-tests/src/common.cu b/projects/rccl-tests/src/common.cu index 6d103d797d..9277ea2b15 100644 --- a/projects/rccl-tests/src/common.cu +++ b/projects/rccl-tests/src/common.cu @@ -10,6 +10,8 @@ #include #include #include +#include +#include #include "cuda.h" #include "../verifiable/verifiable.h" @@ -892,6 +894,26 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef MPI_SUPPORT +// parse int for base 2/10/16, will ignore first whitespaces +static bool parseInt(char *s, int *num) { + char *p = NULL; + if (!s || !num) + return false; + while (*s && isspace(*s)) ++s; + if (!*s) return false; + + if (strncasecmp(s, "0b", 2) == 0) + *num = (int)strtoul(s + 2, &p, 2); + else + *num = (int)strtoul(s, &p, 0); + + if (p == s) + return false; + return true; +} +#endif + testResult_t 
run() { int totalProcs = 1, proc = 0, ncclProcs = 1, ncclProc = 0, color = 0; int localRank = 0; @@ -909,10 +931,33 @@ testResult_t run() { if (hostHashs[p] == hostHashs[proc]) localRank++; } - char* str = getenv("NCCL_TESTS_SPLIT_MASK"); - uint64_t mask = str ? strtoul(str, NULL, 16) : 0; + char *splitMaskEnv = NULL; + if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT_MASK")) { + color = proc & strtoul(splitMaskEnv, NULL, 16); + } else if (splitMaskEnv = getenv("NCCL_TESTS_SPLIT")) { + if ( + (strncasecmp(splitMaskEnv, "AND", strlen("AND")) == 0 && parseInt(splitMaskEnv + strlen("AND"), &color)) || + (strncasecmp(splitMaskEnv, "&", strlen("&")) == 0 && parseInt(splitMaskEnv + strlen("&"), &color)) + ) + color = proc & color; + if ( + (strncasecmp(splitMaskEnv, "OR", strlen("OR")) == 0 && parseInt(splitMaskEnv + strlen("OR"), &color)) || + (strncasecmp(splitMaskEnv, "|", strlen("|")) == 0 && parseInt(splitMaskEnv + strlen("|"), &color)) + ) + color = proc | color; + if ( + (strncasecmp(splitMaskEnv, "MOD", strlen("MOD")) == 0 && parseInt(splitMaskEnv + strlen("MOD"), &color)) || + (strncasecmp(splitMaskEnv, "%", strlen("%")) == 0 && parseInt(splitMaskEnv + strlen("%"), &color)) + ) + color = proc % color; + if ( + (strncasecmp(splitMaskEnv, "DIV", strlen("DIV")) == 0 && parseInt(splitMaskEnv + strlen("DIV"), &color)) || + (strncasecmp(splitMaskEnv, "/", strlen("/")) == 0 && parseInt(splitMaskEnv + strlen("/"), &color)) + ) + color = proc / color; + } + MPI_Comm mpi_comm; - color = proc & mask; MPI_Comm_split(MPI_COMM_WORLD, color, proc, &mpi_comm); MPI_Comm_size(mpi_comm, &ncclProcs); MPI_Comm_rank(mpi_comm, &ncclProc);