diff --git a/projects/rccl/LICENSE.txt b/projects/rccl/LICENSE.txt index 2a454ff322..1eddd43e2a 100644 --- a/projects/rccl/LICENSE.txt +++ b/projects/rccl/LICENSE.txt @@ -1,5 +1,5 @@ - Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved. + Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without @@ -30,3 +30,11 @@ The U.S. Department of Energy funded the development of this software under subcontract 7078610 with Lawrence Berkeley National Laboratory. + +This code also includes files from the NVIDIA Tools Extension SDK project. + +See: + + https://github.com/NVIDIA/NVTX + +for more information and license details. diff --git a/projects/rccl/README.md b/projects/rccl/README.md index 2e8dcaee3f..433387fe85 100644 --- a/projects/rccl/README.md +++ b/projects/rccl/README.md @@ -86,6 +86,6 @@ Please refer to the [Library documentation](http://rccl.readthedocs.io/) for cur ## Copyright -All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved. +All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved. -All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. \ No newline at end of file +All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved. diff --git a/projects/rccl/makefiles/version.mk b/projects/rccl/makefiles/version.mk index f2539c595b..f64e8ad49a 100644 --- a/projects/rccl/makefiles/version.mk +++ b/projects/rccl/makefiles/version.mk @@ -1,6 +1,6 @@ ##### version NCCL_MAJOR := 2 NCCL_MINOR := 8 -NCCL_PATCH := 3 +NCCL_PATCH := 4 NCCL_SUFFIX := PKG_REVISION := 1 diff --git a/projects/rccl/src/bootstrap.cc b/projects/rccl/src/bootstrap.cc index 9bf8d6840d..b4db1368fd 100644 --- a/projects/rccl/src/bootstrap.cc +++ b/projects/rccl/src/bootstrap.cc @@ -78,7 +78,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) { int recvSize; NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int))); if (recvSize > size) { - WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size); + WARN("Message truncated : received %d bytes instead of %d", recvSize, size); return ncclInternalError; } NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size))); @@ -274,7 +274,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) { void* ncclRemoteMemAllocationService(void* args) { struct remAllocState* state = (struct remAllocState *) args; if (hipSetDevice(state->cudaDev) != hipSuccess) { - WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev); + WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev); } // Prepare poll descriptor @@ -523,7 +523,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) { ncclResult_t bootstrapClose(void* commState) { struct extState* state = (struct extState*)commState; if (state->unexpectedConnections != NULL) { - WARN("Unexpected connections are not empty.\n"); + WARN("Unexpected connections are not empty"); return ncclInternalError; } close(state->extListenFd); diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index f5e769a5a2..43848017f6 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -376,7 +376,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) { info->nstepsPerLoop = 1; info->nchunksPerLoop = info->comm->nRanks; break; default: - WARN("Unknown pattern %d\n", info->pattern); + WARN("Unknown pattern %d", info->pattern); return ncclInternalError; } return ncclSuccess; diff --git a/projects/rccl/src/graph/connect.cc b/projects/rccl/src/graph/connect.cc index be422bd32d..737f3218e8 100644 --- a/projects/rccl/src/graph/connect.cc +++ b/projects/rccl/src/graph/connect.cc @@ -117,7 +117,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) { int x = 0; while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++; if (x == NCCL_MAX_TREE_ARITY) { - WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]); + WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]); return ncclInternalError; } tree->down[x] = indexes[d]; @@ -215,7 +215,7 @@ int ncclMinNchannels() { if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings(); if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels(); if (minNchannels > MAXCHANNELS) { - WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS); + WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS); minNchannels = MAXCHANNELS; } if (minNchannels < 0) minNchannels = 0; @@ -227,7 +227,7 @@ int ncclMaxNchannels() { if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels(); if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS; if (maxNchannels < 1) { - WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels); + WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels); maxNchannels = 1; } return maxNchannels; diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index c9924229be..13a6434dd4 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -26,7 +26,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode* return ncclSuccess; } } - WARN("Could not find node of type %d id %lx\n", t, id); + WARN("Could not find node of type %d id %lx", t, id); return ncclInternalError; } @@ -282,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_ if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB; else p2pLevel = PATH_SYS; } + if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { + p2pLevel = PATH_PXB; + } compare: // Compute the PCI distance and compare with the p2pLevel. diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index 1d732a92ff..097dcaaeb4 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -58,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode* return ncclSuccess; } } - WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id); + WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id); return ncclInternalError; } @@ -191,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in return ncclSuccess; } } - WARN("Could not find gpu rank %d\n", rank); + WARN("Could not find gpu rank %d", rank); return ncclInternalError; } @@ -202,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int* return ncclSuccess; } } - WARN("Could not find net id %lx\n", id); + WARN("Could not find net id %lx", id); return ncclInternalError; } @@ -1253,7 +1253,7 @@ done: } if (graph->nChannels == 0 && graph->collNet == 0) { - WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern); + WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern); for (int i=0; iintra[i] = system->nodes[GPU].nodes[i].gpu.rank; graph->inter[0] = graph->inter[1] = 0; graph->speedIntra = graph->speedInter = 0.1; diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index b50b89c04b..61a38c1c84 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -82,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) { *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH; } + if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { + *width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH; + } return ncclSuccess; } @@ -104,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode* ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) { if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) { - WARN("Error : tried to create too many nodes of type %d\n", type); + WARN("Error : tried to create too many nodes of type %d", type); return ncclInternalError; } struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count; @@ -379,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s } struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } }; -struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } }; +struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } }; ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) { int numaId; @@ -402,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW; + } else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) { + int familyId, modelId; + NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId)); + NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId)); + if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG; } if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) { int familyId, modelId; @@ -486,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem* NCCLCHECK(busIdToInt64(parentBusId, &pBusId)); NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId)); if (gpu == NULL) { - WARN("Add NVLink error : could not find GPU %lx\n", pBusId); + WARN("Add NVLink error : could not find GPU %lx", pBusId); return ncclInternalError; } int count; diff --git a/projects/rccl/src/graph/topo.h b/projects/rccl/src/graph/topo.h index 9379844bf2..379b5a3351 100644 --- a/projects/rccl/src/graph/topo.h +++ b/projects/rccl/src/graph/topo.h @@ -20,6 +20,8 @@ #define PCI_WIDTH 12.0 // PCI Gen3 x16 #define QPI_WIDTH 6.0 #define SKL_QPI_WIDTH 9.0 +#define ZPI_WIDTH 6.0 +#define YONGFENG_ZPI_WIDTH 9.0 #define P9_WIDTH 32.0 #define ARM_WIDTH 6.0 #define NET_WIDTH 12.0 // 100Gbit diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc index a4da61d60e..1e61332fda 100644 --- a/projects/rccl/src/graph/xml.cc +++ b/projects/rccl/src/graph/xml.cc @@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) { if (c == '=') { ptr[o] = '\0'; if (value == NULL) { - WARN("XML Parse : Unexpected value with name %s\n", ptr); + WARN("XML Parse : Unexpected value with name %s", ptr); return ncclInternalError; } return xmlGetValue(file, value, last); @@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { // Re-read the name, we got '/' in the first call NCCLCHECK(xmlGetToken(file, node->name, NULL, &c)); if (c != '>') { - WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name); + WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name); return ncclInternalError; } return ncclSuccess; @@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) { while (c == ' ') { NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c)); if (a == MAX_ATTR_COUNT) { - INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT); + INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT); // Actually we need to still consume the extra attributes so we have an extra one. } else a++; } @@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess; while (1) { if (xml->maxIndex == MAX_NODES) { - WARN("Error : XML parser is limited to 1024 nodes\n"); + WARN("Error : XML parser is limited to 1024 nodes"); return ncclInternalError; } struct ncclXmlNode* node = xml->nodes+xml->maxIndex; @@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat char strValue[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue)); if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); } - TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue); + TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue); return ncclSuccess; } @@ -661,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12; if (maxNvLinks > 0 && nvmlDev == NULL) { - WARN("No NVML device handle. Skipping nvlink detection.\n"); + WARN("No NVML device handle. Skipping nvlink detection."); maxNvLinks = 0; } diff --git a/projects/rccl/src/group.cc b/projects/rccl/src/group.cc index df9cd85ddd..3ab95c0533 100644 --- a/projects/rccl/src/group.cc +++ b/projects/rccl/src/group.cc @@ -202,7 +202,7 @@ ncclResult_t ncclGroupEnd() { if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) { int err = pthread_join(ncclGroupThreads[i], NULL); if (err != 0) { - WARN("Error waiting for pthread_join : %s\n", strerror(errno)); + WARN("Error waiting for pthread_join : %s", strerror(errno)); return ncclSystemError; } NCCLCHECKGOTO(args->ret, ret, end); @@ -234,7 +234,7 @@ ncclResult_t ncclGroupEnd() { // schedule delta 0, +1, -1, +2, -2, ... // also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even. for (int d=0; d<=nRanks/4; d++) { - int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) }; + int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks }; int index = 0; int delta = deltas[index]; sched_delta: @@ -259,6 +259,10 @@ sched_delta: ssize_t sendbytes = totSendBytes-sendOffset; if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; } if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; } + // 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested + // (total size == 0), otherwise set size to -1 so that the kernel skips the operation. + if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1; + if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1; if (sendbytes >= 0 || recvbytes >= 0) { NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId, recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL, diff --git a/projects/rccl/src/include/debug.h b/projects/rccl/src/include/debug.h index d88458c24a..e7a152cc97 100644 --- a/projects/rccl/src/include/debug.h +++ b/projects/rccl/src/include/debug.h @@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock; extern FILE *ncclDebugFile; extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim); -void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...); +void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6))); // Let code temporarily downgrade WARN into INFO extern thread_local int ncclDebugNoWarn; diff --git a/projects/rccl/src/include/graph.h b/projects/rccl/src/include/graph.h index 6ef9f4c196..ac7fcbf992 100644 --- a/projects/rccl/src/include/graph.h +++ b/projects/rccl/src/include/graph.h @@ -40,10 +40,12 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank); #define NCCL_TOPO_CPU_ARCH_ARM 3 #define NCCL_TOPO_CPU_VENDOR_INTEL 1 #define NCCL_TOPO_CPU_VENDOR_AMD 2 +#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3 #define NCCL_TOPO_CPU_TYPE_BDW 1 #define NCCL_TOPO_CPU_TYPE_SKL 2 #define NCCL_TOPO_CPU_TYPE_ZEN 3 #define NCCL_TOPO_CPU_TYPE_ROME 4 +#define NCCL_TOPO_CPU_TYPE_YONGFENG 1 ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model); ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count); diff --git a/projects/rccl/src/include/shm.h b/projects/rccl/src/include/shm.h index 85acdfe3dc..6a3e556c1e 100644 --- a/projects/rccl/src/include/shm.h +++ b/projects/rccl/src/include/shm.h @@ -46,7 +46,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt *shmPtr = ptr; return ncclSuccess; sysError: - WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize); + WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize); hipError: if (fd != -1) close(fd); if (create) shm_unlink(shmname); diff --git a/projects/rccl/src/include/socket.h b/projects/rccl/src/include/socket.h index e386598cdf..aaf137713e 100644 --- a/projects/rccl/src/include/socket.h +++ b/projects/rccl/src/include/socket.h @@ -372,7 +372,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) { /* IPv4/IPv6 support */ int family = remoteAddr->sa.sa_family; if (family != AF_INET && family != AF_INET6) { - WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6); + WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6); return ncclInternalError; } int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index b534264dee..76237e7b22 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -286,7 +286,7 @@ static ncclResult_t commFree(ncclComm_t comm) { #define VEGA_GPU_RTC_FREQUENCY 2.5E7 if (comm->rank == 0) { INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", " wait", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS"); - INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); + INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)"); } INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f", comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels, @@ -385,7 +385,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { comm->nRanks = comm->hostDevComm.nRanks = ndev; hipGetDevice(&comm->cudaDev); NCCLCHECK(getBusId(comm->cudaDev, &comm->busId)); - TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId); + TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId); comm->doneEvent = doneEvent; comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false; @@ -766,7 +766,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm for (int i = 0; i < nranks; i++) { memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo)); if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) { - WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId); + WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId); return ncclInvalidUsage; } } @@ -1163,7 +1163,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup); NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup); - INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); + INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId); return ncclSuccess; cleanup: @@ -1234,6 +1234,9 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) { static ncclResult_t commDestroy(ncclComm_t comm) { int savedDevice; +#ifdef ENABLE_TRACE + int rank = comm->rank; +#endif CUDACHECK(hipGetDevice(&savedDevice)); int commDevice = comm->cudaDev; @@ -1250,7 +1253,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) { if (savedDevice != commDevice) CUDACHECK(hipSetDevice(savedDevice)); - TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank); + TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank); return ncclSuccess; } @@ -1261,7 +1264,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) { if (comm == NULL) return ncclSuccess; - TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); + TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId); // Try and prevent a double free of the comm struct (user error) if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) { diff --git a/projects/rccl/src/misc/utils.cc b/projects/rccl/src/misc/utils.cc index a02b558b49..29b618196c 100644 --- a/projects/rccl/src/misc/utils.cc +++ b/projects/rccl/src/misc/utils.cc @@ -69,10 +69,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) { } uint64_t getHash(const char* string, int n) { - // Based on DJB2, result = result * 33 + char + // Based on DJB2a, result = result * 33 ^ char uint64_t result = 5381; for (int c = 0; c < n; c++) { - result = ((result << 5) + result) + string[c]; + result = ((result << 5) + result) ^ string[c]; } return result; } diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc index 2ba7aabfb4..29cf1fda39 100644 --- a/projects/rccl/src/proxy.cc +++ b/projects/rccl/src/proxy.cc @@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) { struct ncclProxyArgs* op = state->ops; while (op) { if (op->idle & OP_SEEN) { - WARN("Active list loop at element %ld\n", OP_INDEX(op)); + WARN("Active list loop at element %ld", OP_INDEX(op)); } op->idle |= OP_SEEN; printf("[%ld]", OP_INDEX(op)); @@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) { struct ncclProxyArgs* free = state->pool; while (free) { if (free->idle & OP_SEEN) { - WARN("Free list loop at element %ld\n", OP_INDEX(free)); + WARN("Free list loop at element %ld", OP_INDEX(free)); } free->idle |= OP_SEEN; free = free->next; @@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) { while (p) { for (int e=0; eelems[e].idle & OP_SEEN) == 0) { - WARN("Element %d of pool %d has been lost\n", e, i); + WARN("Element %d of pool %d has been lost", e, i); struct ncclProxyArgs* free = state->pool; printf("Free list "); while (free) { @@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) { struct ncclPeer* peerComm = args->channel->peers+peer; struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send; if (connector->transportComm == NULL) { - WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank, + WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank, type == proxyRecv ? "recv" : "send", peer, args->channel->id); return ncclInternalError; } @@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ while (nslots*state->slotSize < size) nslots *= 2; int s = (ptr-buff)/state->slotSize; if (s < 0 || s+nslots > state->nslots) { - WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots); + WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots); return ncclInternalError; } for (int i=0; idtype); NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot)); if (args->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); STORE(sizesFifo+buffSlot, -1); // Make sure size is reset to zero before we update the head. __sync_synchronize(); @@ -312,7 +312,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) { int buffSlot = args->done%NCCL_STEPS; NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size)); if (done) { - TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size); + TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size); STORE(&reqFifo[buffSlot].size, size); // Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush) // (reordered store after store is possible on POWER, though not on x86) @@ -357,7 +357,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff; int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize; STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize); - TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff); + TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff); args->posted += args->sliceSteps; args->idle = 0; return ncclSuccess; @@ -365,7 +365,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) { if (args->posted > args->received) { int buffSlot = args->received%NCCL_STEPS; if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete - TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size)); + TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size)); if (args->protocol == NCCL_PROTO_LL) { // ll // re-attach flag uint32_t flag = NCCL_LL_FLAG(args->received + 1); diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index be658ec4d6..4fe4423d43 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -362,7 +362,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { args->channel->sizes += LOAD(sizesFifo+buffSlot); args->channel->send_byte += LOAD(sizesFifo+buffSlot); #endif - TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]); STORE(sizesFifo+buffSlot, -1); // Make sure size is reset to zero before we update the head. __sync_synchronize(); @@ -379,7 +379,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) { int buffSlot = args->done%NCCL_STEPS; NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL)); if (done) { - TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]); + TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]); #ifdef ENABLE_PROFILING if (args->protocol == NCCL_PROTO_SIMPLE) { args->channel->active_req --; @@ -446,7 +446,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) { } NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot)); if (args->requests[buffSlot] != NULL) { - TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]); + TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]); #ifdef ENABLE_PROFILING if (args->protocol == NCCL_PROTO_SIMPLE) { if (args->channel->active_req == 0) { diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 50ffd567cb..228a3fd4f8 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -85,7 +85,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort) snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName); char* p = realpath(devicePath, NULL); if (p == NULL) { - WARN("Could not find real path of %s", *devicePath); + WARN("Could not find real path of %s (%s)", devName, devicePath); } else { // Merge multi-port NICs into the same PCI device p[strlen(p)-1] = '0'; diff --git a/projects/rccl/src/transport/net_socket.cc b/projects/rccl/src/transport/net_socket.cc index bbb3c8a58b..13e155c0fb 100644 --- a/projects/rccl/src/transport/net_socket.cc +++ b/projects/rccl/src/transport/net_socket.cc @@ -238,7 +238,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) { if (fd == -1) { // Could not find device vendor. This is handled silently so // we don't want to print an INFO error. - TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno)); + TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno)); goto end; } char vendor[7];