Merge remote-tracking branch 'nccl/master' into 2.8.3
이 커밋은 다음에 포함됨:
+9
-1
@@ -1,5 +1,5 @@
|
||||
|
||||
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
|
||||
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
@@ -30,3 +30,11 @@
|
||||
The U.S. Department of Energy funded the development of this software
|
||||
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
|
||||
|
||||
|
||||
This code also includes files from the NVIDIA Tools Extension SDK project.
|
||||
|
||||
See:
|
||||
|
||||
https://github.com/NVIDIA/NVTX
|
||||
|
||||
for more information and license details.
|
||||
|
||||
@@ -86,6 +86,6 @@ Please refer to the [Library documentation](http://rccl.readthedocs.io/) for cur
|
||||
|
||||
## Copyright
|
||||
|
||||
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
|
||||
All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
|
||||
|
||||
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
+1
-1
@@ -1,6 +1,6 @@
|
||||
##### version
|
||||
NCCL_MAJOR := 2
|
||||
NCCL_MINOR := 8
|
||||
NCCL_PATCH := 3
|
||||
NCCL_PATCH := 4
|
||||
NCCL_SUFFIX :=
|
||||
PKG_REVISION := 1
|
||||
|
||||
+3
-3
@@ -78,7 +78,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
|
||||
int recvSize;
|
||||
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
|
||||
if (recvSize > size) {
|
||||
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
|
||||
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
|
||||
return ncclInternalError;
|
||||
}
|
||||
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
|
||||
@@ -274,7 +274,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
|
||||
void* ncclRemoteMemAllocationService(void* args) {
|
||||
struct remAllocState* state = (struct remAllocState *) args;
|
||||
if (hipSetDevice(state->cudaDev) != hipSuccess) {
|
||||
WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
|
||||
WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
|
||||
}
|
||||
|
||||
// Prepare poll descriptor
|
||||
@@ -523,7 +523,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
|
||||
ncclResult_t bootstrapClose(void* commState) {
|
||||
struct extState* state = (struct extState*)commState;
|
||||
if (state->unexpectedConnections != NULL) {
|
||||
WARN("Unexpected connections are not empty.\n");
|
||||
WARN("Unexpected connections are not empty");
|
||||
return ncclInternalError;
|
||||
}
|
||||
close(state->extListenFd);
|
||||
|
||||
+1
-1
@@ -376,7 +376,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
|
||||
info->nstepsPerLoop = 1;
|
||||
info->nchunksPerLoop = info->comm->nRanks; break;
|
||||
default:
|
||||
WARN("Unknown pattern %d\n", info->pattern);
|
||||
WARN("Unknown pattern %d", info->pattern);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
|
||||
+3
-3
@@ -117,7 +117,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
|
||||
int x = 0;
|
||||
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
|
||||
if (x == NCCL_MAX_TREE_ARITY) {
|
||||
WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
|
||||
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
|
||||
return ncclInternalError;
|
||||
}
|
||||
tree->down[x] = indexes[d];
|
||||
@@ -215,7 +215,7 @@ int ncclMinNchannels() {
|
||||
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
|
||||
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
|
||||
if (minNchannels > MAXCHANNELS) {
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
|
||||
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
|
||||
minNchannels = MAXCHANNELS;
|
||||
}
|
||||
if (minNchannels < 0) minNchannels = 0;
|
||||
@@ -227,7 +227,7 @@ int ncclMaxNchannels() {
|
||||
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
|
||||
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
|
||||
if (maxNchannels < 1) {
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
|
||||
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
|
||||
maxNchannels = 1;
|
||||
}
|
||||
return maxNchannels;
|
||||
|
||||
+4
-1
@@ -26,7 +26,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find node of type %d id %lx\n", t, id);
|
||||
WARN("Could not find node of type %d id %lx", t, id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -282,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
|
||||
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
|
||||
else p2pLevel = PATH_SYS;
|
||||
}
|
||||
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
p2pLevel = PATH_PXB;
|
||||
}
|
||||
|
||||
compare:
|
||||
// Compute the PCI distance and compare with the p2pLevel.
|
||||
|
||||
+4
-4
@@ -58,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
|
||||
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -191,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find gpu rank %d\n", rank);
|
||||
WARN("Could not find gpu rank %d", rank);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -202,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
|
||||
return ncclSuccess;
|
||||
}
|
||||
}
|
||||
WARN("Could not find net id %lx\n", id);
|
||||
WARN("Could not find net id %lx", id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
|
||||
@@ -1253,7 +1253,7 @@ done:
|
||||
}
|
||||
|
||||
if (graph->nChannels == 0 && graph->collNet == 0) {
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
|
||||
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
|
||||
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
|
||||
graph->inter[0] = graph->inter[1] = 0;
|
||||
graph->speedIntra = graph->speedInter = 0.1;
|
||||
|
||||
+11
-3
@@ -82,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
|
||||
}
|
||||
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
|
||||
}
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -104,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
|
||||
|
||||
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
|
||||
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
|
||||
WARN("Error : tried to create too many nodes of type %d\n", type);
|
||||
WARN("Error : tried to create too many nodes of type %d", type);
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
|
||||
@@ -379,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
|
||||
}
|
||||
|
||||
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
|
||||
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
|
||||
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
|
||||
|
||||
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
|
||||
int numaId;
|
||||
@@ -402,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
|
||||
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
|
||||
int familyId, modelId;
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
|
||||
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
|
||||
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
|
||||
}
|
||||
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
|
||||
int familyId, modelId;
|
||||
@@ -486,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
|
||||
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
|
||||
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
|
||||
if (gpu == NULL) {
|
||||
WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
|
||||
WARN("Add NVLink error : could not find GPU %lx", pBusId);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int count;
|
||||
|
||||
@@ -20,6 +20,8 @@
|
||||
#define PCI_WIDTH 12.0 // PCI Gen3 x16
|
||||
#define QPI_WIDTH 6.0
|
||||
#define SKL_QPI_WIDTH 9.0
|
||||
#define ZPI_WIDTH 6.0
|
||||
#define YONGFENG_ZPI_WIDTH 9.0
|
||||
#define P9_WIDTH 32.0
|
||||
#define ARM_WIDTH 6.0
|
||||
#define NET_WIDTH 12.0 // 100Gbit
|
||||
|
||||
+6
-6
@@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
|
||||
if (c == '=') {
|
||||
ptr[o] = '\0';
|
||||
if (value == NULL) {
|
||||
WARN("XML Parse : Unexpected value with name %s\n", ptr);
|
||||
WARN("XML Parse : Unexpected value with name %s", ptr);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return xmlGetValue(file, value, last);
|
||||
@@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
|
||||
// Re-read the name, we got '/' in the first call
|
||||
NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
|
||||
if (c != '>') {
|
||||
WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
|
||||
WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
|
||||
return ncclInternalError;
|
||||
}
|
||||
return ncclSuccess;
|
||||
@@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
|
||||
while (c == ' ') {
|
||||
NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
|
||||
if (a == MAX_ATTR_COUNT) {
|
||||
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
|
||||
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
|
||||
// Actually we need to still consume the extra attributes so we have an extra one.
|
||||
} else a++;
|
||||
}
|
||||
@@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
|
||||
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
|
||||
while (1) {
|
||||
if (xml->maxIndex == MAX_NODES) {
|
||||
WARN("Error : XML parser is limited to 1024 nodes\n");
|
||||
WARN("Error : XML parser is limited to 1024 nodes");
|
||||
return ncclInternalError;
|
||||
}
|
||||
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
|
||||
@@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
|
||||
char strValue[MAX_STR_LEN];
|
||||
NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
|
||||
if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
|
||||
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
|
||||
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
|
||||
return ncclSuccess;
|
||||
}
|
||||
|
||||
@@ -661,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
|
||||
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
|
||||
|
||||
if (maxNvLinks > 0 && nvmlDev == NULL) {
|
||||
WARN("No NVML device handle. Skipping nvlink detection.\n");
|
||||
WARN("No NVML device handle. Skipping nvlink detection.");
|
||||
maxNvLinks = 0;
|
||||
}
|
||||
|
||||
|
||||
+6
-2
@@ -202,7 +202,7 @@ ncclResult_t ncclGroupEnd() {
|
||||
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
|
||||
int err = pthread_join(ncclGroupThreads[i], NULL);
|
||||
if (err != 0) {
|
||||
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
|
||||
WARN("Error waiting for pthread_join : %s", strerror(errno));
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECKGOTO(args->ret, ret, end);
|
||||
@@ -234,7 +234,7 @@ ncclResult_t ncclGroupEnd() {
|
||||
// schedule delta 0, +1, -1, +2, -2, ...
|
||||
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
|
||||
for (int d=0; d<=nRanks/4; d++) {
|
||||
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
|
||||
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
|
||||
int index = 0;
|
||||
int delta = deltas[index];
|
||||
sched_delta:
|
||||
@@ -259,6 +259,10 @@ sched_delta:
|
||||
ssize_t sendbytes = totSendBytes-sendOffset;
|
||||
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
|
||||
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
|
||||
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
|
||||
// (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
|
||||
if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
|
||||
if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
|
||||
if (sendbytes >= 0 || recvbytes >= 0) {
|
||||
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
|
||||
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
|
||||
|
||||
+1
-1
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
|
||||
extern FILE *ncclDebugFile;
|
||||
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
|
||||
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
|
||||
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
|
||||
|
||||
// Let code temporarily downgrade WARN into INFO
|
||||
extern thread_local int ncclDebugNoWarn;
|
||||
|
||||
@@ -40,10 +40,12 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
|
||||
#define NCCL_TOPO_CPU_ARCH_ARM 3
|
||||
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
|
||||
#define NCCL_TOPO_CPU_VENDOR_AMD 2
|
||||
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_BDW 1
|
||||
#define NCCL_TOPO_CPU_TYPE_SKL 2
|
||||
#define NCCL_TOPO_CPU_TYPE_ZEN 3
|
||||
#define NCCL_TOPO_CPU_TYPE_ROME 4
|
||||
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
|
||||
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
|
||||
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
|
||||
|
||||
|
||||
+1
-1
@@ -46,7 +46,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
|
||||
*shmPtr = ptr;
|
||||
return ncclSuccess;
|
||||
sysError:
|
||||
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
|
||||
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
|
||||
hipError:
|
||||
if (fd != -1) close(fd);
|
||||
if (create) shm_unlink(shmname);
|
||||
|
||||
+1
-1
@@ -372,7 +372,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
|
||||
/* IPv4/IPv6 support */
|
||||
int family = remoteAddr->sa.sa_family;
|
||||
if (family != AF_INET && family != AF_INET6) {
|
||||
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
|
||||
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
|
||||
return ncclInternalError;
|
||||
}
|
||||
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
|
||||
|
||||
+9
-6
@@ -286,7 +286,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
|
||||
if (comm->rank == 0) {
|
||||
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", " wait", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
|
||||
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
|
||||
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
|
||||
}
|
||||
INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
|
||||
comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
|
||||
@@ -385,7 +385,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
|
||||
comm->nRanks = comm->hostDevComm.nRanks = ndev;
|
||||
hipGetDevice(&comm->cudaDev);
|
||||
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
|
||||
|
||||
comm->doneEvent = doneEvent;
|
||||
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
|
||||
@@ -766,7 +766,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
|
||||
for (int i = 0; i < nranks; i++) {
|
||||
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
|
||||
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
|
||||
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
|
||||
return ncclInvalidUsage;
|
||||
}
|
||||
}
|
||||
@@ -1163,7 +1163,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
|
||||
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
|
||||
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
|
||||
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
|
||||
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
|
||||
|
||||
return ncclSuccess;
|
||||
cleanup:
|
||||
@@ -1234,6 +1234,9 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
|
||||
|
||||
static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
int savedDevice;
|
||||
#ifdef ENABLE_TRACE
|
||||
int rank = comm->rank;
|
||||
#endif
|
||||
CUDACHECK(hipGetDevice(&savedDevice));
|
||||
int commDevice = comm->cudaDev;
|
||||
|
||||
@@ -1250,7 +1253,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
|
||||
if (savedDevice != commDevice)
|
||||
CUDACHECK(hipSetDevice(savedDevice));
|
||||
|
||||
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
|
||||
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
|
||||
|
||||
return ncclSuccess;
|
||||
}
|
||||
@@ -1261,7 +1264,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
|
||||
if (comm == NULL)
|
||||
return ncclSuccess;
|
||||
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
||||
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
|
||||
|
||||
// Try and prevent a double free of the comm struct (user error)
|
||||
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
|
||||
|
||||
+2
-2
@@ -69,10 +69,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
|
||||
}
|
||||
|
||||
uint64_t getHash(const char* string, int n) {
|
||||
// Based on DJB2, result = result * 33 + char
|
||||
// Based on DJB2a, result = result * 33 ^ char
|
||||
uint64_t result = 5381;
|
||||
for (int c = 0; c < n; c++) {
|
||||
result = ((result << 5) + result) + string[c];
|
||||
result = ((result << 5) + result) ^ string[c];
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
+5
-5
@@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
|
||||
struct ncclProxyArgs* op = state->ops;
|
||||
while (op) {
|
||||
if (op->idle & OP_SEEN) {
|
||||
WARN("Active list loop at element %ld\n", OP_INDEX(op));
|
||||
WARN("Active list loop at element %ld", OP_INDEX(op));
|
||||
}
|
||||
op->idle |= OP_SEEN;
|
||||
printf("[%ld]", OP_INDEX(op));
|
||||
@@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
|
||||
struct ncclProxyArgs* free = state->pool;
|
||||
while (free) {
|
||||
if (free->idle & OP_SEEN) {
|
||||
WARN("Free list loop at element %ld\n", OP_INDEX(free));
|
||||
WARN("Free list loop at element %ld", OP_INDEX(free));
|
||||
}
|
||||
free->idle |= OP_SEEN;
|
||||
free = free->next;
|
||||
@@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
|
||||
while (p) {
|
||||
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
|
||||
if ((p->elems[e].idle & OP_SEEN) == 0) {
|
||||
WARN("Element %d of pool %d has been lost\n", e, i);
|
||||
WARN("Element %d of pool %d has been lost", e, i);
|
||||
struct ncclProxyArgs* free = state->pool;
|
||||
printf("Free list ");
|
||||
while (free) {
|
||||
@@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
|
||||
struct ncclPeer* peerComm = args->channel->peers+peer;
|
||||
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
|
||||
if (connector->transportComm == NULL) {
|
||||
WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
|
||||
WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
|
||||
type == proxyRecv ? "recv" : "send", peer, args->channel->id);
|
||||
return ncclInternalError;
|
||||
}
|
||||
@@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
|
||||
while (nslots*state->slotSize < size) nslots *= 2;
|
||||
int s = (ptr-buff)/state->slotSize;
|
||||
if (s < 0 || s+nslots > state->nslots) {
|
||||
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
|
||||
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
|
||||
return ncclInternalError;
|
||||
}
|
||||
for (int i=0; i<nslots; i++) used[s+i] = 0;
|
||||
|
||||
@@ -295,7 +295,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
int count = size/ncclTypeSize(args->dtype);
|
||||
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
@@ -312,7 +312,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
|
||||
int buffSlot = args->done%NCCL_STEPS;
|
||||
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
|
||||
STORE(&reqFifo[buffSlot].size, size);
|
||||
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
|
||||
// (reordered store after store is possible on POWER, though not on x86)
|
||||
@@ -357,7 +357,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
|
||||
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
|
||||
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
|
||||
STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
|
||||
args->posted += args->sliceSteps;
|
||||
args->idle = 0;
|
||||
return ncclSuccess;
|
||||
@@ -365,7 +365,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
|
||||
if (args->posted > args->received) {
|
||||
int buffSlot = args->received%NCCL_STEPS;
|
||||
if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
|
||||
if (args->protocol == NCCL_PROTO_LL) { // ll
|
||||
// re-attach flag
|
||||
uint32_t flag = NCCL_LL_FLAG(args->received + 1);
|
||||
|
||||
+3
-3
@@ -362,7 +362,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
args->channel->sizes += LOAD(sizesFifo+buffSlot);
|
||||
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
|
||||
#endif
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
|
||||
STORE(sizesFifo+buffSlot, -1);
|
||||
// Make sure size is reset to zero before we update the head.
|
||||
__sync_synchronize();
|
||||
@@ -379,7 +379,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
|
||||
int buffSlot = args->done%NCCL_STEPS;
|
||||
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
|
||||
if (done) {
|
||||
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]);
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
args->channel->active_req --;
|
||||
@@ -446,7 +446,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
|
||||
}
|
||||
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
|
||||
if (args->requests[buffSlot] != NULL) {
|
||||
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
|
||||
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
|
||||
#ifdef ENABLE_PROFILING
|
||||
if (args->protocol == NCCL_PROTO_SIMPLE) {
|
||||
if (args->channel->active_req == 0) {
|
||||
|
||||
+1
-1
@@ -85,7 +85,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
|
||||
snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
|
||||
char* p = realpath(devicePath, NULL);
|
||||
if (p == NULL) {
|
||||
WARN("Could not find real path of %s", *devicePath);
|
||||
WARN("Could not find real path of %s (%s)", devName, devicePath);
|
||||
} else {
|
||||
// Merge multi-port NICs into the same PCI device
|
||||
p[strlen(p)-1] = '0';
|
||||
|
||||
@@ -238,7 +238,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
|
||||
if (fd == -1) {
|
||||
// Could not find device vendor. This is handled silently so
|
||||
// we don't want to print an INFO error.
|
||||
TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
|
||||
TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
|
||||
goto end;
|
||||
}
|
||||
char vendor[7];
|
||||
|
||||
새 이슈에서 참조
사용자 차단