Merge remote-tracking branch 'nccl/master' into 2.8.3

Bu işleme şunda yer alıyor:
Wenkai Du
2021-02-15 19:29:45 +00:00
işleme c985358e11
23 değiştirilmiş dosya ile 82 ekleme ve 52 silme
+9 -1
Dosyayı Görüntüle
@@ -1,5 +1,5 @@
Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -30,3 +30,11 @@
The U.S. Department of Energy funded the development of this software
under subcontract 7078610 with Lawrence Berkeley National Laboratory.
This code also includes files from the NVIDIA Tools Extension SDK project.
See:
https://github.com/NVIDIA/NVTX
for more information and license details.
+2 -2
Dosyayı Görüntüle
@@ -86,6 +86,6 @@ Please refer to the [Library documentation](http://rccl.readthedocs.io/) for cur
## Copyright
All source code and accompanying documentation is copyright (c) 2015-2018, NVIDIA CORPORATION. All rights reserved.
All source code and accompanying documentation is copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
All modifications are copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
+1 -1
Dosyayı Görüntüle
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 8
NCCL_PATCH := 3
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
+3 -3
Dosyayı Görüntüle
@@ -78,7 +78,7 @@ static ncclResult_t bootstrapNetRecv(int fd, void* data, int size) {
int recvSize;
NCCLCHECK(socketRecv(fd, &recvSize, sizeof(int)));
if (recvSize > size) {
WARN("Message truncated : received %d bytes instead of %d\n", recvSize, size);
WARN("Message truncated : received %d bytes instead of %d", recvSize, size);
return ncclInternalError;
}
NCCLCHECK(socketRecv(fd, data, std::min(recvSize, size)));
@@ -274,7 +274,7 @@ static ncclResult_t remoteAlloc(void** ptr, int fd) {
void* ncclRemoteMemAllocationService(void* args) {
struct remAllocState* state = (struct remAllocState *) args;
if (hipSetDevice(state->cudaDev) != hipSuccess) {
WARN("[Rem Allocator] Failed to set CUDA device %d\n", state->cudaDev);
WARN("[Rem Allocator] Failed to set CUDA device %d", state->cudaDev);
}
// Prepare poll descriptor
@@ -523,7 +523,7 @@ ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size) {
ncclResult_t bootstrapClose(void* commState) {
struct extState* state = (struct extState*)commState;
if (state->unexpectedConnections != NULL) {
WARN("Unexpected connections are not empty.\n");
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
close(state->extListenFd);
+1 -1
Dosyayı Görüntüle
@@ -376,7 +376,7 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
info->nstepsPerLoop = 1;
info->nchunksPerLoop = info->comm->nRanks; break;
default:
WARN("Unknown pattern %d\n", info->pattern);
WARN("Unknown pattern %d", info->pattern);
return ncclInternalError;
}
return ncclSuccess;
+3 -3
Dosyayı Görüntüle
@@ -117,7 +117,7 @@ static ncclResult_t setTreeDown(struct ncclTree* tree, int* indexes, int d) {
int x = 0;
while (x < NCCL_MAX_TREE_ARITY && tree->down[x] >= 0) x++;
if (x == NCCL_MAX_TREE_ARITY) {
WARN("Internal error : tree already has %d children (%d %d %d)\n", x, tree->down[0], tree->down[1], tree->down[2]);
WARN("Internal error : tree already has %d children (%d %d %d)", x, tree->down[0], tree->down[1], tree->down[2]);
return ncclInternalError;
}
tree->down[x] = indexes[d];
@@ -215,7 +215,7 @@ int ncclMinNchannels() {
if (ncclParamMinNrings() != -2) minNchannels = ncclParamMinNrings();
if (ncclParamMinNchannels() != -2) minNchannels = ncclParamMinNchannels();
if (minNchannels > MAXCHANNELS) {
WARN("User asked for a minimum of %d channels, limiting to %d\n", minNchannels, MAXCHANNELS);
WARN("User asked for a minimum of %d channels, limiting to %d", minNchannels, MAXCHANNELS);
minNchannels = MAXCHANNELS;
}
if (minNchannels < 0) minNchannels = 0;
@@ -227,7 +227,7 @@ int ncclMaxNchannels() {
if (ncclParamMaxNchannels() != -2) maxNchannels = ncclParamMaxNchannels();
if (maxNchannels > MAXCHANNELS) maxNchannels = MAXCHANNELS;
if (maxNchannels < 1) {
WARN("User asked for a maximum of %d channels, setting it to 1\n", maxNchannels);
WARN("User asked for a maximum of %d channels, setting it to 1", maxNchannels);
maxNchannels = 1;
}
return maxNchannels;
+4 -1
Dosyayı Görüntüle
@@ -26,7 +26,7 @@ static ncclResult_t getPath(struct ncclTopoSystem* system, struct ncclTopoNode*
return ncclSuccess;
}
}
WARN("Could not find node of type %d id %lx\n", t, id);
WARN("Could not find node of type %d id %lx", t, id);
return ncclInternalError;
}
@@ -282,6 +282,9 @@ ncclResult_t ncclTopoCheckP2p(struct ncclTopoSystem* system, int64_t id1, int64_
if (model == NCCL_TOPO_CPU_TYPE_BDW) p2pLevel = PATH_PXB;
else p2pLevel = PATH_SYS;
}
if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
p2pLevel = PATH_PXB;
}
compare:
// Compute the PCI distance and compare with the p2pLevel.
+4 -4
Dosyayı Görüntüle
@@ -58,7 +58,7 @@ static ncclResult_t findRevLink(struct ncclTopoNode* node1, struct ncclTopoNode*
return ncclSuccess;
}
}
WARN("Could not find rev link for %d/%d -> %d/%d\n", node1->type, node1->id, node2->type, node2->id);
WARN("Could not find rev link for %d/%ld -> %d/%ld", node1->type, node1->id, node2->type, node2->id);
return ncclInternalError;
}
@@ -191,7 +191,7 @@ static ncclResult_t getGpuIndex(struct ncclTopoSystem* system, int rank, int* in
return ncclSuccess;
}
}
WARN("Could not find gpu rank %d\n", rank);
WARN("Could not find gpu rank %d", rank);
return ncclInternalError;
}
@@ -202,7 +202,7 @@ static ncclResult_t getNetIndex(struct ncclTopoSystem* system, int64_t id, int*
return ncclSuccess;
}
}
WARN("Could not find net id %lx\n", id);
WARN("Could not find net id %lx", id);
return ncclInternalError;
}
@@ -1253,7 +1253,7 @@ done:
}
if (graph->nChannels == 0 && graph->collNet == 0) {
WARN("Could not find a path for pattern %d, falling back to simple order\n", graph->pattern);
WARN("Could not find a path for pattern %d, falling back to simple order", graph->pattern);
for (int i=0; i<ngpus; i++) graph->intra[i] = system->nodes[GPU].nodes[i].gpu.rank;
graph->inter[0] = graph->inter[1] = 0;
graph->speedIntra = graph->speedInter = 0.1;
+11 -3
Dosyayı Görüntüle
@@ -82,6 +82,9 @@ static ncclResult_t ncclTopoGetInterCpuWidth(struct ncclTopoNode* cpu, float* wi
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_INTEL) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_SKL ? SKL_QPI_WIDTH : QPI_WIDTH;
}
if (cpu->cpu.arch == NCCL_TOPO_CPU_ARCH_X86 && cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
*width = cpu->cpu.model == NCCL_TOPO_CPU_TYPE_YONGFENG ? YONGFENG_ZPI_WIDTH : ZPI_WIDTH;
}
return ncclSuccess;
}
@@ -104,7 +107,7 @@ ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode*
ncclResult_t ncclTopoCreateNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id) {
if (system->nodes[type].count == NCCL_TOPO_MAX_NODES) {
WARN("Error : tried to create too many nodes of type %d\n", type);
WARN("Error : tried to create too many nodes of type %d", type);
return ncclInternalError;
}
struct ncclTopoNode* n = system->nodes[type].nodes+system->nodes[type].count;
@@ -379,7 +382,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
}
struct kvDict kvDictCpuArch[] = { { "x86_64", NCCL_TOPO_CPU_ARCH_X86 }, { "arm64", NCCL_TOPO_CPU_ARCH_ARM }, { "ppc64", NCCL_TOPO_CPU_ARCH_POWER }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { NULL, 0 } };
struct kvDict kvDictCpuVendor[] = { { "GenuineIntel", NCCL_TOPO_CPU_VENDOR_INTEL }, { "AuthenticAMD", NCCL_TOPO_CPU_VENDOR_AMD }, { "CentaurHauls", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { " Shanghai ", NCCL_TOPO_CPU_VENDOR_ZHAOXIN }, { NULL, 0 } };
ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* system) {
int numaId;
@@ -402,6 +405,11 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
cpu->cpu.model = (familyId == 6 && modelId >= 0x55) ? NCCL_TOPO_CPU_TYPE_SKL : NCCL_TOPO_CPU_INTEL_BDW;
} else if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_ZHAOXIN) {
int familyId, modelId;
NCCLCHECK(xmlGetAttrInt(xmlCpu, "familyid", &familyId));
NCCLCHECK(xmlGetAttrInt(xmlCpu, "modelid", &modelId));
if (familyId == 7 && modelId == 0x5B) cpu->cpu.model = NCCL_TOPO_CPU_TYPE_YONGFENG;
}
if (cpu->cpu.vendor == NCCL_TOPO_CPU_VENDOR_AMD) {
int familyId, modelId;
@@ -486,7 +494,7 @@ ncclResult_t ncclTopoAddNvLinks(struct ncclXmlNode* node, struct ncclTopoSystem*
NCCLCHECK(busIdToInt64(parentBusId, &pBusId));
NCCLCHECK(ncclTopoGetNode(system, &gpu, GPU, pBusId));
if (gpu == NULL) {
WARN("Add NVLink error : could not find GPU %lx\n", pBusId);
WARN("Add NVLink error : could not find GPU %lx", pBusId);
return ncclInternalError;
}
int count;
+2
Dosyayı Görüntüle
@@ -20,6 +20,8 @@
#define PCI_WIDTH 12.0 // PCI Gen3 x16
#define QPI_WIDTH 6.0
#define SKL_QPI_WIDTH 9.0
#define ZPI_WIDTH 6.0
#define YONGFENG_ZPI_WIDTH 9.0
#define P9_WIDTH 32.0
#define ARM_WIDTH 6.0
#define NET_WIDTH 12.0 // 100Gbit
+6 -6
Dosyayı Görüntüle
@@ -71,7 +71,7 @@ ncclResult_t xmlGetToken(FILE* file, char* name, char* value, char* last) {
if (c == '=') {
ptr[o] = '\0';
if (value == NULL) {
WARN("XML Parse : Unexpected value with name %s\n", ptr);
WARN("XML Parse : Unexpected value with name %s", ptr);
return ncclInternalError;
}
return xmlGetValue(file, value, last);
@@ -137,7 +137,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
// Re-read the name, we got '/' in the first call
NCCLCHECK(xmlGetToken(file, node->name, NULL, &c));
if (c != '>') {
WARN("XML Parse error : unexpected trailing %c in closing tag %s\n", c, node->name);
WARN("XML Parse error : unexpected trailing %c in closing tag %s", c, node->name);
return ncclInternalError;
}
return ncclSuccess;
@@ -150,7 +150,7 @@ ncclResult_t xmlGetNode(FILE* file, struct ncclXmlNode* node) {
while (c == ' ') {
NCCLCHECK(xmlGetToken(file, node->attrs[a].key, node->attrs[a].value, &c));
if (a == MAX_ATTR_COUNT) {
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)\n", MAX_ATTR_COUNT);
INFO(NCCL_GRAPH, "XML Parse : Ignoring extra attributes (max %d)", MAX_ATTR_COUNT);
// Actually we need to still consume the extra attributes so we have an extra one.
} else a++;
}
@@ -178,7 +178,7 @@ ncclResult_t xmlLoadSub(FILE* file, struct ncclXml* xml, struct ncclXmlNode* hea
if (head && head->type == NODE_TYPE_SINGLE) return ncclSuccess;
while (1) {
if (xml->maxIndex == MAX_NODES) {
WARN("Error : XML parser is limited to 1024 nodes\n");
WARN("Error : XML parser is limited to 1024 nodes");
return ncclInternalError;
}
struct ncclXmlNode* node = xml->nodes+xml->maxIndex;
@@ -373,7 +373,7 @@ ncclResult_t ncclTopoSetAttrFromSys(struct ncclXmlNode* pciNode, const char* pat
char strValue[MAX_STR_LEN];
NCCLCHECK(ncclTopoGetStrFromSys(path, fileName, strValue));
if (strValue[0] != '\0') { NCCLCHECK(xmlSetAttr(pciNode, attrName, strValue)); }
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s\n", path, fileName, attrName, strValue);
TRACE(NCCL_GRAPH, "Read from sys %s/%s -> %s=%s", path, fileName, attrName, strValue);
return ncclSuccess;
}
@@ -661,7 +661,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, nvmlDevice_t nvm
int maxNvLinks = (sm < 60) ? 0 : (sm < 70) ? 4 : (sm < 80) ? 6 : 12;
if (maxNvLinks > 0 && nvmlDev == NULL) {
WARN("No NVML device handle. Skipping nvlink detection.\n");
WARN("No NVML device handle. Skipping nvlink detection.");
maxNvLinks = 0;
}
+6 -2
Dosyayı Görüntüle
@@ -202,7 +202,7 @@ ncclResult_t ncclGroupEnd() {
if (args->funcType == ASYNC_FUNC_COLL && args->coll.comm->connect) {
int err = pthread_join(ncclGroupThreads[i], NULL);
if (err != 0) {
WARN("Error waiting for pthread_join : %s\n", strerror(errno));
WARN("Error waiting for pthread_join : %s", strerror(errno));
return ncclSystemError;
}
NCCLCHECKGOTO(args->ret, ret, end);
@@ -234,7 +234,7 @@ ncclResult_t ncclGroupEnd() {
// schedule delta 0, +1, -1, +2, -2, ...
// also make sure we don't do 0 twice, nor +n/2 and -n/2 if n is even.
for (int d=0; d<=nRanks/4; d++) {
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, nRanks-(nRanks/2-d) };
int deltas[4] = { d, (nRanks-d)%nRanks, nRanks/2-d, (nRanks-(nRanks/2-d))%nRanks };
int index = 0;
int delta = deltas[index];
sched_delta:
@@ -259,6 +259,10 @@ sched_delta:
ssize_t sendbytes = totSendBytes-sendOffset;
if (recvbytes > recvChunkSize) { recvbytes = recvChunkSize; } else { recvRemaining = 0; }
if (sendbytes > sendChunkSize) { sendbytes = sendChunkSize; } else { sendRemaining = 0; }
// 0-bytes send/recv are considered as syncs. Make sure we only add syncs when requested
// (total size == 0), otherwise set size to -1 so that the kernel skips the operation.
if (sendbytes == 0 && totSendBytes != 0) sendbytes = -1;
if (recvbytes == 0 && totRecvBytes != 0) recvbytes = -1;
if (sendbytes >= 0 || recvbytes >= 0) {
NCCLCHECKGOTO(scheduleSendRecv(comm, delta, channelId,
recvbytes, recv ? ((char*)(recv->buff)) + recvOffset : NULL,
+1 -1
Dosyayı Görüntüle
@@ -25,7 +25,7 @@ extern pthread_mutex_t ncclDebugOutputLock;
extern FILE *ncclDebugFile;
extern ncclResult_t getHostName(char* hostname, int maxlen, const char delim);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...);
void ncclDebugLog(ncclDebugLogLevel level, unsigned long flags, const char *filefunc, int line, const char *fmt, ...) __attribute__ ((format (printf, 5, 6)));
// Let code temporarily downgrade WARN into INFO
extern thread_local int ncclDebugNoWarn;
+2
Dosyayı Görüntüle
@@ -40,10 +40,12 @@ ncclResult_t ncclTopoSetAffinity(struct ncclTopoSystem* system, int rank);
#define NCCL_TOPO_CPU_ARCH_ARM 3
#define NCCL_TOPO_CPU_VENDOR_INTEL 1
#define NCCL_TOPO_CPU_VENDOR_AMD 2
#define NCCL_TOPO_CPU_VENDOR_ZHAOXIN 3
#define NCCL_TOPO_CPU_TYPE_BDW 1
#define NCCL_TOPO_CPU_TYPE_SKL 2
#define NCCL_TOPO_CPU_TYPE_ZEN 3
#define NCCL_TOPO_CPU_TYPE_ROME 4
#define NCCL_TOPO_CPU_TYPE_YONGFENG 1
ncclResult_t ncclTopoCpuType(struct ncclTopoSystem* system, int* arch, int* vendor, int* model);
ncclResult_t ncclTopoGetNetCount(struct ncclTopoSystem* system, int* count);
+1 -1
Dosyayı Görüntüle
@@ -46,7 +46,7 @@ static ncclResult_t shmOpen(const char* shmname, const int shmsize, void** shmPt
*shmPtr = ptr;
return ncclSuccess;
sysError:
WARN("Error while %s shared memory segment %s (size %d)\n", create ? "creating" : "attaching to", shmname, shmsize);
WARN("Error while %s shared memory segment %s (size %d)", create ? "creating" : "attaching to", shmname, shmsize);
hipError:
if (fd != -1) close(fd);
if (create) shm_unlink(shmname);
+1 -1
Dosyayı Görüntüle
@@ -372,7 +372,7 @@ static ncclResult_t connectAddress(int* fd, union socketAddress* remoteAddr) {
/* IPv4/IPv6 support */
int family = remoteAddr->sa.sa_family;
if (family != AF_INET && family != AF_INET6) {
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n", family, AF_INET, AF_INET6);
WARN("Error : connecting to address with family %d is neither AF_INET(%d) nor AF_INET6(%d)", family, AF_INET, AF_INET6);
return ncclInternalError;
}
int salen = (family == AF_INET) ? sizeof(sockaddr_in) : sizeof(sockaddr_in6);
+9 -6
Dosyayı Görüntüle
@@ -286,7 +286,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
if (comm->rank == 0) {
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "Rank", "total", " wait", "w_recv", "send", "rcRdS", "dRcRdCS", "dRcCS", "dRc", "cS", "rc", "rcCS");
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
INFO(NCCL_INIT, "# %4s %6s %6s %6s %6s %6s %7s %6s %6s %6s %6s %6s", "", "(s)", "(s)", "(s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)", "(GB/s)");
}
INFO(NCCL_INIT, "# %4d %6.4f %6.4f %6.4f %6.2f %6.2f %7.2f %6.2f %6.2f %6.2f %6.2f %6.2f",
comm->rank, (double)prof->total_cycle/VEGA_GPU_RTC_FREQUENCY/comm->nChannels,
@@ -385,7 +385,7 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
comm->nRanks = comm->hostDevComm.nRanks = ndev;
hipGetDevice(&comm->cudaDev);
NCCLCHECK(getBusId(comm->cudaDev, &comm->busId));
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x", comm, rank, ndev, comm->cudaDev, comm->busId);
TRACE(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx", comm, rank, ndev, comm->cudaDev, comm->busId);
comm->doneEvent = doneEvent;
comm->checkPointers = ncclParamCheckPointers() == 1 ? true : false;
@@ -766,7 +766,7 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
for (int i = 0; i < nranks; i++) {
memcpy(comm->peerInfo+i, &allGather1Data[i].peerInfo, sizeof(struct ncclPeerInfo));
if ((i != rank) && (comm->peerInfo[i].hostHash == myInfo->hostHash) && (comm->peerInfo[i].busId == myInfo->busId)) {
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %x", rank, i, myInfo->busId);
WARN("Duplicate GPU detected : rank %d and rank %d both on CUDA device %lx", rank, i, myInfo->busId);
return ncclInvalidUsage;
}
}
@@ -1163,7 +1163,7 @@ ncclResult_t ncclCommInitRankSync(ncclComm_t* newcomm, int nranks, ncclUniqueId
NCCLCHECKGOTO(initTransportsRank(*newcomm, &commId), res, cleanup);
NCCLCHECKGOTO(devCommSetup(*newcomm), res, cleanup);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %x - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
INFO(NCCL_INIT,"comm %p rank %d nranks %d cudaDev %d busId %lx - Init COMPLETE", *newcomm, myrank, nranks, (*newcomm)->cudaDev, (*newcomm)->busId);
return ncclSuccess;
cleanup:
@@ -1234,6 +1234,9 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
static ncclResult_t commDestroy(ncclComm_t comm) {
int savedDevice;
#ifdef ENABLE_TRACE
int rank = comm->rank;
#endif
CUDACHECK(hipGetDevice(&savedDevice));
int commDevice = comm->cudaDev;
@@ -1250,7 +1253,7 @@ static ncclResult_t commDestroy(ncclComm_t comm) {
if (savedDevice != commDevice)
CUDACHECK(hipSetDevice(savedDevice));
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, comm->rank);
TRACE(NCCL_INIT, "Destroyed comm %p rank %d", comm, rank);
return ncclSuccess;
}
@@ -1261,7 +1264,7 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
if (comm == NULL)
return ncclSuccess;
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %x", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
TRACE(NCCL_INIT, "comm %p rank %d nRanks %d cudaDev %d busId %lx", comm, comm->rank, comm->nRanks, comm->cudaDev, comm->busId);
// Try and prevent a double free of the comm struct (user error)
if (comm->rank == -1 || comm->nRanks <= 0 || comm->cudaDev == -1 || comm->busId == -1) {
+2 -2
Dosyayı Görüntüle
@@ -69,10 +69,10 @@ ncclResult_t getHostName(char* hostname, int maxlen, const char delim) {
}
uint64_t getHash(const char* string, int n) {
// Based on DJB2, result = result * 33 + char
// Based on DJB2a, result = result * 33 ^ char
uint64_t result = 5381;
for (int c = 0; c < n; c++) {
result = ((result << 5) + result) + string[c];
result = ((result << 5) + result) ^ string[c];
}
return result;
}
+5 -5
Dosyayı Görüntüle
@@ -72,7 +72,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
struct ncclProxyArgs* op = state->ops;
while (op) {
if (op->idle & OP_SEEN) {
WARN("Active list loop at element %ld\n", OP_INDEX(op));
WARN("Active list loop at element %ld", OP_INDEX(op));
}
op->idle |= OP_SEEN;
printf("[%ld]", OP_INDEX(op));
@@ -98,7 +98,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
struct ncclProxyArgs* free = state->pool;
while (free) {
if (free->idle & OP_SEEN) {
WARN("Free list loop at element %ld\n", OP_INDEX(free));
WARN("Free list loop at element %ld", OP_INDEX(free));
}
free->idle |= OP_SEEN;
free = free->next;
@@ -109,7 +109,7 @@ ncclResult_t dumpProxyState(struct ncclProxyState* state) {
while (p) {
for (int e=0; e<PROXYARGS_ALLOCATE_SIZE; e++) {
if ((p->elems[e].idle & OP_SEEN) == 0) {
WARN("Element %d of pool %d has been lost\n", e, i);
WARN("Element %d of pool %d has been lost", e, i);
struct ncclProxyArgs* free = state->pool;
printf("Free list ");
while (free) {
@@ -164,7 +164,7 @@ static ncclResult_t SaveProxy(int type, int peer, struct ncclProxyArgs* args) {
struct ncclPeer* peerComm = args->channel->peers+peer;
struct ncclConnector* connector = type == proxyRecv ? &peerComm->recv : &peerComm->send;
if (connector->transportComm == NULL) {
WARN("[%d] Error no transport for %s peer %d on channel %d\n", connector->comm->rank,
WARN("[%d] Error no transport for %s peer %d on channel %d", connector->comm->rank,
type == proxyRecv ? "recv" : "send", peer, args->channel->id);
return ncclInternalError;
}
@@ -480,7 +480,7 @@ ncclResult_t ncclProxySharedBuffersFree(struct ncclComm* comm, int cuda, int typ
while (nslots*state->slotSize < size) nslots *= 2;
int s = (ptr-buff)/state->slotSize;
if (s < 0 || s+nslots > state->nslots) {
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)\n", ptr, size, buff, state->slotSize, state->nslots);
WARN("Error freeing shared buffer : freeing ptr %p size %d (start %p slot size %d nslots %d)", ptr, size, buff, state->slotSize, state->nslots);
return ncclInternalError;
}
for (int i=0; i<nslots; i++) used[s+i] = 0;
+4 -4
Dosyayı Görüntüle
@@ -295,7 +295,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int count = size/ncclTypeSize(args->dtype);
NCCLCHECK(collNetIallreduce(resources->collNetSendComm, (void*) buff, (void*)(reqFifo[buffSlot].recvBuff), count, args->dtype, args->redOp, sendMhandle, recvMhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "sendProxy [%d/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
TRACE(NCCL_NET, "sendProxy [%lu/%d] Iallreduce posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
@@ -312,7 +312,7 @@ ncclResult_t collNetSendProxy(struct ncclProxyArgs* args) {
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(collNetTest((void*)(args->requests[buffSlot]), &done, &size));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot], size);
STORE(&reqFifo[buffSlot].size, size);
// Make sure size is updated before we set recvBuff to NULL (from the view of recv proxy, concerning the flush)
// (reordered store after store is possible on POWER, though not on x86)
@@ -357,7 +357,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
char* recvBuff = p == NCCL_PROTO_LL ? (char*)resources->llData : localBuff;
int recvStepSize = p == NCCL_PROTO_LL ? stepSize/2 : stepSize;
STORE(&reqFifo[buffSlot].recvBuff, recvBuff+buffSlot*recvStepSize);
TRACE(NCCL_NET, "recvProxy [%d/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted buffer %p", args->posted, buffSlot, reqFifo[buffSlot].recvBuff);
args->posted += args->sliceSteps;
args->idle = 0;
return ncclSuccess;
@@ -365,7 +365,7 @@ ncclResult_t collNetRecvProxy(struct ncclProxyArgs* args) {
if (args->posted > args->received) {
int buffSlot = args->received%NCCL_STEPS;
if (LOAD(&reqFifo[buffSlot].recvBuff) == NULL) { // Buffer is cleared : coll is complete
TRACE(NCCL_NET, "recvProxy [%d/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
TRACE(NCCL_NET, "recvProxy [%lu/%d] done, size %d", args->received, buffSlot, LOAD(&reqFifo[buffSlot].size));
if (args->protocol == NCCL_PROTO_LL) { // ll
// re-attach flag
uint32_t flag = NCCL_LL_FLAG(args->received + 1);
+3 -3
Dosyayı Görüntüle
@@ -362,7 +362,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
args->channel->sizes += LOAD(sizesFifo+buffSlot);
args->channel->send_byte += LOAD(sizesFifo+buffSlot);
#endif
TRACE(NCCL_NET, "sendProxy [%d/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
TRACE(NCCL_NET, "sendProxy [%lu/%d] Isend (LL) posted, req %p", args->transmitted, buffSlot, args->requests[buffSlot]);
STORE(sizesFifo+buffSlot, -1);
// Make sure size is reset to -1 before we update the head.
__sync_synchronize();
@@ -379,7 +379,7 @@ ncclResult_t netSendProxy(struct ncclProxyArgs* args) {
int buffSlot = args->done%NCCL_STEPS;
NCCLCHECK(ncclNetTest(args->requests[buffSlot], &done, NULL));
if (done) {
TRACE(NCCL_NET, "sendProxy [%d/%d] request %p done, size %d", args->done, buffSlot, args->requests[buffSlot]);
TRACE(NCCL_NET, "sendProxy [%lu/%d] request %p done", args->done, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
args->channel->active_req --;
@@ -446,7 +446,7 @@ ncclResult_t netRecvProxy(struct ncclProxyArgs* args) {
}
NCCLCHECK(ncclNetIrecv(resources->netRecvComm, ptr, buffSize, mhandle, args->requests+buffSlot));
if (args->requests[buffSlot] != NULL) {
TRACE(NCCL_NET, "recvProxy [%d/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
TRACE(NCCL_NET, "recvProxy [%lu/%d] posted recv request %p", args->posted, buffSlot, args->requests[buffSlot]);
#ifdef ENABLE_PROFILING
if (args->protocol == NCCL_PROTO_SIMPLE) {
if (args->channel->active_req == 0) {
+1 -1
Dosyayı Görüntüle
@@ -85,7 +85,7 @@ static ncclResult_t ncclIbGetPciPath(char* devName, char** path, int* realPort)
snprintf(devicePath, PATH_MAX, "/sys/class/infiniband/%s/device", devName);
char* p = realpath(devicePath, NULL);
if (p == NULL) {
WARN("Could not find real path of %s", *devicePath);
WARN("Could not find real path of %s (%s)", devName, devicePath);
} else {
// Merge multi-port NICs into the same PCI device
p[strlen(p)-1] = '0';
+1 -1
Dosyayı Görüntüle
@@ -238,7 +238,7 @@ ncclResult_t ncclSocketGetNsockNthread(int dev, int* ns, int* nt) {
if (fd == -1) {
// Could not find device vendor. This is handled silently so
// we don't want to print an INFO error.
TRACE(NCCL_NET, "Open of %s failed : %s\n", vendorPath, strerror(errno));
TRACE(NCCL_NET, "Open of %s failed : %s", vendorPath, strerror(errno));
goto end;
}
char vendor[7];