diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt index 2cac2be33d..b3d770994e 100644 --- a/projects/rccl/CMakeLists.txt +++ b/projects/rccl/CMakeLists.txt @@ -125,8 +125,7 @@ set(CMAKE_CXX_EXTENSIONS OFF) # Without this line, it will add -std=gnu++14 ins list(APPEND CMAKE_PREFIX_PATH # Add ROCM_PATH to CMake search paths (for finding HIP / HSA ${ROCM_PATH} ${ROCM_PATH}/hip - ${ROCM_PATH}/llvm - ${ROCM_PATH}/hcc) + ${ROCM_PATH}/llvm) # Check for required dependencies #================================================================================================== diff --git a/projects/rccl/src/device/msccl_kernel_impl.h b/projects/rccl/src/device/msccl_kernel_impl.h index e77223eba9..fdcc0502d2 100644 --- a/projects/rccl/src/device/msccl_kernel_impl.h +++ b/projects/rccl/src/device/msccl_kernel_impl.h @@ -48,7 +48,7 @@ extern __shared__ struct mscclShmemData mscclShmem; #endif inline __device__ static void barrier(int nthreads) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) assert(nthreads == NCCL_MAX_NTHREADS); #ifdef __GFX12__ __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier_signal -1\ns_barrier_wait -1"); diff --git a/projects/rccl/src/device/prims_ll.h b/projects/rccl/src/device/prims_ll.h index 5738003b5c..3be676a23f 100644 --- a/projects/rccl/src/device/prims_ll.h +++ b/projects/rccl/src/device/prims_ll.h @@ -69,7 +69,7 @@ private: uint64_t* barrier_next; inline __device__ void barrier() { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (nthreads != WARP_SIZE) barrier_by_group(); #else @@ -147,7 +147,7 @@ private: } #endif -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) union ncclLLFifoLine i4; do { #ifdef __GFX11__ @@ -190,7 +190,7 @@ private: for (int i=BeginIx; i < MaxRecv; i++) { if (i < fan.nrecv()) { union ncclLLFifoLine* src = recvPtr(i) + offset; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #ifdef __GFX11__ asm volatile ("global_load_b128 %0, %1, off glc slc dlc\n" "s_waitcnt vmcnt(0)\n" : "=v"(line[i].i4) : "v"(&src->i4)); @@ -217,7 +217,7 @@ private: #endif do { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #ifdef __GFX11__ asm volatile ("global_load_b128 %0, %1, off glc slc dlc\n" "s_waitcnt vmcnt(0)\n" : "=v"(line[i].i4) : "v"(&src->i4)); @@ -246,7 +246,7 @@ private: } __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) union ncclLLFifoLine i4; i4.data1 = val & 0xffffffff; i4.flag1 = flag; @@ -270,7 +270,7 @@ private: uint32_t u4; uint64_t u8; }; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if(sizeof(U) == 1) #ifdef __GFX11__ u1 = __atomic_load_n((uint8_t*)src, __ATOMIC_RELAXED); @@ -318,7 +318,7 @@ private: uint64_t u8; }; elt = val; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if(sizeof(U) == 1) __builtin_nontemporal_store(u1, (uint8_t*)dst); else if(sizeof(U) == 2) diff --git a/projects/rccl/src/device/prims_ll128.h b/projects/rccl/src/device/prims_ll128.h index 3ba38316fb..2b9cefa505 100644 --- a/projects/rccl/src/device/prims_ll128.h +++ b/projects/rccl/src/device/prims_ll128.h @@ -74,7 +74,7 @@ private: #endif inline __device__ void barrier() { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (nthreads != WARP_SIZE) barrier_by_group(); #else diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 96db86bd03..f7ac3d642c 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -1645,7 +1645,7 @@ static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) { // Ring/Tree channel tuning while (collInfo->nBytes < nc*nt*threadThreshold) { if (nc >= 2) nc--; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) // do not reduce threads count on VEGA #else else if ((nt % 128) == 0) nt/=2; @@ -1653,7 +1653,7 @@ static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) { else break; } } -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else if (collInfo->protocol == NCCL_PROTO_SIMPLE) { if (collInfo->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync diff --git a/projects/rccl/src/graph/paths.cc b/projects/rccl/src/graph/paths.cc index 21389f536b..b72c9b0466 100644 --- a/projects/rccl/src/graph/paths.cc +++ b/projects/rccl/src/graph/paths.cc @@ -304,7 +304,7 @@ compare: // Compute the PCI distance and compare with the p2pLevel. if (path->type <= p2pLevel) *p2p = 1; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else if (*p2p == 1) { // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to @@ -386,7 +386,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int if (read) { // For reads (sends) only enable under certain conditions int gdrReadParam = ncclParamNetGdrRead(); if (gdrReadParam == 0) return ncclSuccess; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else // Disable GDR Reads pre-Ampere when we have other PCI flows if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) { @@ -455,7 +455,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int int g; NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g)); struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else // Flush is required on Ampere and earlier *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush(); diff --git a/projects/rccl/src/graph/search.cc b/projects/rccl/src/graph/search.cc index 31d14a74e6..39f93a3bba 100644 --- a/projects/rccl/src/graph/search.cc +++ b/projects/rccl/src/graph/search.cc @@ -886,7 +886,7 @@ ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngp return ncclSuccess; } -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 }; #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float)) diff --git a/projects/rccl/src/graph/topo.cc b/projects/rccl/src/graph/topo.cc index 8419e27e5b..122f323b34 100644 --- a/projects/rccl/src/graph/topo.cc +++ b/projects/rccl/src/graph/topo.cc @@ -21,7 +21,7 @@ #define BUSID_REDUCED_SIZE (sizeof("0000:00")) const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" }; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) const char* topoLinkTypeStr[] = { "LOC", "XGMI", "", "PCI", "", "", "", "SYS", "NET" }; const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" }; #else @@ -376,7 +376,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s } ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) // There is no direct mapping between CUDA SM to HIP GFX. Use SM60 as compatibility level. gpu->gpu.cudaCompCap = 60; // Repurpose previously unused "sm" as CU counts @@ -526,7 +526,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s return ncclSuccess; } -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) { if (strcmp(node->name, "xgmi") == 0) { struct ncclTopoNode* gpu = NULL; @@ -668,7 +668,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem struct ncclXmlNode* node = topNode->subs[s]; if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem)); } -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) NCCLCHECK(ncclTopoAddXGMI(topNode, *topoSystem, NULL)); #else NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL)); diff --git a/projects/rccl/src/graph/tuning.cc b/projects/rccl/src/graph/tuning.cc index d9275c0f61..a0012b2b8e 100644 --- a/projects/rccl/src/graph/tuning.cc +++ b/projects/rccl/src/graph/tuning.cc @@ -303,7 +303,7 @@ static float getNetOverhead(struct ncclComm* comm) { ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) { int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS; comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize); comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS, comm->WarpSize); @@ -374,7 +374,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2); // Various model refinements -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (nNodes <= 2) busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p]; else @@ -510,7 +510,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom // Disable LL protocol on gfx12xx int pEnable = (p == NCCL_PROTO_LL && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx12")) ? 0 : protoEnable[p]; if (pEnable == 2 && p == NCCL_PROTO_LL128) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #if defined(ENABLE_LL128) // Enable LL128 by default only on gfx90a with available tuning table pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL && @@ -650,7 +650,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto } int logSize = log2i(info->nBytes>>6); -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (algorithm == NCCL_ALGO_TREE) { if (logSize < 27) bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][logSize]; else bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][26]; diff --git a/projects/rccl/src/graph/xml.cc b/projects/rccl/src/graph/xml.cc index 53ddf968ed..3cfc7437b9 100644 --- a/projects/rccl/src/graph/xml.cc +++ b/projects/rccl/src/graph/xml.cc @@ -266,7 +266,7 @@ ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlN return ncclSuccess; } ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) struct xmlHandler handlers[] = { { "xgmi", ncclTopoXmlLoadNvlink } }; #else struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } }; @@ -690,7 +690,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev struct ncclXmlNode* nvlNode = NULL; NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode)); if (nvlNode == NULL) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) const char* busId; NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId)); uint32_t deviceCnt; @@ -813,7 +813,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev // Fill target classes for (int s=0; snSubs; s++) { struct ncclXmlNode* sub = gpuNode->subs[s]; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (strcmp(sub->name, "xgmi") != 0) continue; #else if (strcmp(sub->name, "nvlink") != 0) continue; @@ -845,7 +845,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node)); NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03")); NCCLCHECK(ncclTopoGetXmlFromSys(node, xml)); -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) uint32_t devIndex = 0; static int rocmsmiInit = 0; if (rocmsmiInit == 0) { diff --git a/projects/rccl/src/include/comm.h b/projects/rccl/src/include/comm.h index 0f47b94911..4ecbac75b1 100644 --- a/projects/rccl/src/include/comm.h +++ b/projects/rccl/src/include/comm.h @@ -17,7 +17,7 @@ #include "nccl_net.h" #include "register.h" -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #define HIPRT_CB #else #if CUDART_VERSION < 9000 diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h index 669cbace1a..a75831149f 100644 --- a/projects/rccl/src/include/gdrwrap.h +++ b/projects/rccl/src/include/gdrwrap.h @@ -157,7 +157,7 @@ typedef struct gdr_mem_desc { gdr_mh_t gdrMh; } gdr_mem_desc_t; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) static gdr_t ncclGdrInit() { INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation"); return (gdr_t)0x12345678L; diff --git a/projects/rccl/src/include/nvtx.h b/projects/rccl/src/include/nvtx.h index ed5741edbc..bef5dd69d6 100644 --- a/projects/rccl/src/include/nvtx.h +++ b/projects/rccl/src/include/nvtx.h @@ -77,7 +77,7 @@ class payload_schema { // @param N schema name // @param S schema (entries) // @param P payload (struct) -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \ nvtxPayloadData_t nvtx3_bpl__[] = { \ {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \ diff --git a/projects/rccl/src/include/nvtx3/nvtx3.hpp b/projects/rccl/src/include/nvtx3/nvtx3.hpp index 61bda49ccc..7f17149580 100644 --- a/projects/rccl/src/include/nvtx3/nvtx3.hpp +++ b/projects/rccl/src/include/nvtx3/nvtx3.hpp @@ -2777,7 +2777,7 @@ inline void mark(Args const&... args) noexcept * `domain` to which the `registered_string_in` belongs. Else, * `domain::global` to indicate that the global NVTX domain should be used. */ -#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HCC__) && !defined(__HIPCC__) +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) #define NVTX3_V1_FUNC_RANGE_IN(D) \ static ::nvtx3::v1::registered_string_in const nvtx3_func_name__{__func__}; \ static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index 4330617c8f..2ffb721b8e 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -61,7 +61,7 @@ #define STR(v) STR2(v) #endif -#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream #else #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream @@ -724,7 +724,7 @@ fail: } // Pre-process the string so that running "strings" on the lib can quickly reveal the version. -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #define VERSION_STRING "RCCL version : " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX #define VERSION_STRING_EXTENDED "HIP version : " HIP_BUILD_INFO "\nROCm version : " ROCM_BUILD_INFO #else @@ -736,13 +736,13 @@ static void showVersion() { if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) { char hostInfo[HOST_NAME_MAX] = {}, libPathInfo[2048] = {}; size_t hostInfoSize = sizeof(hostInfo), libPathInfoSize = sizeof(libPathInfo); - + // Retrieve Hostname info if (gethostname(hostInfo, hostInfoSize-1) != 0) { // Returns Unknown in hostInfo if function call unsuccessful strncpy(hostInfo, "Unknown", hostInfoSize-1); } - + // Retrieve librccl path Dl_info pathInfo; if (dladdr((void*)ncclCommInitRank, &pathInfo)) { @@ -802,7 +802,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u info->comm = comm; info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap; -#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HCC__) && !defined(__HIPCC__) +#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__) // MNNVL support { // MNNVL: Request the fabric UUID and partition info diff --git a/projects/rccl/src/net.cc b/projects/rccl/src/net.cc index 1326ea50a5..5523ef6440 100644 --- a/projects/rccl/src/net.cc +++ b/projects/rccl/src/net.cc @@ -548,7 +548,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) { ncclNetProperties_t props; NCCLCHECK(comm->ncclNet->getProperties(dev, &props)); if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) gdrSupportMatrix[comm->cudaDev] = 1; break; #endif diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc index be284c13ed..2f01847799 100644 --- a/projects/rccl/src/transport/net.cc +++ b/projects/rccl/src/transport/net.cc @@ -153,7 +153,7 @@ struct recvNetResources { /* Determine if two peers can communicate with NET */ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { *ret = 1; -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else if (info1->hostHash == info2->hostHash) { // If on the same host, check intra-node net is not disabled. diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 5691f77c64..805ba52276 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -353,7 +353,7 @@ ncclResult_t ncclIbDevices(int* ndev) { ncclResult_t ncclIbGdrSupport() { static int moduleLoaded = -1; if (moduleLoaded == -1) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1; char strValue[MAX_STR_LEN]; NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue)); @@ -1100,7 +1100,7 @@ ib_recv: // Set the ece (enhanced connection establishment) on this QP before RTR if (remMeta.qpInfo[q].ece_supported) { NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported)); - + // Query the reduced ece for this QP (matching enhancements between the requestor and the responder) // Store this in our own qpInfo for returning to the requestor if (meta.qpInfo[q].ece_supported) diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc index 63a0c99dc0..163b9f3a50 100644 --- a/projects/rccl/src/transport/p2p.cc +++ b/projects/rccl/src/transport/p2p.cc @@ -106,7 +106,7 @@ static void initCeOperation(); /* Determine if two peers can communicate through p2p */ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) { initCeOperation(); -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) if (!info1->hasFineGrain || !info2->hasFineGrain) { *ret = 0; return ncclSuccess; @@ -146,7 +146,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop int cudaDev1 = busIdToCudaDev(info1->busId); int cudaDev2 = busIdToCudaDev(info2->busId); if (cudaDev1 == -1 || cudaDev2 == -1) { -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) || CUDART_VERSION >= 10010 +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) || CUDART_VERSION >= 10010 // CUDA 10.1 and later can use P2P with invisible devices. return ncclSuccess; #else @@ -165,7 +165,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop return ncclSuccess; } -#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) +#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) #else // This will always fail when using NCCL_CUMEM_ENABLE=1 if (p2p != 0 && !ncclCuMemEnable()) {