removed hcc mentions (#1291)

[ROCm/rccl commit: d6c317d6ae]
2024-08-14 15:04:13 -06:00
parent 09fe7f9b97
commit 38e189bb1e
19 changed files with 44 additions and 45 deletions
@@ -125,8 +125,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)  # Without this line, it will add -std=gnu++14 ins
 list(APPEND CMAKE_PREFIX_PATH  # Add ROCM_PATH to CMake search paths (for finding HIP / HSA
            ${ROCM_PATH}
            ${ROCM_PATH}/hip
-            ${ROCM_PATH}/llvm
-            ${ROCM_PATH}/hcc)
+            ${ROCM_PATH}/llvm)

 # Check for required dependencies
 #==================================================================================================
@@ -48,7 +48,7 @@ extern __shared__ struct mscclShmemData mscclShmem;
 #endif

 inline __device__ static void barrier(int nthreads) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  assert(nthreads == NCCL_MAX_NTHREADS);
  #ifdef __GFX12__
    __asm__ __volatile__("s_waitcnt vmcnt(0) lgkmcnt(0)\ns_barrier_signal -1\ns_barrier_wait -1");
@@ -69,7 +69,7 @@ private:
  uint64_t* barrier_next;

  inline __device__ void barrier() {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    if (nthreads != WARP_SIZE)
      barrier_by_group();
 #else
@@ -147,7 +147,7 @@ private:
    }
 #endif

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    union ncclLLFifoLine i4;
    do {
 #ifdef __GFX11__
@@ -190,7 +190,7 @@ private:
    for (int i=BeginIx; i < MaxRecv; i++) {
      if (i < fan.nrecv()) {
        union ncclLLFifoLine* src = recvPtr(i) + offset;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #ifdef __GFX11__
        asm volatile ("global_load_b128 %0, %1, off glc slc dlc\n"
          "s_waitcnt vmcnt(0)\n" : "=v"(line[i].i4) : "v"(&src->i4));
@@ -217,7 +217,7 @@ private:
 #endif

    do {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #ifdef __GFX11__
      asm volatile ("global_load_b128 %0, %1, off glc slc dlc\n"
        "s_waitcnt vmcnt(0)\n" : "=v"(line[i].i4) : "v"(&src->i4));
@@ -246,7 +246,7 @@ private:
  }

  __device__ void storeLL(union ncclLLFifoLine* dst, uint64_t val, uint32_t flag) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    union ncclLLFifoLine i4;
    i4.data1 = val & 0xffffffff;
    i4.flag1 = flag;
@@ -270,7 +270,7 @@ private:
      uint32_t u4;
      uint64_t u8;
    };
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    if(sizeof(U) == 1)
 #ifdef __GFX11__
      u1 = __atomic_load_n((uint8_t*)src, __ATOMIC_RELAXED);
@@ -318,7 +318,7 @@ private:
      uint64_t u8;
    };
    elt = val;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    if(sizeof(U) == 1)
      __builtin_nontemporal_store(u1, (uint8_t*)dst);
    else if(sizeof(U) == 2)
@@ -74,7 +74,7 @@ private:
 #endif

  inline __device__ void barrier() {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  if (nthreads != WARP_SIZE)
    barrier_by_group();
 #else
@@ -1645,7 +1645,7 @@ static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) {
    // Ring/Tree channel tuning
    while (collInfo->nBytes < nc*nt*threadThreshold) {
      if (nc >= 2) nc--;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
      // do not reduce threads count on VEGA
 #else
      else if ((nt % 128) == 0) nt/=2;
@@ -1653,7 +1653,7 @@ static ncclResult_t getChannnelThreadInfo(struct ncclInfo* collInfo) {
      else break;
    }
  }
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  if (collInfo->protocol == NCCL_PROTO_SIMPLE) {
    if (collInfo->algorithm == NCCL_ALGO_RING) nt += WARP_SIZE; // Extra warp for sync
@@ -304,7 +304,7 @@ compare:
  // Compute the PCI distance and compare with the p2pLevel.
  if (path->type <= p2pLevel) *p2p = 1;

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  if (*p2p == 1) {
    // NCCL_IGNORE_DISABLED_P2P=2 is used by unit tests that don't want to
@@ -386,7 +386,7 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int
  if (read) { // For reads (sends) only enable under certain conditions
    int gdrReadParam = ncclParamNetGdrRead();
    if (gdrReadParam == 0) return ncclSuccess;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
    // Disable GDR Reads pre-Ampere when we have other PCI flows
    if (gdrReadParam < 0 && gpu->gpu.cudaCompCap < 80) {
@@ -455,7 +455,7 @@ ncclResult_t ncclTopoNeedFlush(struct ncclTopoSystem* system, int64_t busId, int
  int g;
  NCCLCHECK(ncclTopoIdToIndex(system, GPU, busId, &g));
  struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  // Flush is required on Ampere and earlier
  *flush = gpu->gpu.cudaCompCap < 90 ? 1 : ncclParamNetForceFlush();
@@ -886,7 +886,7 @@ ncclResult_t ncclTopoDupChannels(struct ncclTopoGraph* graph, int ccMin, int ngp
  return ncclSuccess;
 }

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 float speedArrayIntra[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 float speedArrayInter[] = { 48.0, 24.0, 20.0, 18.0, 15.0, 12.0, 10.0, 9.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.4, 1.2, 0.24, 0.12 };
 #define NSPEEDSINTRA (sizeof(speedArrayIntra)/sizeof(float))
@@ -21,7 +21,7 @@
 #define BUSID_REDUCED_SIZE (sizeof("0000:00"))

 const char* topoNodeTypeStr[] = { "GPU", "PCI", "NVS", "CPU", "NIC", "NET" };
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 const char* topoLinkTypeStr[] = { "LOC", "XGMI", "",    "PCI",    "",    "",    "", "SYS", "NET" };
 const char* topoPathTypeStr[] = { "LOC", "XGMI", "NVB", "PIX", "PXB", "PXN", "PHB", "SYS", "DIS" };
 #else
@@ -376,7 +376,7 @@ ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* s
 }

 ncclResult_t ncclTopoAddGpu(struct ncclXmlNode* xmlGpu, struct ncclTopoSystem* system, struct ncclTopoNode* gpu) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  // There is no direct mapping between CUDA SM to HIP GFX. Use SM60 as compatibility level.
  gpu->gpu.cudaCompCap = 60;
  // Repurpose previously unused "sm" as CU counts
@@ -526,7 +526,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
  return ncclSuccess;
 }

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 ncclResult_t ncclTopoAddXGMI(struct ncclXmlNode* node, struct ncclTopoSystem* system, const char* parentBusId) {
  if (strcmp(node->name, "xgmi") == 0) {
    struct ncclTopoNode* gpu = NULL;
@@ -668,7 +668,7 @@ ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem
    struct ncclXmlNode* node = topNode->subs[s];
    if (strcmp(node->name, "cpu") == 0) NCCLCHECK(ncclTopoAddCpu(node, *topoSystem));
  }
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  NCCLCHECK(ncclTopoAddXGMI(topNode, *topoSystem, NULL));
 #else
  NCCLCHECK(ncclTopoAddNvLinks(topNode, *topoSystem, NULL));
@@ -303,7 +303,7 @@ static float getNetOverhead(struct ncclComm* comm) {
 ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
  int simpleDefaultThreads = (graphs[NCCL_ALGO_RING]->bwIntra*graphs[NCCL_ALGO_RING]->nChannels <= PCI_BW) ? 256 : NCCL_SIMPLE_MAX_NTHREADS;
  comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] =
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, simpleDefaultThreads, comm->WarpSize);
  comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] =
    getNthreads("NCCL_NTHREADS", ncclParamNthreads(), 4*comm->WarpSize, NCCL_MAX_NTHREADS, NCCL_MAX_NTHREADS, comm->WarpSize);
@@ -374,7 +374,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
        if (a == NCCL_ALGO_NVLS_TREE) bw = std::min(graphs[a]->bwIntra, nNodes <= 2 ? graphs[a]->bwInter : graphs[a]->bwInter/2);

        // Various model refinements
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
        if (nNodes <= 2)
          busBw *= rcclTuningModel[comm->topo->tuning].bwRatio[0][a][p];
        else
@@ -510,7 +510,7 @@ ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCom
    // Disable LL protocol on gfx12xx
    int pEnable = (p == NCCL_PROTO_LL && IsArchMatch(comm->topo->nodes[GPU].nodes[0].gpu.gcn, "gfx12")) ? 0 : protoEnable[p];
    if (pEnable == 2 && p == NCCL_PROTO_LL128) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #if defined(ENABLE_LL128)
      // Enable LL128 by default only on gfx90a with available tuning table
      pEnable = (graphs[a]->typeInter <= PATH_PXB) && graphs[a]->typeIntra <= PATH_NVL &&
@@ -650,7 +650,7 @@ ncclResult_t ncclTopoGetAlgoTime(struct ncclInfo* info, int algorithm, int proto
  }
  int logSize = log2i(info->nBytes>>6);

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  if (algorithm == NCCL_ALGO_TREE) {
    if (logSize < 27) bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][logSize];
    else bw *= rcclTuningModel[info->comm->topo->tuning].treeCorrectionFactor[protocol][26];
@@ -266,7 +266,7 @@ ncclResult_t ncclTopoXmlLoadC2c(FILE* file, struct ncclXml* xml, struct ncclXmlN
  return ncclSuccess;
 }
 ncclResult_t ncclTopoXmlLoadGpu(FILE* file, struct ncclXml* xml, struct ncclXmlNode* head) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  struct xmlHandler handlers[] = { { "xgmi", ncclTopoXmlLoadNvlink } };
 #else
  struct xmlHandler handlers[] = { { "nvlink", ncclTopoXmlLoadNvlink }, { "c2c", ncclTopoXmlLoadC2c } };
@@ -690,7 +690,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
  struct ncclXmlNode* nvlNode = NULL;
  NCCLCHECK(xmlGetSub(gpuNode, "nvlink", &nvlNode));
  if (nvlNode == NULL) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    const char* busId;
    NCCLCHECK(xmlGetAttr(pciNode, "busid", &busId));
    uint32_t deviceCnt;
@@ -813,7 +813,7 @@ ncclResult_t ncclTopoGetXmlFromGpu(struct ncclXmlNode* pciNode, uint32_t rocmDev
  // Fill target classes
  for (int s=0; s<gpuNode->nSubs; s++) {
    struct ncclXmlNode* sub = gpuNode->subs[s];
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    if (strcmp(sub->name, "xgmi") != 0) continue;
 #else
    if (strcmp(sub->name, "nvlink") != 0) continue;
@@ -845,7 +845,7 @@ ncclResult_t ncclTopoFillGpu(struct ncclXml* xml, const char* busId, struct nccl
  NCCLCHECK(ncclTopoGetPciNode(xml, busId, &node));
  NCCLCHECK(xmlSetAttrIfUnset(node, "class", "0x03"));
  NCCLCHECK(ncclTopoGetXmlFromSys(node, xml));
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  uint32_t devIndex = 0;
  static int rocmsmiInit = 0;
  if (rocmsmiInit == 0) {
@@ -17,7 +17,7 @@
 #include "nccl_net.h"
 #include "register.h"

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #define HIPRT_CB
 #else
 #if CUDART_VERSION < 9000
@@ -157,7 +157,7 @@ typedef struct gdr_mem_desc {
  gdr_mh_t gdrMh;
 } gdr_mem_desc_t;

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 static gdr_t ncclGdrInit() {
  INFO(NCCL_INIT, "Enabled GDRCopy equivalent memory allocation");
  return (gdr_t)0x12345678L;
@@ -77,7 +77,7 @@ class payload_schema {
 // @param N  schema name
 // @param S  schema (entries)
 // @param P  payload (struct)
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #define NVTX3_FUNC_WITH_PARAMS(ID, S, P) \
  nvtxPayloadData_t nvtx3_bpl__[] = { \
    {NVTX_PAYLOAD_ENTRY_TYPE_SCHEMA_ID_STATIC_START + NVTX_SID_##ID, sizeof(P), &(P)}}; \
@@ -2777,7 +2777,7 @@ inline void mark(Args const&... args) noexcept
 * `domain` to which the `registered_string_in` belongs. Else,
 * `domain::global` to  indicate that the global NVTX domain should be used.
 */
-#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HCC__) && !defined(__HIPCC__)
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
 #define NVTX3_V1_FUNC_RANGE_IN(D)                                                  \
  static ::nvtx3::v1::registered_string_in<D> const nvtx3_func_name__{__func__};   \
  static ::nvtx3::v1::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \
@@ -61,7 +61,7 @@
  #define STR(v) STR2(v)
 #endif

-#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if CUDART_VERSION >= 9020 || defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #define NCCL_GROUP_CUDA_STREAM 0 // CGMD: CUDA 9.2,10.X Don't need to use an internal CUDA stream
 #else
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
@@ -724,7 +724,7 @@ fail:
 }

 // Pre-process the string so that running "strings" on the lib can quickly reveal the version.
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #define VERSION_STRING "RCCL version : " STR(NCCL_MAJOR) "." STR(NCCL_MINOR) "." STR(NCCL_PATCH) NCCL_SUFFIX
 #define VERSION_STRING_EXTENDED "HIP version  : " HIP_BUILD_INFO "\nROCm version : " ROCM_BUILD_INFO
 #else
@@ -736,13 +736,13 @@ static void showVersion() {
  if (shown == 0 && ncclDebugLevel >= NCCL_LOG_VERSION) {
    char hostInfo[HOST_NAME_MAX] = {}, libPathInfo[2048] = {};
    size_t hostInfoSize = sizeof(hostInfo), libPathInfoSize = sizeof(libPathInfo);
-    
+
    // Retrieve Hostname info
    if (gethostname(hostInfo, hostInfoSize-1) != 0) {
      // Returns Unknown in hostInfo if function call unsuccessful
      strncpy(hostInfo, "Unknown", hostInfoSize-1);
    }
-    
+
    // Retrieve librccl path
    Dl_info pathInfo;
    if (dladdr((void*)ncclCommInitRank, &pathInfo)) {
@@ -802,7 +802,7 @@ static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, u
  info->comm = comm;
  info->cudaCompCap = comm->minCompCap = comm->maxCompCap = comm->compCap;

-#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HCC__) && !defined(__HIPCC__)
+#if !defined(__HIP_PLATFORM_AMD__) && !defined(__HIPCC__)
  // MNNVL support
  {
    // MNNVL: Request the fabric UUID and partition info
@@ -548,7 +548,7 @@ ncclResult_t ncclGpuGdrSupport(struct ncclComm* comm, int* gdrSupport) {
      ncclNetProperties_t props;
      NCCLCHECK(comm->ncclNet->getProperties(dev, &props));
      if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
      gdrSupportMatrix[comm->cudaDev] = 1;
      break;
 #endif
@@ -153,7 +153,7 @@ struct recvNetResources {
 /* Determine if two peers can communicate with NET */
 static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  *ret = 1;
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  if (info1->hostHash == info2->hostHash) {
    // If on the same host, check intra-node net is not disabled.
@@ -353,7 +353,7 @@ ncclResult_t ncclIbDevices(int* ndev) {
 ncclResult_t ncclIbGdrSupport() {
  static int moduleLoaded = -1;
  if (moduleLoaded == -1) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
    moduleLoaded = (access("/sys/kernel/mm/memory_peers/amdkfd/version", F_OK) == -1) ? 0 : 1;
    char strValue[MAX_STR_LEN];
    NCCLCHECK(ncclTopoGetStrFromSys("/sys/devices/virtual/dmi/id", "bios_version", strValue));
@@ -1100,7 +1100,7 @@ ib_recv:
    // Set the ece (enhanced connection establishment) on this QP before RTR
    if (remMeta.qpInfo[q].ece_supported) {
      NCCLCHECK(wrap_ibv_set_ece(qp->qp, &remMeta.qpInfo[q].ece, &meta.qpInfo[q].ece_supported));
-  
+
      // Query the reduced ece for this QP (matching enhancements between the requestor and the responder)
      // Store this in our own qpInfo for returning to the requestor
      if (meta.qpInfo[q].ece_supported)
@@ -106,7 +106,7 @@ static void initCeOperation();
 /* Determine if two peers can communicate through p2p */
 ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTopoGraph* graph, struct ncclPeerInfo* info1, struct ncclPeerInfo* info2) {
  initCeOperation();
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
  if (!info1->hasFineGrain || !info2->hasFineGrain)  {
    *ret = 0;
    return ncclSuccess;
@@ -146,7 +146,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
  int cudaDev1 = busIdToCudaDev(info1->busId);
  int cudaDev2 = busIdToCudaDev(info2->busId);
  if (cudaDev1 == -1 || cudaDev2 == -1) {
-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__) || CUDART_VERSION >= 10010
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) || CUDART_VERSION >= 10010
    // CUDA 10.1 and later can use P2P with invisible devices.
    return ncclSuccess;
 #else
@@ -165,7 +165,7 @@ ncclResult_t p2pCanConnect(int* ret, struct ncclTopoSystem* topo, struct ncclTop
    return ncclSuccess;
  }

-#if defined(__HIP_PLATFORM_AMD__) || defined(__HCC__) || defined(__HIPCC__)
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__)
 #else
  // This will always fail when using NCCL_CUMEM_ENABLE=1
  if (p2p != 0 && !ncclCuMemEnable()) {