fix potential segfaults due to use after malloc fails (#2137)

* fix potential segfaults * replace NULL with nullptr --------- Co-authored-by: Prasannakumar Murugesan <prmuruge@amd.com> [ROCm/rccl commit: 4a32ec2501]
2026-01-20 14:11:29 -08:00
commit 520f309bb1
@@ -115,6 +115,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int*

  // pre-allocate memory for event object pools in dedicated profiler context
  struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
+  if (ctx == nullptr) return ncclSystemError;
  ctx->commName = commName;
  ctx->commHash = commId;
  ctx->nranks = nranks;
@@ -1408,6 +1408,10 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
    static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
    fifoBufHost = malloc(workBytes);
    #endif
+    if (fifoBufHost == nullptr) {
+      WARN("Failed to allocate %zu bytes for work FIFO buffer", workBytes);
+      return ncclSystemError;
+    }
    fifoCursor = 0;
    fifoMask = ~0u;
    break;
@@ -1492,6 +1492,10 @@ extern const char* topoPathTypeStr[];
 static void parseOptions(struct ncclTopoSystem* system, const char *options) {
  if (strcmp(options, "")) {
    char *str_temp = (char *)malloc(strlen(options) + 1);
+    if (str_temp == nullptr) {
+      WARN("Failed to allocate memory for options parsing");
+      return;
+    }
    strcpy(str_temp, options);
    char* tokens[MAX_OPT_TOKENS];
    int numTokens = 0;
@@ -1536,6 +1540,10 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
 static bool checkOption(const char *options, const char *name) {
  if (strcmp(options, "")) {
    char *str_temp = (char *)malloc(strlen(options) + 1);
+    if (str_temp == nullptr) {
+      WARN("Failed to allocate memory for options checking");
+      return false;
+    }
    strcpy(str_temp, options);
    char* tokens[MAX_OPT_TOKENS];
    int numTokens = 0;
@@ -1932,6 +1940,10 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra

  int *g8, n[NCCL_TOPO_MAX_NODES];
  int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
+  if (all_gpu_permutations == nullptr) {
+    WARN("Failed to allocate memory for GPU permutations");
+    return ncclSystemError;
+  }
  struct timeval tvs, tve;
  gettimeofday(&tvs, NULL);
  std::vector<int> r(ngpus), g(ngpus), rdm(ngpus);
@@ -2313,6 +2325,10 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra

  int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
  int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
+  if (all_gpu_permutations == nullptr) {
+    WARN("Failed to allocate memory for GPU permutations");
+    return ncclSystemError;
+  }
  struct timeval tvs, tve;
  gettimeofday(&tvs, NULL);
  std::vector<int> r(ngpus), g(ngpus);
@@ -517,6 +517,11 @@ static ncclResult_t commFree(ncclComm_t comm) {
 #ifdef ENABLE_PROFILING
  struct ncclProf *prof, *prof_seq;
  prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
+  if (prof == nullptr) {
+    WARN("Failed to allocate profiling buffer");
+    // Skip profiling but continue with destruction
+    goto skip_profiling;
+  }
  CUDACHECK(hipMemcpy(prof, comm->devComm->devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
  #define VEGA_GPU_RTC_FREQUENCY 2.5E7
  for (int i=0; i<comm->nChannels; i++) {
@@ -529,6 +534,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
  }
  free(prof);
  CUDACHECK(hipFree(comm->devComm->devProf));
+skip_profiling:
 #endif

 #ifdef ENABLE_COLLTRACE
@@ -2137,6 +2143,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
  CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
  cuCount = devProp.multiProcessorCount;
  archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
+  if (archName == nullptr) {
+    WARN("Failed to allocate memory for architecture name");
+    goto fail;
+  }
  strcpy(archName, devProp.gcnArchName);

  timers[TIMER_INIT_KERNELS] = clockNano();
@@ -2434,6 +2444,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
  if (tmpNetName != NULL) {
    int netNameLen = strlen(tmpNetName) + 1;
    comm->config.netName = (char*)malloc(netNameLen);
+    if (comm->config.netName == nullptr) {
+      WARN("Failed to allocate memory for network name");
+      return ncclSystemError;      
+    }
    memcpy((void*)comm->config.netName, tmpNetName, netNameLen);
  } else {
    comm->config.netName = NULL;
@@ -721,6 +721,10 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
 ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
  struct mscclXmlNode* node;
  node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
+  if (node == nullptr) {
+    WARN("Failed to allocate memory for MSCCL XML node");
+    return ncclSystemError;
+  }
  NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));

  algoMeta->filePath = str;
@@ -21,9 +21,18 @@ const char* ncclDataTypeStr[ncclNumTypes]                    = {"i8", "u8", "i32
 void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) {
  // Allocate enough memory for numEntries in payloadEntries
  payloadInfo->payloadEntries = (roctxPayloadSchemaEntryInfo*)malloc(numEntries * sizeof(roctxPayloadSchemaEntryInfo));
+  if (payloadInfo->payloadEntries == nullptr) {
+    payloadInfo->message = nullptr;
+    return;
+  }

  // Allocate memory for the message that will be constructed
  payloadInfo->message = (char*)malloc(MAX_MESSAGE_LENGTH * sizeof(char));
+  if (payloadInfo->message == nullptr) {
+    free(payloadInfo->payloadEntries);
+    payloadInfo->payloadEntries = nullptr;
+    return;
+  }
 }

 void roctxFree(roctxPayloadInfo_t payloadInfo) {
@@ -139,4 +148,4 @@ roctx_scoped_range_in::~roctx_scoped_range_in() noexcept {
 #endif
    roctxFree(&payloadInfo);
  }
-}
+}
@@ -37,6 +37,11 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
  if (create) {
    int slen = strlen(shmPath);
    handle->shmPath = (char*)malloc(slen + 1);
+    if (handle->shmPath == nullptr) {
+      WARN("Failed to allocate memory for shared memory path");
+      // handle->shmPath remains nullptr, caller should check
+      return;
+    }
    memcpy(handle->shmPath, shmPath, slen + 1);
    if (hptr) memset(hptr, 0, shmSize);
  } else {
@@ -40,6 +40,10 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) {
  while (1) {
    if (p == nullptr) {
      p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext));
+      if (p == nullptr) {
+        result = ncclSystemError;
+        goto leave;
+      }
      p->refCount = 1;
      p->hcontext = hcontext;
      p->next = cxtListHead;
@@ -189,6 +193,10 @@ ncclResult_t ncclStrongStreamAcquire(
      cap = spare;
      if (cap == nullptr) {
        cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture));
+        if (cap == nullptr) {
+          ret = ncclSystemError;
+          goto do_unlock;
+        }
        CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
      }
      cap->graphId = graph.graphId;
@@ -96,6 +96,10 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v

  // Pre-alloc response buffer
  ex->respBuff = malloc(respSize);
+  if (ex->respBuff == nullptr && respSize > 0) {
+    free(ex);
+    return ncclSystemError;
+  }
  ex->respSize = respSize;
  ex->res      = ncclInternalError;
  ex->done     = false;
@@ -1365,6 +1369,9 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
      if (resp.opId != opId) {
        // Unexpected response, need to buffer the socket data
        respBuff = malloc(resp.respSize);
+        if (respBuff == nullptr) {
+          return ncclSystemError;
+        }
      }
      assert(respBuff != NULL);
      NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
@@ -1391,6 +1398,10 @@ ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnec
  // Alloc some memory to act as a handle
  ncclResult_t res = ncclSuccess;
  void* opId = malloc(1);
+  if (opId == nullptr) {
+    WARN("Failed to allocate proxy operation ID");
+    return ncclSystemError;
+  }

  NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);

@@ -1281,6 +1281,10 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u

    if (*outRegBufFlag) {
      record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
+      if (record == nullptr) {
+        WARN("Failed to allocate collnet cleanup callback");
+        return ncclSystemError;
+      }
      record->base.fn = cleanupCollnet;
      record->comm = comm;
      record->reg = regRecord;
@@ -2203,6 +2203,10 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
  assert(size > 0);
  struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
  struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
+  if (mhandleWrapper == nullptr) {
+    WARN("Failed to allocate IB MR handle wrapper");
+    return ncclSystemError;
+  }
  for (int i = 0; i < base->vProps.ndevs; i++) {
    // Each ncclIbNetCommDevBase is at different offset in send and recv netComms
    struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
@@ -2335,6 +2335,10 @@ ncclResult_t rocmIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
  assert(size > 0);
  struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
  struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
+  if (mhandleWrapper == nullptr) {
+    WARN("Failed to allocate IB MR handle wrapper");
+    return ncclSystemError;
+  }
  for (int i = 0; i < base->vProps.ndevs; i++) {
    // Each ncclIbNetCommDevBase is at different offset in send and recv netComms
    struct ncclIbNetCommDevBase* devComm = rocmIbGetNetCommDevBase(base, i);
@@ -843,6 +843,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(
  if (*outRegBufUsed) {
    if (sendRegRecord) {
      sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      if (sendRecord == nullptr) {
+        WARN("Failed to allocate NVLS send cleanup callback");
+        return ncclSystemError;
+      }
      sendRecord->base.fn = cleanupNvls;
      sendRecord->reg = sendRegRecord;
      sendRecord->comm = comm;
@@ -852,6 +856,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(

    if (recvRegRecord) {
      recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
+      if (recvRecord == nullptr) {
+        WARN("Failed to allocate NVLS recv cleanup callback");
+        return ncclSystemError;
+      }
      recvRecord->base.fn = cleanupNvls;
      recvRecord->reg = recvRegRecord;
      recvRecord->comm = comm;