diff --git a/projects/rccl/ext-profiler/example/plugin.cc b/projects/rccl/ext-profiler/example/plugin.cc index f6d4956b3d..19b5042b3b 100644 --- a/projects/rccl/ext-profiler/example/plugin.cc +++ b/projects/rccl/ext-profiler/example/plugin.cc @@ -115,6 +115,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int* // pre-allocate memory for event object pools in dedicated profiler context struct context* ctx = (struct context *)calloc(1, sizeof(*ctx)); + if (ctx == nullptr) return ncclSystemError; ctx->commName = commName; ctx->commHash = commId; ctx->nranks = nranks; diff --git a/projects/rccl/src/enqueue.cc b/projects/rccl/src/enqueue.cc index 465e480c95..d7f7316f9e 100644 --- a/projects/rccl/src/enqueue.cc +++ b/projects/rccl/src/enqueue.cc @@ -1408,6 +1408,10 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment."); fifoBufHost = malloc(workBytes); #endif + if (fifoBufHost == nullptr) { + WARN("Failed to allocate %zu bytes for work FIFO buffer", workBytes); + return ncclSystemError; + } fifoCursor = 0; fifoMask = ~0u; break; diff --git a/projects/rccl/src/graph/rome_models.cc b/projects/rccl/src/graph/rome_models.cc index 7f5fa22473..c6c4c9ddc0 100644 --- a/projects/rccl/src/graph/rome_models.cc +++ b/projects/rccl/src/graph/rome_models.cc @@ -1492,6 +1492,10 @@ extern const char* topoPathTypeStr[]; static void parseOptions(struct ncclTopoSystem* system, const char *options) { if (strcmp(options, "")) { char *str_temp = (char *)malloc(strlen(options) + 1); + if (str_temp == nullptr) { + WARN("Failed to allocate memory for options parsing"); + return; + } strcpy(str_temp, options); char* tokens[MAX_OPT_TOKENS]; int numTokens = 0; @@ -1536,6 +1540,10 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) { static bool checkOption(const char *options, const char *name) { if (strcmp(options, "")) { char *str_temp = (char *)malloc(strlen(options) + 1); + if (str_temp == nullptr) { + WARN("Failed to allocate memory for options checking"); + return false; + } strcpy(str_temp, options); char* tokens[MAX_OPT_TOKENS]; int numTokens = 0; @@ -1932,6 +1940,10 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra int *g8, n[NCCL_TOPO_MAX_NODES]; int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int)); + if (all_gpu_permutations == nullptr) { + WARN("Failed to allocate memory for GPU permutations"); + return ncclSystemError; + } struct timeval tvs, tve; gettimeofday(&tvs, NULL); std::vector r(ngpus), g(ngpus), rdm(ngpus); @@ -2313,6 +2325,10 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS]; int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int)); + if (all_gpu_permutations == nullptr) { + WARN("Failed to allocate memory for GPU permutations"); + return ncclSystemError; + } struct timeval tvs, tve; gettimeofday(&tvs, NULL); std::vector r(ngpus), g(ngpus); diff --git a/projects/rccl/src/init.cc b/projects/rccl/src/init.cc index ea0421b8d0..e56d3b0e2d 100644 --- a/projects/rccl/src/init.cc +++ b/projects/rccl/src/init.cc @@ -517,6 +517,11 @@ static ncclResult_t commFree(ncclComm_t comm) { #ifdef ENABLE_PROFILING struct ncclProf *prof, *prof_seq; prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES); + if (prof == nullptr) { + WARN("Failed to allocate profiling buffer"); + // Skip profiling but continue with destruction + goto skip_profiling; + } CUDACHECK(hipMemcpy(prof, comm->devComm->devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost)); #define VEGA_GPU_RTC_FREQUENCY 2.5E7 for (int i=0; inChannels; i++) { @@ -529,6 +534,7 @@ static ncclResult_t commFree(ncclComm_t comm) { } free(prof); CUDACHECK(hipFree(comm->devComm->devProf)); +skip_profiling: #endif #ifdef ENABLE_COLLTRACE @@ -2137,6 +2143,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail); cuCount = devProp.multiProcessorCount; archName = (char*)malloc(strlen(devProp.gcnArchName) + 1); + if (archName == nullptr) { + WARN("Failed to allocate memory for architecture name"); + goto fail; + } strcpy(archName, devProp.gcnArchName); timers[TIMER_INIT_KERNELS] = clockNano(); @@ -2434,6 +2444,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) { if (tmpNetName != NULL) { int netNameLen = strlen(tmpNetName) + 1; comm->config.netName = (char*)malloc(netNameLen); + if (comm->config.netName == nullptr) { + WARN("Failed to allocate memory for network name"); + return ncclSystemError; + } memcpy((void*)comm->config.netName, tmpNetName, netNameLen); } else { comm->config.netName = NULL; diff --git a/projects/rccl/src/misc/msccl/msccl_parser.cc b/projects/rccl/src/misc/msccl/msccl_parser.cc index 6cd9d0fadd..57bfd06882 100644 --- a/projects/rccl/src/misc/msccl/msccl_parser.cc +++ b/projects/rccl/src/misc/msccl/msccl_parser.cc @@ -721,6 +721,10 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode* ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) { struct mscclXmlNode* node; node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode)); + if (node == nullptr) { + WARN("Failed to allocate memory for MSCCL XML node"); + return ncclSystemError; + } NCCLCHECK(mscclAlgoMetaXmlLoad(str, node)); algoMeta->filePath = str; diff --git a/projects/rccl/src/misc/roctx.cc b/projects/rccl/src/misc/roctx.cc index 16d68bb47d..2060e4a911 100644 --- a/projects/rccl/src/misc/roctx.cc +++ b/projects/rccl/src/misc/roctx.cc @@ -21,9 +21,18 @@ const char* ncclDataTypeStr[ncclNumTypes] = {"i8", "u8", "i32 void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) { // Allocate enough memory for numEntries in payloadEntries payloadInfo->payloadEntries = (roctxPayloadSchemaEntryInfo*)malloc(numEntries * sizeof(roctxPayloadSchemaEntryInfo)); + if (payloadInfo->payloadEntries == nullptr) { + payloadInfo->message = nullptr; + return; + } // Allocate memory for the message that will be constructed payloadInfo->message = (char*)malloc(MAX_MESSAGE_LENGTH * sizeof(char)); + if (payloadInfo->message == nullptr) { + free(payloadInfo->payloadEntries); + payloadInfo->payloadEntries = nullptr; + return; + } } void roctxFree(roctxPayloadInfo_t payloadInfo) { @@ -139,4 +148,4 @@ roctx_scoped_range_in::~roctx_scoped_range_in() noexcept { #endif roctxFree(&payloadInfo); } -} \ No newline at end of file +} diff --git a/projects/rccl/src/misc/shmutils.cc b/projects/rccl/src/misc/shmutils.cc index 29f97cea56..217020f913 100644 --- a/projects/rccl/src/misc/shmutils.cc +++ b/projects/rccl/src/misc/shmutils.cc @@ -37,6 +37,11 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS if (create) { int slen = strlen(shmPath); handle->shmPath = (char*)malloc(slen + 1); + if (handle->shmPath == nullptr) { + WARN("Failed to allocate memory for shared memory path"); + // handle->shmPath remains nullptr, caller should check + return; + } memcpy(handle->shmPath, shmPath, slen + 1); if (hptr) memset(hptr, 0, shmSize); } else { diff --git a/projects/rccl/src/misc/strongstream.cc b/projects/rccl/src/misc/strongstream.cc index 411c0c91a9..4f70ee3415 100644 --- a/projects/rccl/src/misc/strongstream.cc +++ b/projects/rccl/src/misc/strongstream.cc @@ -40,6 +40,10 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) { while (1) { if (p == nullptr) { p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext)); + if (p == nullptr) { + result = ncclSystemError; + goto leave; + } p->refCount = 1; p->hcontext = hcontext; p->next = cxtListHead; @@ -189,6 +193,10 @@ ncclResult_t ncclStrongStreamAcquire( cap = spare; if (cap == nullptr) { cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture)); + if (cap == nullptr) { + ret = ncclSystemError; + goto do_unlock; + } CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock); } cap->graphId = graph.graphId; diff --git a/projects/rccl/src/proxy.cc b/projects/rccl/src/proxy.cc index f8d3c2dfac..e4ada92ca1 100644 --- a/projects/rccl/src/proxy.cc +++ b/projects/rccl/src/proxy.cc @@ -96,6 +96,10 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v // Pre-alloc response buffer ex->respBuff = malloc(respSize); + if (ex->respBuff == nullptr && respSize > 0) { + free(ex); + return ncclSystemError; + } ex->respSize = respSize; ex->res = ncclInternalError; ex->done = false; @@ -1365,6 +1369,9 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec if (resp.opId != opId) { // Unexpected response, need to buffer the socket data respBuff = malloc(resp.respSize); + if (respBuff == nullptr) { + return ncclSystemError; + } } assert(respBuff != NULL); NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize)); @@ -1391,6 +1398,10 @@ ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnec // Alloc some memory to act as a handle ncclResult_t res = ncclSuccess; void* opId = malloc(1); + if (opId == nullptr) { + WARN("Failed to allocate proxy operation ID"); + return ncclSystemError; + } NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail); diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc index d2d7dbb0cc..c52fab0c1b 100644 --- a/projects/rccl/src/transport/coll_net.cc +++ b/projects/rccl/src/transport/coll_net.cc @@ -1281,6 +1281,10 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u if (*outRegBufFlag) { record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback)); + if (record == nullptr) { + WARN("Failed to allocate collnet cleanup callback"); + return ncclSystemError; + } record->base.fn = cleanupCollnet; record->comm = comm; record->reg = regRecord; diff --git a/projects/rccl/src/transport/net_ib.cc b/projects/rccl/src/transport/net_ib.cc index 7b0c62bf9c..deac53a3cf 100644 --- a/projects/rccl/src/transport/net_ib.cc +++ b/projects/rccl/src/transport/net_ib.cc @@ -2203,6 +2203,10 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui assert(size > 0); struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); + if (mhandleWrapper == nullptr) { + WARN("Failed to allocate IB MR handle wrapper"); + return ncclSystemError; + } for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i); diff --git a/projects/rccl/src/transport/net_ib_rocm.cc b/projects/rccl/src/transport/net_ib_rocm.cc index 200bbdf66d..8f0b9b19a4 100644 --- a/projects/rccl/src/transport/net_ib_rocm.cc +++ b/projects/rccl/src/transport/net_ib_rocm.cc @@ -2335,6 +2335,10 @@ ncclResult_t rocmIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui assert(size > 0); struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm; struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle)); + if (mhandleWrapper == nullptr) { + WARN("Failed to allocate IB MR handle wrapper"); + return ncclSystemError; + } for (int i = 0; i < base->vProps.ndevs; i++) { // Each ncclIbNetCommDevBase is at different offset in send and recv netComms struct ncclIbNetCommDevBase* devComm = rocmIbGetNetCommDevBase(base, i); diff --git a/projects/rccl/src/transport/nvls.cc b/projects/rccl/src/transport/nvls.cc index 1f13bb01b7..fcd7b14d0e 100644 --- a/projects/rccl/src/transport/nvls.cc +++ b/projects/rccl/src/transport/nvls.cc @@ -843,6 +843,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( if (*outRegBufUsed) { if (sendRegRecord) { sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); + if (sendRecord == nullptr) { + WARN("Failed to allocate NVLS send cleanup callback"); + return ncclSystemError; + } sendRecord->base.fn = cleanupNvls; sendRecord->reg = sendRegRecord; sendRecord->comm = comm; @@ -852,6 +856,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer( if (recvRegRecord) { recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback)); + if (recvRecord == nullptr) { + WARN("Failed to allocate NVLS recv cleanup callback"); + return ncclSystemError; + } recvRecord->base.fn = cleanupNvls; recvRecord->reg = recvRegRecord; recvRecord->comm = comm;