fix potential segfaults due to use after malloc fails (#2137)

* fix potential segfaults

* replace NULL with nullptr

---------

Co-authored-by: Prasannakumar Murugesan <prmuruge@amd.com>

[ROCm/rccl commit: 4a32ec2501]
Este commit está contenido en:
prasanna-amd
2026-01-20 14:11:29 -08:00
cometido por GitHub
padre bb47eee7cc
commit 520f309bb1
Se han modificado 13 ficheros con 93 adiciones y 1 borrados
@@ -115,6 +115,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int*
// pre-allocate memory for event object pools in dedicated profiler context
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
if (ctx == nullptr) return ncclSystemError;
ctx->commName = commName;
ctx->commHash = commId;
ctx->nranks = nranks;
+4
Ver fichero
@@ -1408,6 +1408,10 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
fifoBufHost = malloc(workBytes);
#endif
if (fifoBufHost == nullptr) {
WARN("Failed to allocate %zu bytes for work FIFO buffer", workBytes);
return ncclSystemError;
}
fifoCursor = 0;
fifoMask = ~0u;
break;
+16
Ver fichero
@@ -1492,6 +1492,10 @@ extern const char* topoPathTypeStr[];
static void parseOptions(struct ncclTopoSystem* system, const char *options) {
if (strcmp(options, "")) {
char *str_temp = (char *)malloc(strlen(options) + 1);
if (str_temp == nullptr) {
WARN("Failed to allocate memory for options parsing");
return;
}
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
@@ -1536,6 +1540,10 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
static bool checkOption(const char *options, const char *name) {
if (strcmp(options, "")) {
char *str_temp = (char *)malloc(strlen(options) + 1);
if (str_temp == nullptr) {
WARN("Failed to allocate memory for options checking");
return false;
}
strcpy(str_temp, options);
char* tokens[MAX_OPT_TOKENS];
int numTokens = 0;
@@ -1932,6 +1940,10 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
int *g8, n[NCCL_TOPO_MAX_NODES];
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
if (all_gpu_permutations == nullptr) {
WARN("Failed to allocate memory for GPU permutations");
return ncclSystemError;
}
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
std::vector<int> r(ngpus), g(ngpus), rdm(ngpus);
@@ -2313,6 +2325,10 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
if (all_gpu_permutations == nullptr) {
WARN("Failed to allocate memory for GPU permutations");
return ncclSystemError;
}
struct timeval tvs, tve;
gettimeofday(&tvs, NULL);
std::vector<int> r(ngpus), g(ngpus);
+14
Ver fichero
@@ -517,6 +517,11 @@ static ncclResult_t commFree(ncclComm_t comm) {
#ifdef ENABLE_PROFILING
struct ncclProf *prof, *prof_seq;
prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
if (prof == nullptr) {
WARN("Failed to allocate profiling buffer");
// Skip profiling but continue with destruction
goto skip_profiling;
}
CUDACHECK(hipMemcpy(prof, comm->devComm->devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
for (int i=0; i<comm->nChannels; i++) {
@@ -529,6 +534,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
}
free(prof);
CUDACHECK(hipFree(comm->devComm->devProf));
skip_profiling:
#endif
#ifdef ENABLE_COLLTRACE
@@ -2137,6 +2143,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
cuCount = devProp.multiProcessorCount;
archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
if (archName == nullptr) {
WARN("Failed to allocate memory for architecture name");
goto fail;
}
strcpy(archName, devProp.gcnArchName);
timers[TIMER_INIT_KERNELS] = clockNano();
@@ -2434,6 +2444,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
if (tmpNetName != NULL) {
int netNameLen = strlen(tmpNetName) + 1;
comm->config.netName = (char*)malloc(netNameLen);
if (comm->config.netName == nullptr) {
WARN("Failed to allocate memory for network name");
return ncclSystemError;
}
memcpy((void*)comm->config.netName, tmpNetName, netNameLen);
} else {
comm->config.netName = NULL;
@@ -721,6 +721,10 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
struct mscclXmlNode* node;
node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
if (node == nullptr) {
WARN("Failed to allocate memory for MSCCL XML node");
return ncclSystemError;
}
NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));
algoMeta->filePath = str;
+10 -1
Ver fichero
@@ -21,9 +21,18 @@ const char* ncclDataTypeStr[ncclNumTypes] = {"i8", "u8", "i32
void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) {
// Allocate enough memory for numEntries in payloadEntries
payloadInfo->payloadEntries = (roctxPayloadSchemaEntryInfo*)malloc(numEntries * sizeof(roctxPayloadSchemaEntryInfo));
if (payloadInfo->payloadEntries == nullptr) {
payloadInfo->message = nullptr;
return;
}
// Allocate memory for the message that will be constructed
payloadInfo->message = (char*)malloc(MAX_MESSAGE_LENGTH * sizeof(char));
if (payloadInfo->message == nullptr) {
free(payloadInfo->payloadEntries);
payloadInfo->payloadEntries = nullptr;
return;
}
}
void roctxFree(roctxPayloadInfo_t payloadInfo) {
@@ -139,4 +148,4 @@ roctx_scoped_range_in::~roctx_scoped_range_in() noexcept {
#endif
roctxFree(&payloadInfo);
}
}
}
+5
Ver fichero
@@ -37,6 +37,11 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
if (create) {
int slen = strlen(shmPath);
handle->shmPath = (char*)malloc(slen + 1);
if (handle->shmPath == nullptr) {
WARN("Failed to allocate memory for shared memory path");
// handle->shmPath remains nullptr, caller should check
return;
}
memcpy(handle->shmPath, shmPath, slen + 1);
if (hptr) memset(hptr, 0, shmSize);
} else {
+8
Ver fichero
@@ -40,6 +40,10 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) {
while (1) {
if (p == nullptr) {
p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext));
if (p == nullptr) {
result = ncclSystemError;
goto leave;
}
p->refCount = 1;
p->hcontext = hcontext;
p->next = cxtListHead;
@@ -189,6 +193,10 @@ ncclResult_t ncclStrongStreamAcquire(
cap = spare;
if (cap == nullptr) {
cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture));
if (cap == nullptr) {
ret = ncclSystemError;
goto do_unlock;
}
CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
}
cap->graphId = graph.graphId;
+11
Ver fichero
@@ -96,6 +96,10 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
// Pre-alloc response buffer
ex->respBuff = malloc(respSize);
if (ex->respBuff == nullptr && respSize > 0) {
free(ex);
return ncclSystemError;
}
ex->respSize = respSize;
ex->res = ncclInternalError;
ex->done = false;
@@ -1365,6 +1369,9 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
if (resp.opId != opId) {
// Unexpected response, need to buffer the socket data
respBuff = malloc(resp.respSize);
if (respBuff == nullptr) {
return ncclSystemError;
}
}
assert(respBuff != NULL);
NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
@@ -1391,6 +1398,10 @@ ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnec
// Alloc some memory to act as a handle
ncclResult_t res = ncclSuccess;
void* opId = malloc(1);
if (opId == nullptr) {
WARN("Failed to allocate proxy operation ID");
return ncclSystemError;
}
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
+4
Ver fichero
@@ -1281,6 +1281,10 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
if (*outRegBufFlag) {
record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
if (record == nullptr) {
WARN("Failed to allocate collnet cleanup callback");
return ncclSystemError;
}
record->base.fn = cleanupCollnet;
record->comm = comm;
record->reg = regRecord;
+4
Ver fichero
@@ -2203,6 +2203,10 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
assert(size > 0);
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
if (mhandleWrapper == nullptr) {
WARN("Failed to allocate IB MR handle wrapper");
return ncclSystemError;
}
for (int i = 0; i < base->vProps.ndevs; i++) {
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
+4
Ver fichero
@@ -2335,6 +2335,10 @@ ncclResult_t rocmIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
assert(size > 0);
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
if (mhandleWrapper == nullptr) {
WARN("Failed to allocate IB MR handle wrapper");
return ncclSystemError;
}
for (int i = 0; i < base->vProps.ndevs; i++) {
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
struct ncclIbNetCommDevBase* devComm = rocmIbGetNetCommDevBase(base, i);
+8
Ver fichero
@@ -843,6 +843,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(
if (*outRegBufUsed) {
if (sendRegRecord) {
sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
if (sendRecord == nullptr) {
WARN("Failed to allocate NVLS send cleanup callback");
return ncclSystemError;
}
sendRecord->base.fn = cleanupNvls;
sendRecord->reg = sendRegRecord;
sendRecord->comm = comm;
@@ -852,6 +856,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(
if (recvRegRecord) {
recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
if (recvRecord == nullptr) {
WARN("Failed to allocate NVLS recv cleanup callback");
return ncclSystemError;
}
recvRecord->base.fn = cleanupNvls;
recvRecord->reg = recvRegRecord;
recvRecord->comm = comm;