fix potential segfaults due to use after malloc fails (#2137)
* fix potential segfaults
* replace NULL with nullptr
---------
Co-authored-by: Prasannakumar Murugesan <prmuruge@amd.com>
[ROCm/rccl commit: 4a32ec2501]
Este commit está contenido en:
@@ -115,6 +115,7 @@ __hidden ncclResult_t exampleProfilerInit(void** context, uint64_t commId, int*
|
||||
|
||||
// pre-allocate memory for event object pools in dedicated profiler context
|
||||
struct context* ctx = (struct context *)calloc(1, sizeof(*ctx));
|
||||
if (ctx == nullptr) return ncclSystemError;
|
||||
ctx->commName = commName;
|
||||
ctx->commHash = commId;
|
||||
ctx->nranks = nranks;
|
||||
|
||||
@@ -1408,6 +1408,10 @@ static ncclResult_t uploadWork(struct ncclComm* comm, struct ncclKernelPlan* pla
|
||||
static_assert(16 <= alignof(max_align_t), "We rely on 16-byte alignment.");
|
||||
fifoBufHost = malloc(workBytes);
|
||||
#endif
|
||||
if (fifoBufHost == nullptr) {
|
||||
WARN("Failed to allocate %zu bytes for work FIFO buffer", workBytes);
|
||||
return ncclSystemError;
|
||||
}
|
||||
fifoCursor = 0;
|
||||
fifoMask = ~0u;
|
||||
break;
|
||||
|
||||
@@ -1492,6 +1492,10 @@ extern const char* topoPathTypeStr[];
|
||||
static void parseOptions(struct ncclTopoSystem* system, const char *options) {
|
||||
if (strcmp(options, "")) {
|
||||
char *str_temp = (char *)malloc(strlen(options) + 1);
|
||||
if (str_temp == nullptr) {
|
||||
WARN("Failed to allocate memory for options parsing");
|
||||
return;
|
||||
}
|
||||
strcpy(str_temp, options);
|
||||
char* tokens[MAX_OPT_TOKENS];
|
||||
int numTokens = 0;
|
||||
@@ -1536,6 +1540,10 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
|
||||
static bool checkOption(const char *options, const char *name) {
|
||||
if (strcmp(options, "")) {
|
||||
char *str_temp = (char *)malloc(strlen(options) + 1);
|
||||
if (str_temp == nullptr) {
|
||||
WARN("Failed to allocate memory for options checking");
|
||||
return false;
|
||||
}
|
||||
strcpy(str_temp, options);
|
||||
char* tokens[MAX_OPT_TOKENS];
|
||||
int numTokens = 0;
|
||||
@@ -1932,6 +1940,10 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
|
||||
|
||||
int *g8, n[NCCL_TOPO_MAX_NODES];
|
||||
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
|
||||
if (all_gpu_permutations == nullptr) {
|
||||
WARN("Failed to allocate memory for GPU permutations");
|
||||
return ncclSystemError;
|
||||
}
|
||||
struct timeval tvs, tve;
|
||||
gettimeofday(&tvs, NULL);
|
||||
std::vector<int> r(ngpus), g(ngpus), rdm(ngpus);
|
||||
@@ -2313,6 +2325,10 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
|
||||
|
||||
int *g16, n[NCCL_TOPO_MAX_NODES], rdm[NUMA_GPUS*NUMA_CPUS];
|
||||
int *all_gpu_permutations = (int *)malloc(TOTAL_PERMUTE_COUNT*NUMA_CPUS*NUMA_GPUS*sizeof(int));
|
||||
if (all_gpu_permutations == nullptr) {
|
||||
WARN("Failed to allocate memory for GPU permutations");
|
||||
return ncclSystemError;
|
||||
}
|
||||
struct timeval tvs, tve;
|
||||
gettimeofday(&tvs, NULL);
|
||||
std::vector<int> r(ngpus), g(ngpus);
|
||||
|
||||
@@ -517,6 +517,11 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
#ifdef ENABLE_PROFILING
|
||||
struct ncclProf *prof, *prof_seq;
|
||||
prof = (struct ncclProf*)malloc(sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES);
|
||||
if (prof == nullptr) {
|
||||
WARN("Failed to allocate profiling buffer");
|
||||
// Skip profiling but continue with destruction
|
||||
goto skip_profiling;
|
||||
}
|
||||
CUDACHECK(hipMemcpy(prof, comm->devComm->devProf, sizeof(struct ncclProf)*MAXCHANNELS*PROFILE_NUM_LAUNCHES, hipMemcpyDeviceToHost));
|
||||
#define VEGA_GPU_RTC_FREQUENCY 2.5E7
|
||||
for (int i=0; i<comm->nChannels; i++) {
|
||||
@@ -529,6 +534,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
|
||||
}
|
||||
free(prof);
|
||||
CUDACHECK(hipFree(comm->devComm->devProf));
|
||||
skip_profiling:
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_COLLTRACE
|
||||
@@ -2137,6 +2143,10 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) {
|
||||
CUDACHECKGOTO(hipGetDeviceProperties(&devProp, cudaDev), res, fail);
|
||||
cuCount = devProp.multiProcessorCount;
|
||||
archName = (char*)malloc(strlen(devProp.gcnArchName) + 1);
|
||||
if (archName == nullptr) {
|
||||
WARN("Failed to allocate memory for architecture name");
|
||||
goto fail;
|
||||
}
|
||||
strcpy(archName, devProp.gcnArchName);
|
||||
|
||||
timers[TIMER_INIT_KERNELS] = clockNano();
|
||||
@@ -2434,6 +2444,10 @@ static ncclResult_t envConfigOverride(ncclComm_t comm) {
|
||||
if (tmpNetName != NULL) {
|
||||
int netNameLen = strlen(tmpNetName) + 1;
|
||||
comm->config.netName = (char*)malloc(netNameLen);
|
||||
if (comm->config.netName == nullptr) {
|
||||
WARN("Failed to allocate memory for network name");
|
||||
return ncclSystemError;
|
||||
}
|
||||
memcpy((void*)comm->config.netName, tmpNetName, netNameLen);
|
||||
} else {
|
||||
comm->config.netName = NULL;
|
||||
|
||||
@@ -721,6 +721,10 @@ ncclResult_t mscclAlgoMetaXmlLoad(const char* xmlFilePath, struct mscclXmlNode*
|
||||
ncclResult_t mscclGetAlgoMetaFromXmlFile(const char* str, struct mscclAlgoMeta* algoMeta) {
|
||||
struct mscclXmlNode* node;
|
||||
node = (struct mscclXmlNode *)malloc(sizeof(struct mscclXmlNode));
|
||||
if (node == nullptr) {
|
||||
WARN("Failed to allocate memory for MSCCL XML node");
|
||||
return ncclSystemError;
|
||||
}
|
||||
NCCLCHECK(mscclAlgoMetaXmlLoad(str, node));
|
||||
|
||||
algoMeta->filePath = str;
|
||||
|
||||
@@ -21,9 +21,18 @@ const char* ncclDataTypeStr[ncclNumTypes] = {"i8", "u8", "i32
|
||||
void roctxAlloc(roctxPayloadInfo_t payloadInfo, const size_t numEntries) {
|
||||
// Allocate enough memory for numEntries in payloadEntries
|
||||
payloadInfo->payloadEntries = (roctxPayloadSchemaEntryInfo*)malloc(numEntries * sizeof(roctxPayloadSchemaEntryInfo));
|
||||
if (payloadInfo->payloadEntries == nullptr) {
|
||||
payloadInfo->message = nullptr;
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate memory for the message that will be constructed
|
||||
payloadInfo->message = (char*)malloc(MAX_MESSAGE_LENGTH * sizeof(char));
|
||||
if (payloadInfo->message == nullptr) {
|
||||
free(payloadInfo->payloadEntries);
|
||||
payloadInfo->payloadEntries = nullptr;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void roctxFree(roctxPayloadInfo_t payloadInfo) {
|
||||
@@ -139,4 +148,4 @@ roctx_scoped_range_in::~roctx_scoped_range_in() noexcept {
|
||||
#endif
|
||||
roctxFree(&payloadInfo);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -37,6 +37,11 @@ static void shmHandleInit(int fd, char* shmPath, size_t shmSize, size_t realShmS
|
||||
if (create) {
|
||||
int slen = strlen(shmPath);
|
||||
handle->shmPath = (char*)malloc(slen + 1);
|
||||
if (handle->shmPath == nullptr) {
|
||||
WARN("Failed to allocate memory for shared memory path");
|
||||
// handle->shmPath remains nullptr, caller should check
|
||||
return;
|
||||
}
|
||||
memcpy(handle->shmPath, shmPath, slen + 1);
|
||||
if (hptr) memset(hptr, 0, shmSize);
|
||||
} else {
|
||||
|
||||
@@ -40,6 +40,10 @@ ncclResult_t ncclCudaContextTrack(struct ncclCudaContext** out) {
|
||||
while (1) {
|
||||
if (p == nullptr) {
|
||||
p = (struct ncclCudaContext*)calloc(1, sizeof(struct ncclCudaContext));
|
||||
if (p == nullptr) {
|
||||
result = ncclSystemError;
|
||||
goto leave;
|
||||
}
|
||||
p->refCount = 1;
|
||||
p->hcontext = hcontext;
|
||||
p->next = cxtListHead;
|
||||
@@ -189,6 +193,10 @@ ncclResult_t ncclStrongStreamAcquire(
|
||||
cap = spare;
|
||||
if (cap == nullptr) {
|
||||
cap = (struct ncclStrongStreamCapture*)calloc(1, sizeof(struct ncclStrongStreamCapture));
|
||||
if (cap == nullptr) {
|
||||
ret = ncclSystemError;
|
||||
goto do_unlock;
|
||||
}
|
||||
CUDACHECKGOTO(cudaStreamCreateWithFlags(&cap->captureStream, cudaStreamNonBlocking), ret, do_unlock);
|
||||
}
|
||||
cap->graphId = graph.graphId;
|
||||
|
||||
@@ -96,6 +96,10 @@ static ncclResult_t expectedProxyResponseEnqueue(struct ncclProxyState* state, v
|
||||
|
||||
// Pre-alloc response buffer
|
||||
ex->respBuff = malloc(respSize);
|
||||
if (ex->respBuff == nullptr && respSize > 0) {
|
||||
free(ex);
|
||||
return ncclSystemError;
|
||||
}
|
||||
ex->respSize = respSize;
|
||||
ex->res = ncclInternalError;
|
||||
ex->done = false;
|
||||
@@ -1365,6 +1369,9 @@ ncclResult_t ncclPollProxyResponse(struct ncclComm* comm, struct ncclProxyConnec
|
||||
if (resp.opId != opId) {
|
||||
// Unexpected response, need to buffer the socket data
|
||||
respBuff = malloc(resp.respSize);
|
||||
if (respBuff == nullptr) {
|
||||
return ncclSystemError;
|
||||
}
|
||||
}
|
||||
assert(respBuff != NULL);
|
||||
NCCLCHECK(ncclSocketRecv(sock, respBuff, resp.respSize));
|
||||
@@ -1391,6 +1398,10 @@ ncclResult_t ncclProxyCallBlocking(struct ncclComm* comm, struct ncclProxyConnec
|
||||
// Alloc some memory to act as a handle
|
||||
ncclResult_t res = ncclSuccess;
|
||||
void* opId = malloc(1);
|
||||
if (opId == nullptr) {
|
||||
WARN("Failed to allocate proxy operation ID");
|
||||
return ncclSystemError;
|
||||
}
|
||||
|
||||
NCCLCHECKGOTO(ncclProxyCallAsync(comm, proxyConn, type, reqBuff, reqSize, respSize, opId), res, fail);
|
||||
|
||||
|
||||
@@ -1281,6 +1281,10 @@ ncclResult_t ncclCollnetGraphRegisterBuffer(struct ncclComm* comm, const void* u
|
||||
|
||||
if (*outRegBufFlag) {
|
||||
record = (struct ncclCollnetCleanupCallback*)malloc(sizeof(struct ncclCollnetCleanupCallback));
|
||||
if (record == nullptr) {
|
||||
WARN("Failed to allocate collnet cleanup callback");
|
||||
return ncclSystemError;
|
||||
}
|
||||
record->base.fn = cleanupCollnet;
|
||||
record->comm = comm;
|
||||
record->reg = regRecord;
|
||||
|
||||
@@ -2203,6 +2203,10 @@ ncclResult_t ncclIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
|
||||
assert(size > 0);
|
||||
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
|
||||
if (mhandleWrapper == nullptr) {
|
||||
WARN("Failed to allocate IB MR handle wrapper");
|
||||
return ncclSystemError;
|
||||
}
|
||||
for (int i = 0; i < base->vProps.ndevs; i++) {
|
||||
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
|
||||
struct ncclIbNetCommDevBase* devComm = ncclIbGetNetCommDevBase(base, i);
|
||||
|
||||
@@ -2335,6 +2335,10 @@ ncclResult_t rocmIbRegMrDmaBuf(void* comm, void* data, size_t size, int type, ui
|
||||
assert(size > 0);
|
||||
struct ncclIbNetCommBase* base = (struct ncclIbNetCommBase*) comm;
|
||||
struct ncclIbMrHandle* mhandleWrapper = (struct ncclIbMrHandle*) malloc(sizeof(struct ncclIbMrHandle));
|
||||
if (mhandleWrapper == nullptr) {
|
||||
WARN("Failed to allocate IB MR handle wrapper");
|
||||
return ncclSystemError;
|
||||
}
|
||||
for (int i = 0; i < base->vProps.ndevs; i++) {
|
||||
// Each ncclIbNetCommDevBase is at different offset in send and recv netComms
|
||||
struct ncclIbNetCommDevBase* devComm = rocmIbGetNetCommDevBase(base, i);
|
||||
|
||||
@@ -843,6 +843,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(
|
||||
if (*outRegBufUsed) {
|
||||
if (sendRegRecord) {
|
||||
sendRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
|
||||
if (sendRecord == nullptr) {
|
||||
WARN("Failed to allocate NVLS send cleanup callback");
|
||||
return ncclSystemError;
|
||||
}
|
||||
sendRecord->base.fn = cleanupNvls;
|
||||
sendRecord->reg = sendRegRecord;
|
||||
sendRecord->comm = comm;
|
||||
@@ -852,6 +856,10 @@ ncclResult_t ncclNvlsGraphRegisterBuffer(
|
||||
|
||||
if (recvRegRecord) {
|
||||
recvRecord = (struct ncclNvlsCleanupCallback*)malloc(sizeof(struct ncclNvlsCleanupCallback));
|
||||
if (recvRecord == nullptr) {
|
||||
WARN("Failed to allocate NVLS recv cleanup callback");
|
||||
return ncclSystemError;
|
||||
}
|
||||
recvRecord->base.fn = cleanupNvls;
|
||||
recvRecord->reg = recvRegRecord;
|
||||
recvRecord->comm = comm;
|
||||
|
||||
Referencia en una nueva incidencia
Block a user