diff --git a/projects/rccl/CMakeLists.txt b/projects/rccl/CMakeLists.txt
index 4204dc5ed2..0e27118a94 100644
--- a/projects/rccl/CMakeLists.txt
+++ b/projects/rccl/CMakeLists.txt
@@ -148,6 +148,9 @@ check_symbol_exists("hipEventDisableSystemFence" "hip/hip_runtime_api.h" HIP_EVE
 ### Check for hipDeviceMallocUncached support
 check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
 
+### Check for hipDeviceMallocContiguous support
+check_symbol_exists("hipDeviceMallocContiguous" "hip/hip_runtime_api.h" HIP_CONTIGUOUS_MEMORY)
+
 ### Check for indirect function call support
 if(ENABLE_IFC)
   if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.5.30201")
@@ -575,6 +578,9 @@ endif()
 if(${HIP_EVENT_DISABLE_FENCE})
   target_compile_definitions(rccl PRIVATE HIP_EVENT_DISABLE_FENCE)
 endif()
+if(${HIP_CONTIGUOUS_MEMORY})
+  target_compile_definitions(rccl PRIVATE HIP_CONTIGUOUS_MEMORY)
+endif()
 if(${hipcc_version_string} VERSION_GREATER_EQUAL "5.7.31920")
   target_compile_definitions(rccl PRIVATE HIP_UNCACHED_MEMORY)
   message(STATUS "HIP_UNCACHED_MEMORY enabled")
diff --git a/projects/rccl/src/clique/CliqueManager.cc b/projects/rccl/src/clique/CliqueManager.cc
index 7abfc67fda..368d99a797 100644
--- a/projects/rccl/src/clique/CliqueManager.cc
+++ b/projects/rccl/src/clique/CliqueManager.cc
@@ -190,7 +190,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
       hipIpcMemHandle_t handle;
       // Allocate fine-grained device memory on rank 0 and get IPC handle for it
       // Re-usable barrier consists of (globalCount / globalSense) pair of integers
-      NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
       if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
       {
         WARN("Unable to get IPC handle for barrier memory");
@@ -228,7 +228,7 @@ ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
     // First rank prepares fine-grained memory shared across ranks used for the two barrier variables
     if (m_rank == 0)
     {
-      NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, true), res, dropback);
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), nullptr, hipDeviceMallocFinegrained), res, dropback);
       // Prepare all barriers
       for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
       {
diff --git a/projects/rccl/src/include/alloc.h b/projects/rccl/src/include/alloc.h
index 3258202856..09f5386dda 100644
--- a/projects/rccl/src/include/alloc.h
+++ b/projects/rccl/src/include/alloc.h
@@ -160,19 +160,12 @@ static inline ncclResult_t ncclCuMemFree(void *ptr) {
 #endif
 
 template <typename T>
-ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, bool isFineGrain = false) {
+ncclResult_t ncclCudaMallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, unsigned int flags = hipDeviceMallocDefault) {
   ncclResult_t result = ncclSuccess;
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
   *ptr = nullptr;
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain) {
-#if defined(HIP_UNCACHED_MEMORY)
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
-#else
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-#endif
-  } else
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
 finish:
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
   if (*ptr == nullptr) WARN("Failed to CUDA malloc %ld bytes", nelem*sizeof(T));
@@ -182,7 +175,7 @@ finish:
 #define ncclCudaMalloc(...) ncclCudaMallocDebug( __FILE__, __LINE__, __VA_ARGS__)
 
 template <typename T>
-ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, bool isFineGrain = false) {
+ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t nelem, cudaStream_t sideStream = nullptr, unsigned int flags = hipDeviceMallocDefault) {
   ncclResult_t result = ncclSuccess;
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
   *ptr = nullptr;
@@ -190,15 +183,8 @@ ncclResult_t ncclCudaCallocDebug(const char *filefunc, int line, T** ptr, size_t
   // Need a side stream so as not to interfere with graph capture.
   cudaStream_t stream = sideStream;
   if (stream == nullptr)
-    CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  if (isFineGrain) {
-#if defined(HIP_UNCACHED_MEMORY)
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
-#else
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-#endif
-  } else
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+  CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
   CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
   CUDACHECKGOTO(cudaStreamSynchronize(stream), result, finish);
   if (sideStream == nullptr)
@@ -218,19 +204,12 @@ finish:
 #define ncclCudaCalloc(...) ncclCudaCallocDebug(__FILE__, __LINE__, __VA_ARGS__)
 
 template <typename T>
-ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, bool isFineGrain = false) {
+ncclResult_t ncclCudaCallocAsyncDebug(const char *filefunc, int line, T** ptr, size_t nelem, hipStream_t stream, unsigned int flags = hipDeviceMallocDefault) {
   ncclResult_t result = ncclSuccess;
   cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
   *ptr = nullptr;
   CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
-  if (isFineGrain) {
-#if defined(HIP_UNCACHED_MEMORY)
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocUncached), result, finish);
-#else
-    CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), hipDeviceMallocFinegrained), result, finish);
-#endif
-  } else
-    CUDACHECKGOTO(cudaMalloc(ptr, nelem*sizeof(T)), result, finish);
+  CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*sizeof(T), flags), result, finish);
   CUDACHECKGOTO(cudaMemsetAsync(*ptr, 0, nelem*sizeof(T), stream), result, finish);
   int dev;
   CUDACHECK(hipGetDevice(&dev));
diff --git a/projects/rccl/src/include/gdrwrap.h b/projects/rccl/src/include/gdrwrap.h
index f532a705e1..a791e050f0 100644
--- a/projects/rccl/src/include/gdrwrap.h
+++ b/projects/rccl/src/include/gdrwrap.h
@@ -176,8 +176,11 @@ static ncclResult_t ncclGdrCudaCalloc(T** ptr, T** devPtr, size_t nelem, void**
   // GDRCOPY Pinned buffer has to be a minimum of a GPU_PAGE_SIZE
   ALIGN_SIZE(mapSize, GPU_PAGE_SIZE);
   // GDRCOPY Pinned buffer has to be GPU_PAGE_SIZE aligned too
-  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, true));
-
+#if defined(HIP_UNCACHED_MEMORY)
+  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocUncached));
+#else
+  NCCLCHECK(ncclCudaCalloc(&devMem, mapSize+GPU_PAGE_SIZE-1, stream, hipDeviceMallocFinegrained));
+#endif
   gdr_mem_desc_t* md;
   NCCLCHECK(ncclCalloc(&md, 1));
   md->gdrDevMem = devMem;
diff --git a/projects/rccl/src/transport/coll_net.cc b/projects/rccl/src/transport/coll_net.cc
index 9f58017d48..fed72c6ff4 100644
--- a/projects/rccl/src/transport/coll_net.cc
+++ b/projects/rccl/src/transport/coll_net.cc
@@ -378,7 +378,11 @@ static ncclResult_t sharedBuffersInit(struct ncclCollNetSharedRes* collNet, int
   *size = collNet->size;
 
   if (cuda && collNet->cudaBuff == NULL) {
-    NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda));
+#if defined(HIP_UNCACHED_MEMORY)
+    NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
+#else
+    NCCLCHECK(ncclCudaCalloc(&collNet->cudaBuff, *size, nullptr, cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
+#endif
   }
   if (!cuda && collNet->hostBuff == NULL) {
     NCCLCHECK(ncclCudaHostCalloc(&collNet->hostBuff, *size));
diff --git a/projects/rccl/src/transport/net.cc b/projects/rccl/src/transport/net.cc
index cd8b2e8c11..2cdac7efd9 100644
--- a/projects/rccl/src/transport/net.cc
+++ b/projects/rccl/src/transport/net.cc
@@ -166,6 +166,10 @@ static ncclResult_t canConnect(int* ret, struct ncclTopoSystem* topo, struct ncc
 NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
 NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
 
+#if defined(HIP_CONTIGUOUS_MEMORY)
+RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0);
+#endif
+
 struct setupReq {
   int tpRank;
   int tpLocalRank;
@@ -521,7 +525,18 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
     if (sameProcess == 0 || ncclCuMemEnable()) {
       NCCLCHECK(ncclP2pAllocateShareableBuffer(state->size, &state->ipcDesc, (void**)&state->cudaBuff));
     } else {
-      NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr, cuda));
+#if defined(HIP_UNCACHED_MEMORY)
+#if defined(HIP_CONTIGUOUS_MEMORY)
+      NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
+        cuda ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
+#else
+      NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
+        cuda ? hipDeviceMallocUncached : hipDeviceMallocDefault));
+#endif
+#else
+      NCCLCHECK(ncclCudaCalloc(&state->cudaBuff, state->size, nullptr,
+        cuda ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
+#endif
     }
   }
   if (!cuda && state->hostBuff == NULL) {
@@ -750,7 +765,18 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
                                                  (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       } else {
-        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
+#if defined(HIP_UNCACHED_MEMORY)
+#if defined(HIP_CONTIGUOUS_MEMORY)
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
+#else
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
+#endif
+#else
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
+#endif
       }
       map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
     }
@@ -911,7 +937,18 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
         NCCLCHECK(ncclP2pAllocateShareableBuffer(map->mems[NCCL_NET_MAP_DEVMEM].size, &map->mems[NCCL_NET_MAP_DEVMEM].ipcDesc,
                                                  (void**)&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr));
       } else {
-        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr, resources->useGdr));
+#if defined(HIP_UNCACHED_MEMORY)
+#if defined(HIP_CONTIGUOUS_MEMORY)
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? (rcclParamNetContiguousMem() ? hipDeviceMallocContiguous : hipDeviceMallocUncached) : hipDeviceMallocDefault));
+#else
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? hipDeviceMallocUncached : hipDeviceMallocDefault));
+#endif
+#else
+        NCCLCHECK(ncclCudaCalloc(&map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr, map->mems[NCCL_NET_MAP_DEVMEM].size, nullptr,
+          resources->useGdr ? hipDeviceMallocFinegrained : hipDeviceMallocDefault));
+#endif
       }
       map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
     }
diff --git a/projects/rccl/src/transport/p2p.cc b/projects/rccl/src/transport/p2p.cc
index 5e5dc24b83..c412022276 100644
--- a/projects/rccl/src/transport/p2p.cc
+++ b/projects/rccl/src/transport/p2p.cc
@@ -218,7 +218,11 @@ ncclResult_t ncclP2pAllocateShareableBuffer(size_t size, ncclIpcDesc *ipcDesc, v
 #endif
   } else {
     // Allocate a CUDA buffer and generate an IPC handle for it
-    NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, true));
+#if defined(HIP_UNCACHED_MEMORY)
+    NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocUncached));
+#else
+    NCCLCHECK(ncclCudaCalloc((char **)ptr, size, nullptr, hipDeviceMallocFinegrained));
+#endif
     cudaError_t res = cudaIpcGetMemHandle(&ipcDesc->devIpc, *ptr);
     if (res != cudaSuccess) {
       WARN("cudaIpcGetMemHandle failed : %s", cudaGetErrorString(res));
@@ -593,7 +597,11 @@ static ncclResult_t p2pSendProxySetup(struct ncclProxyConnection* connection, st
     NCCLCHECK(ncclCalloc(&proxyInfo, 1));
     connection->transportResources = proxyInfo;
 
-    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, true));
+#if defined(HIP_UNCACHED_MEMORY)
+    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocUncached));
+#else
+    NCCLCHECK(ncclCudaCalloc(&proxyInfo->ceDevBuff, proxyState->buffSizes[NCCL_PROTO_SIMPLE], nullptr, hipDeviceMallocFinegrained));
+#endif
 
     char shmPath[PATH_MAX];
     shmPath[0] = '\0';