diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a80d1a8af..7b24d0abcf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,8 +103,6 @@ set(CU_SOURCES
     src/collectives/device/broadcast.cu
     src/collectives/device/reduce_scatter.cu
     src/collectives/device/sendrecv.cu
-    src/collectives/device/all_to_all.cu
-    src/collectives/device/all_to_allv.cu
     src/collectives/device/functions.cu)
 
 set(CPP_SOURCES)
diff --git a/src/channel.cc b/src/channel.cc
index 635f07869b..12cc44238d 100644
--- a/src/channel.cc
+++ b/src/channel.cc
@@ -29,14 +29,12 @@ ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
 
   // Per-channel operation list.
   NCCLCHECK(ncclCudaHostCalloc(&channel->workFifo, NCCL_MAX_OPS));
-  NCCLCHECK(ncclCudaHostCalloc(&channel->a2avParams, comm->nRanks*NCCL_MAX_OPS*4));
   return ncclSuccess;
 }
 
 ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
   if (channel->id == -1) return ncclSuccess;
   // Operation list
-  NCCLCHECK(ncclCudaHostFree(channel->a2avParams));
   NCCLCHECK(ncclCudaHostFree(channel->workFifo));
 
   // Free Ring index to rank tables
diff --git a/src/collectives/all_to_all_api.cc b/src/collectives/all_to_all_api.cc
index 9ecc1c47e9..a58f42e1eb 100644
--- a/src/collectives/all_to_all_api.cc
+++ b/src/collectives/all_to_all_api.cc
@@ -12,22 +12,15 @@ NCCL_API(ncclResult_t, ncclAllToAll, const void* sendbuff, void* recvbuff, size_
   ncclComm_t comm, hipStream_t stream);
 ncclResult_t ncclAllToAll(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
   ncclComm_t comm, hipStream_t stream) {
-  if (comm->alltoallDisable) {
-    int nRanks;
-    NCCLCHECK(ncclCommCount(comm, &nRanks));
-    size_t rankOffset = count * ncclTypeSize(datatype);
-    if (count == 0) return ncclSuccess;
-    NCCLCHECK(ncclGroupStart());
-    for (int r=0; r<nRanks; r++) {
-      NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
-      NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
-    }
-    NCCLCHECK(ncclGroupEnd());
-    return ncclSuccess;
-  } else {
-    struct ncclInfo info = { ncclFuncAllToAll, "AllToAll",
-      sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALL_CHUNKSTEPS, ALLTOALL_SLICESTEPS };
-    return ncclEnqueueCheck(&info);
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  size_t rankOffset = count * ncclTypeSize(datatype);
+  if (count == 0) return ncclSuccess;
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nRanks; r++) {
+    NCCLCHECK(ncclSend(((char*)sendbuff)+r*rankOffset, count, datatype, r, comm, stream));
+    NCCLCHECK(ncclRecv(((char*)recvbuff)+r*rankOffset, count, datatype, r, comm, stream));
   }
+  NCCLCHECK(ncclGroupEnd());
+  return ncclSuccess;
 }
diff --git a/src/collectives/all_to_allv_api.cc b/src/collectives/all_to_allv_api.cc
index ca405fbf5b..b0ac549d43 100755
--- a/src/collectives/all_to_allv_api.cc
+++ b/src/collectives/all_to_allv_api.cc
@@ -14,32 +14,25 @@ NCCL_API(ncclResult_t, ncclAllToAllv, const void *sendbuff, const size_t sendcou
 ncclResult_t ncclAllToAllv(const void *sendbuff, const size_t sendcounts[], const size_t sdispls[],
     void *recvbuff, const size_t recvcounts[], const size_t rdispls[],
     ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream) {
-  if (comm->alltoallDisable) {
-    int nRanks;
-    NCCLCHECK(ncclCommCount(comm, &nRanks));
-    NCCLCHECK(ncclGroupStart());
-    for (int r=0; r<nRanks; r++) {
-      if (sendcounts[r]) NCCLCHECK(ncclSend(
-          ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
-          sendcounts[r],
-          datatype,
-          r,
-          comm,
-          stream));
-      if (recvcounts[r]) NCCLCHECK(ncclRecv(
-          ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
-          recvcounts[r],
-          datatype,
-          r,
-          comm,
-          stream));
-    }
-    NCCLCHECK(ncclGroupEnd());
-    return ncclSuccess;
-  } else {
-    struct ncclInfo info = { ncclFuncAllToAllv, "AllToAllv",
-      sendbuff, recvbuff, 0, datatype, ncclSum, 0, comm, stream, /* Args */
-      ALLTOALLV_CHUNKSTEPS, ALLTOALLV_SLICESTEPS, sendcounts, sdispls, recvcounts, rdispls };
-    return ncclEnqueueCheck(&info);
+  int nRanks;
+  NCCLCHECK(ncclCommCount(comm, &nRanks));
+  NCCLCHECK(ncclGroupStart());
+  for (int r=0; r<nRanks; r++) {
+    if (sendcounts[r]) NCCLCHECK(ncclSend(
+        ((char*)sendbuff) + sdispls[r]*ncclTypeSize(datatype),
+        sendcounts[r],
+        datatype,
+        r,
+        comm,
+        stream));
+    if (recvcounts[r]) NCCLCHECK(ncclRecv(
+        ((char*)recvbuff) + rdispls[r]*ncclTypeSize(datatype),
+        recvcounts[r],
+        datatype,
+        r,
+        comm,
+        stream));
   }
+  NCCLCHECK(ncclGroupEnd());
+  return ncclSuccess;
 }
diff --git a/src/collectives/device/all_to_all.cu b/src/collectives/device/all_to_all.cu
deleted file mode 100644
index 46d0486d67..0000000000
--- a/src/collectives/device/all_to_all.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_to_all.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_FUNC(AllToAll, RING, SIMPLE, Sum, int8_t);
diff --git a/src/collectives/device/all_to_all.h b/src/collectives/device/all_to_all.h
deleted file mode 100644
index 05927a2e8a..0000000000
--- a/src/collectives/device/all_to_all.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<class FUNC, typename T, int UNROLL>
-class ncclFunction<ncclFuncAllToAll, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
-  public:
-  __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
-    const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int nChannels = args->coll.nChannels;
-    struct ncclDevComm* comm = args->comm;
-    struct ncclChannel* channel = comm->channels+blockIdx.x;
-    struct ncclRing* ring = &channel->ring;
-    const ssize_t size = args->coll.count;
-    const int nranks = comm->nRanks;
-    const int bid = args->coll.bid;
-    const int rank = ring->devUserRanks[0];
-    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-    const int chunkSize = stepSize * ALLTOALL_CHUNKSTEPS;
-    const int peersPerChan = DIVUP(nranks, nChannels);
-    const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-
-    // Compute pointers
-    const T * __restrict__ thisInput = (const T*)args->sendbuff;
-    T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      for (int i = 0; i < peersPerChan; i++) {
-        if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-          (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-          continue;
-        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-        ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-        int nelem = min(realChunkSize, size-chunkOffset);
-        if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
-          if (tid < nthreads && thisInput != thisOutput) {
-            const T* sendbuff = thisInput+chunkOffset+rank*size;
-            T* recvbuff = thisOutput+chunkOffset+rank*size;
-            // local copy
-            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-          }
-        }
-      }
-    }
-
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
-        int nthreadsSplit = nthreads/2;
-        if (tid < nthreadsSplit ) {
-          int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-          ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 0, 1, 0, FUNC>
-            prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-            ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-            int nelem = min(realChunkSize, size-chunkOffset);
-            ssize_t send_offset = chunkOffset + peerSend*size;
-            prims.send(thisInput+send_offset, nelem);
-          }
-        } else {
-          int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-          ncclPrimitives<UNROLL, ALLTOALL_CHUNKSTEPS/ALLTOALL_SLICESTEPS, ALLTOALL_SLICESTEPS, T, 1, 0, 0, FUNC>
-            prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-            ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-            int nelem = min(realChunkSize, size-chunkOffset);
-            ssize_t recv_offset = chunkOffset + peerRecv*size;
-            prims.recv(thisOutput+recv_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-};
\ No newline at end of file
diff --git a/src/collectives/device/all_to_allv.cu b/src/collectives/device/all_to_allv.cu
deleted file mode 100644
index 49d883c530..0000000000
--- a/src/collectives/device/all_to_allv.cu
+++ /dev/null
@@ -1,11 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "all_to_allv.h"
-#include "common.h"
-#include "collectives.h"
-
-IMPL_COLL_FUNC(AllToAllv, RING, SIMPLE, Sum, int8_t);
diff --git a/src/collectives/device/all_to_allv.h b/src/collectives/device/all_to_allv.h
deleted file mode 100644
index dffa2e1d60..0000000000
--- a/src/collectives/device/all_to_allv.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*************************************************************************
- * Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
- *
- * See LICENSE.txt for license information
- ************************************************************************/
-
-#include "devcomm.h"
-#include "primitives.h"
-#include "collectives.h"
-
-template<class FUNC, typename T, int UNROLL>
-class ncclFunction<ncclFuncAllToAllv, NCCL_ALGO_RING, NCCL_PROTO_SIMPLE, FUNC, T, UNROLL> {
-  public:
-  __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {
-    const int tid = threadIdx.x;
-    const int nthreads = args->nThreads;
-    const int nChannels = args->a2av.nChannels;
-    struct ncclDevComm* comm = args->comm;
-    struct ncclChannel* channel = comm->channels+blockIdx.x;
-    struct ncclRing* ring = &channel->ring;
-    const ssize_t typesize = args->a2av.count;
-    const int nranks = comm->nRanks;
-    const int bid = args->a2av.bid;
-    const int rank = ring->devUserRanks[0];
-    const int stepSize = comm->buffSizes[NCCL_PROTO_SIMPLE] / (sizeof(T)*NCCL_STEPS);
-    const int chunkSize = stepSize * ALLTOALLV_CHUNKSTEPS;
-    const int peersPerChan = DIVUP(nranks, nChannels);
-    const ssize_t loopSize = (peersPerChan == 1 ? (nChannels/nranks)*(ssize_t)chunkSize : (ssize_t)chunkSize);
-
-    // Compute pointers
-    const T * __restrict__ thisInput = (const T*)args->sendbuff;
-    T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-    size_t* params = channel->a2avParams + nranks*4*args->index;
-    size_t *sendcounts = params;
-    size_t *sdispls = params + nranks;
-    size_t *recvcounts = params + nranks*2;
-    size_t *rdispls = params + nranks*3;
-    ssize_t size = sendcounts[rank]*typesize;
-    for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-      for (int i = 0; i < peersPerChan; i++) {
-        if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-          (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-          continue;
-        int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-        ALIGN_SIZE(realChunkSize, nthreads*sizeof(uint64_t)/sizeof(T));
-        ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-        int nelem = min(realChunkSize, size-chunkOffset);
-        if ((blockIdx.x*peersPerChan+i)%nranks == 0) {
-          if (tid < nthreads && thisInput != thisOutput) {
-            const T* sendbuff = thisInput+chunkOffset+sdispls[rank]*typesize;
-            T* recvbuff = thisOutput+chunkOffset+rdispls[rank]*typesize;
-            // local copy
-            ReduceOrCopyMulti<UNROLL, FUNC, T, 1, 1, 1, 1>(tid, nthreads, 1, &sendbuff, 1, &recvbuff, nelem);
-          }
-        }
-      }
-    }
-
-    for (int i = 0; i < peersPerChan; i++) {
-      if ((peersPerChan == 1 && blockIdx.x >= (nChannels/nranks)*nranks) ||
-        (peersPerChan > 1 && blockIdx.x*peersPerChan+i >= nranks))
-        continue;
-      if ((blockIdx.x*peersPerChan+i)%nranks != 0) {
-        int nthreadsSplit = nthreads/2;
-        if (tid < nthreadsSplit ) {
-          int peerSend = (rank+(blockIdx.x*peersPerChan)+i)%nranks;
-          ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 0, 1, 0, FUNC>
-            prims(tid, nthreadsSplit, NULL, &peerSend, NULL, stepSize, channel, comm, ncclShmem->ptrs, 0);
-          size = sendcounts[peerSend]*typesize;
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-            ALIGN_SIZE(realChunkSize, nthreadsSplit*sizeof(uint64_t)/sizeof(T));
-            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-            int nelem = min(realChunkSize, size-chunkOffset);
-            ssize_t send_offset = chunkOffset + sdispls[peerSend]*typesize;
-            prims.send(thisInput+send_offset, nelem);
-          }
-        } else {
-          int peerRecv = (2*nranks+rank-((blockIdx.x*peersPerChan)%nranks)-(i%nranks))%nranks;
-          ncclPrimitives<UNROLL, ALLTOALLV_CHUNKSTEPS/ALLTOALLV_SLICESTEPS, ALLTOALLV_SLICESTEPS, T, 1, 0, 0, FUNC>
-            prims(tid-nthreadsSplit, nthreads-nthreadsSplit, &peerRecv, NULL, NULL, stepSize, channel, comm, ncclShmem->ptrs, 1);
-          size = recvcounts[peerRecv]*typesize;
-          for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-            int realChunkSize = min(chunkSize, DIVUP(size-gridOffset, (peersPerChan == 1 ? (nChannels/nranks) : 1)));
-            ALIGN_SIZE(realChunkSize, (nthreads-nthreadsSplit)*sizeof(uint64_t)/sizeof(T));
-            ssize_t chunkOffset = gridOffset + (peersPerChan == 1 ? (bid/nranks)*realChunkSize : 0);
-            int nelem = min(realChunkSize, size-chunkOffset);
-            ssize_t recv_offset = chunkOffset + rdispls[peerRecv]*typesize;
-            prims.recv(thisOutput+recv_offset, nelem);
-          }
-        }
-      }
-    }
-  }
-};
\ No newline at end of file
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
index 80fdca2b63..6fd5ea28ae 100644
--- a/src/collectives/device/common.h
+++ b/src/collectives/device/common.h
@@ -106,9 +106,7 @@
   NCCL_FUNCS2B(AllGather), \
   NCCL_FUNCS2A(ReduceScatter), \
   NCCL_FUNCS2C(AllReduce), \
-  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t), \
-  NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t), \
-  NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t) }
+  NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t) }
 // [/RCCL]
 
 // Must be consistent with the ncclFuncSet enum
@@ -125,8 +123,6 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
   NCCL_FUNCS2A(ReduceScatter),
   NCCL_FUNCS2C(AllReduce),
   NCCL_FUNC_NAME(SendRecv, RING, SIMPLE, Sum, int8_t),
-  NCCL_FUNC_NAME(AllToAll, RING, SIMPLE, Sum, int8_t),
-  NCCL_FUNC_NAME(AllToAllv, RING, SIMPLE, Sum, int8_t),
 #endif
 };
 
@@ -176,9 +172,7 @@ void NCCL_CALL_FUNCTIONS(struct ncclWorkElem* const c) noexcept {
     else ncclFunction_AllGather_COLLNET_SIMPLE_Sum_int8_t(c);
   }
   else if (c->funcIndex < 1800) Caller<1080, 1800>::call(c);
-  else if (c->funcIndex == 1800) ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
-  else if (c->funcIndex == 1801) ncclFunction_AllToAll_RING_SIMPLE_Sum_int8_t(c);
-  else if (c->funcIndex == 1802) ncclFunction_AllToAllv_RING_SIMPLE_Sum_int8_t(c);
+  else ncclFunction_SendRecv_RING_SIMPLE_Sum_int8_t(c);
 }
 
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid) {
@@ -213,10 +207,6 @@ class ncclFunction {
     if (fIdx == FUNC_INDEX_P2P) { \
       comm->collTrace[pos].p2p.nThreads = w->p2p.nThreads; \
       comm->collTrace[pos].p2p.delta = (uint16_t)(w->p2p.delta); \
-    } else if (fIdx == FUNC_INDEX_A2AV) { \
-      comm->collTrace[pos].coll.nThreads = w->nThreads; \
-      comm->collTrace[pos].coll.bid = w->a2av.bid; \
-      comm->collTrace[pos].coll.nChannels = w->a2av.nChannels; \
     } else { \
       comm->collTrace[pos].coll.nThreads = w->nThreads; \
       comm->collTrace[pos].coll.bid = w->coll.bid; \
diff --git a/src/enqueue.cc b/src/enqueue.cc
index e331a86b47..002fb20db8 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -280,13 +280,6 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
   struct ncclComm* comm = info->comm;
   float minTime = 3600000000.0; // Hopefully no operation will take an hour to complete.
   // Find algorithm / protocol.
-  if (info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv) {
-    info->algorithm = NCCL_ALGO_RING;
-    info->protocol = NCCL_PROTO_SIMPLE;
-    info->nChannels = comm->nChannels;
-    info->nThreads = NCCL_MAX_NTHREADS;
-    return ncclSuccess;
-  }
   info->algorithm = -1;
   info->protocol = -1;
   int nAlgos = NCCL_NUM_ALGORITHMS;
@@ -348,9 +341,6 @@ static ncclResult_t getPatternInfo(struct ncclInfo* info) {
       info->pattern = ncclPatternRing; break;
     case ncclFuncAllReduce:
       info->pattern = info->algorithm == NCCL_ALGO_COLLNET ? ncclPatternCollTreeUp : info->algorithm == NCCL_ALGO_TREE ? ncclPatternTreeUpDown : ncclPatternRingTwice; break;
-    case ncclFuncAllToAll:
-    case ncclFuncAllToAllv:
-      info->pattern = ncclPatternAll; break;
     default:
       WARN("Unknown pattern for collective %d algorithm %d", info->coll, info->algorithm);
       return ncclInternalError;
@@ -372,9 +362,6 @@ static ncclResult_t getLoopInfo(struct ncclInfo* info) {
       info->nstepsPerLoop = info->comm->nRanks-1; info->nchunksPerLoop = info->comm->nRanks; break;
     case ncclPatternRingTwice:
       info->nstepsPerLoop = 2*(info->comm->nRanks-1); info->nchunksPerLoop = info->comm->nRanks; break;
-    case ncclPatternAll:
-      info->nstepsPerLoop = 1;
-      info->nchunksPerLoop = info->comm->nRanks; break;
     default:
       WARN("Unknown pattern %d", info->pattern);
       return ncclInternalError;
@@ -390,21 +377,12 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclWo
   NCCLCHECK(getPatternInfo(info));
   NCCLCHECK(getLoopInfo(info));
 
-  if ((info->coll == ncclFuncAllToAll || info->coll == ncclFuncAllToAllv)
-    && info->comm->topo->nodes[GPU].count == info->comm->topo->nRanks && (info->comm->topo->type & RCCL_TOPO_4P2H_ROME))
-    info->nChannels = info->comm->p2pnChannels;
-
   work->opCount = info->comm->opCount;
   work->sendbuff = info->sendbuff;
   work->recvbuff = info->recvbuff;
-  if (info->coll == ncclFuncAllToAllv)  {
-    work->a2av.count = info->count;
-    work->a2av.nChannels = info->nChannels;
-  } else {
-    work->coll.root = info->root;
-    work->coll.count = info->count;
-    work->coll.nChannels = info->nChannels;
-  }
+  work->coll.root = info->root;
+  work->coll.count = info->count;
+  work->coll.nChannels = info->nChannels;
   work->nThreads = info->nThreads;
 
   work->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
@@ -522,7 +500,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
 
   info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);
 
-  int nChannels = (info->coll == ncclFuncAllToAllv) ? work.a2av.nChannels : work.coll.nChannels;
+  int nChannels = work.coll.nChannels;
   int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;
 
   for (int bid=0; bid<nChannels*nSubChannels; bid++) {
@@ -539,11 +517,6 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
     if (proxyArgs.nsteps) NCCLCHECK(ncclProxySaveColl(&proxyArgs, info->pattern, info->root, info->comm->nRanks));
 
     info->comm->myParams->gridDim.x++;
-    if (info->coll == ncclFuncAllToAllv) {
-      work.a2av.bid = bid % work.a2av.nChannels;
-    } else {
-      work.coll.bid = bid % nChannels;
-    }
 
     // [RCCL] Setup pointers to where all the input/output pointers will be
     if (info->protocol == NCCL_PROTO_CLIQUE) {
@@ -551,16 +524,8 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
     }
     // [/RCCL]
 
-    struct ncclWork* w;
-    NCCLCHECK(getNextOp(channel, &w, &work));
-    if (info->coll == ncclFuncAllToAllv) {
-      struct ncclWorkElem* e = w->elems;
-      size_t* params = channel->a2avParams + info->comm->nRanks*4*e->index;
-      memcpy(params, info->sendcounts, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(params+info->comm->nRanks, info->sdispls, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(params+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
-      memcpy(params+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
-    }
+    work.coll.bid = bid % nChannels;
+    NCCLCHECK(getNextOp(channel, NULL, &work));
   }
   info->comm->opCount++;
   return ncclSuccess;
@@ -714,12 +679,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
     NCCLCHECKGOTO(ncclAsyncColl(info->comm), ret, end);
     NCCLCHECKGOTO(checkSetStream(info), ret, end);
 
-    if (info->coll == ncclFuncAllToAllv)
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
-        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
-        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
-    else if (info->coll != ncclFuncSendRecv)
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
         info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
@@ -741,12 +701,7 @@ end:
     NCCLCHECK(ArgsCheck(info));
     NCCLCHECK(checkSetStream(info));
 
-    if (info->coll == ncclFuncAllToAllv)
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p sendcounts %p sdispls %p recvbuff %p recvcounts %p rdispls %p datatype %d typesize %zi op %d root %d comm %p [nranks=%d] stream %p",
-        info->opName, info->comm->opCount, info->sendbuff, info->sendcounts, info->sdispls, info->recvbuff, info->recvcounts, info->rdispls,
-        info->datatype, info->count, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
-    else
-      INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
+    INFO(NCCL_COLL,"%s: opCount %lx sendbuff %p recvbuff %p count %zi datatype %d op %d root %d comm %p [nranks=%d] stream %p",
         info->opName, info->comm->opCount, info->sendbuff, info->recvbuff, info->count,
         info->datatype, info->op, info->root, info->comm, info->comm->nRanks, info->stream);
 
diff --git a/src/include/collectives.h b/src/include/collectives.h
index eef7a3cd16..ee4cc9b53c 100644
--- a/src/include/collectives.h
+++ b/src/include/collectives.h
@@ -9,12 +9,7 @@
 #define NCCL_COLLECTIVES_H_
 
 #define FUNC_INDEX_P2P (NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps)
-#define FUNC_INDEX_A2A (FUNC_INDEX_P2P+1)
-#define FUNC_INDEX_A2AV (FUNC_INDEX_P2P+2)
-
-#define FUNC_INDEX(func, redop, ncclType, al, pr) ((func >= NCCL_NUM_FUNCTIONS) \
-  ? (func-NCCL_NUM_FUNCTIONS+NCCL_NUM_FUNCTIONS*NCCL_NUM_ALGORITHMS*NCCL_NUM_PROTOCOLS*ncclNumTypes*ncclNumOps) \
-  : ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr)))
+#define FUNC_INDEX(func, redop, ncclType, al, pr) ((((((func)*ncclNumOps + (redop))*ncclNumTypes) + (ncclType))*NCCL_NUM_ALGORITHMS+(al))*NCCL_NUM_PROTOCOLS+(pr))
 
 #define NCCL_FUNC_NAME(func, algo, proto, redop, type) \
   ncclFunction_##func##_##algo##_##proto##_##redop##_##type
@@ -65,8 +60,6 @@
   DECL(ReduceScatter) \
   DECL(AllReduce) \
   DECL5(SendRecv, RING, SIMPLE, Sum, int8_t) \
-  DECL5(AllToAll, RING, SIMPLE, Sum, int8_t) \
-  DECL5(AllToAllv, RING, SIMPLE, Sum, int8_t) \
 
 DECL_ALL
 
@@ -88,12 +81,4 @@ DECL_ALL
 #define REDUCE_SLICESTEPS 1
 #define REDUCE_CHUNKSTEPS 1
 #define SENDRECV_SLICEFACTOR 1
-#define GATHER_SLICESTEPS 4
-#define GATHER_CHUNKSTEPS 4
-#define SCATTER_SLICESTEPS 4
-#define SCATTER_CHUNKSTEPS 4
-#define ALLTOALL_SLICESTEPS 4
-#define ALLTOALL_CHUNKSTEPS 4
-#define ALLTOALLV_SLICESTEPS 4
-#define ALLTOALLV_CHUNKSTEPS 4
 #endif
diff --git a/src/include/comm.h b/src/include/comm.h
index a507a5616f..cbdb66a9ea 100644
--- a/src/include/comm.h
+++ b/src/include/comm.h
@@ -163,7 +163,6 @@ struct ncclComm {
   int p2pRecvCount;
 
   // [RCCL]
-  bool alltoallDisable;            // RCCL AllToAll/Scatter/Gather API
   CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
   int rootPid;                     // Process ID of root
   // [/RCCL]
diff --git a/src/include/devcomm.h b/src/include/devcomm.h
index 06a2742acf..9997de7f7c 100644
--- a/src/include/devcomm.h
+++ b/src/include/devcomm.h
@@ -27,8 +27,8 @@
 
 
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
-typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncAllToAll, ncclFuncAllToAllv } ncclFunc_t;
-extern const char* ncclFuncStr[];
+typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv} ncclFunc_t;
+extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1];
 
 #define NCCL_NUM_ALGORITHMS 3 // Tree/Ring/CollNet
 #define NCCL_ALGO_TREE 0
@@ -191,11 +191,6 @@ struct ncclWorkElem {
       int32_t delta;
       uint16_t nThreads;
     } p2p;
-    struct {
-      size_t count;
-      uint8_t bid;
-      uint8_t nChannels;
-    } a2av;
     // [RCCL] Clique-based arguments
     //        NOTE: Follows same field structure as coll
     //              because nChannels is accessed from "coll" struct.
@@ -232,7 +227,6 @@ struct ncclChannel {
       struct ncclWork* workFifo;
       int workCount;
       uint64_t workFifoTail; // Only used by CPU
-      size_t* a2avParams;
 
 #ifdef ENABLE_PROFILING
       struct timeval tvs;
diff --git a/src/include/info.h b/src/include/info.h
index fa9402f350..6b71492bd1 100644
--- a/src/include/info.h
+++ b/src/include/info.h
@@ -20,8 +20,7 @@ typedef enum {
   ncclPatternTreeDown,
   ncclPatternTreeUpDown,
   ncclPatternCollTreeUp,
-  ncclPatternCollTreeDown,
-  ncclPatternAll
+  ncclPatternCollTreeDown
 } ncclPattern_t;
 
 // Used to pass NCCL call information between functions
@@ -40,11 +39,6 @@ struct ncclInfo {
   // Algorithm details
   int chunkSteps;
   int sliceSteps;
-  // For alltoallv
-  const size_t *sendcounts;
-  const size_t *sdispls;
-  const size_t *recvcounts;
-  const size_t *rdispls;
   // Computed later
   int algorithm;
   int protocol;
diff --git a/src/init.cc b/src/init.cc
index 1c7c81f7b8..3ebe870465 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -45,7 +45,7 @@ std::chrono::high_resolution_clock::time_point ncclEpoch;
 #define NCCL_GROUP_CUDA_STREAM 1 // CGMD: CUDA 9.0,9.1 Need to use an internal CUDA stream
 #endif
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 const char* ncclRedOpStr[ncclNumOps] = { "Sum", "Prod", "Max", "Min" };
@@ -168,7 +168,7 @@ void *ncclCommThreadMain(void *arg) {
   ncclComm_t comm = (ncclComm_t)arg;
   int head = comm->hostDevComm.collTraceHead;
   #define MAX_NAME_LENGTH 32
-  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_A2AV+1));
+  char* func_names = (char *)malloc(MAX_NAME_LENGTH*(FUNC_INDEX_P2P+1));
   for (int func = 0; func < NCCL_NUM_FUNCTIONS; func++) {
     for (int al = 0; al < NCCL_NUM_ALGORITHMS; al++) {
       for (int type = 0; type < ncclNumTypes; type++) {
@@ -182,10 +182,8 @@ void *ncclCommThreadMain(void *arg) {
       }
     }
   }
-  for (int func = NCCL_NUM_FUNCTIONS; func < NCCL_NUM_FUNCTIONS+3; func++) {
-    char* line = func_names+MAX_NAME_LENGTH*(FUNC_INDEX_P2P+func-NCCL_NUM_FUNCTIONS);
-    sprintf(line, "%s", ncclFuncStr[func]);
-  }
+  char* line = func_names+MAX_NAME_LENGTH*FUNC_INDEX_P2P;
+  sprintf(line, "%s", ncclFuncStr[NCCL_NUM_FUNCTIONS]);
   do {
     int tail = LOAD(comm->hostDevComm.collTraceTail)%COLLTRACE_NUM_ITEMS;
     int count;
@@ -223,7 +221,7 @@ void *ncclCommThreadMain(void *arg) {
             sprintf(line+offset, " KL HWID %8x %s ",
               td->data_0, func_names+MAX_NAME_LENGTH*fIdx);
             offset = strlen(line);
-            if (fIdx > FUNC_INDEX_A2AV)
+            if (fIdx > FUNC_INDEX_P2P)
               sprintf(line+offset, "ERROR bad function index %d", fIdx);
             else if (fIdx == FUNC_INDEX_P2P)
               sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
@@ -234,7 +232,7 @@ void *ncclCommThreadMain(void *arg) {
             if (fIdx != 0xffff) {
               sprintf(line+offset, " CE %s ", func_names+MAX_NAME_LENGTH*fIdx);
               offset = strlen(line);
-              if (fIdx > FUNC_INDEX_A2AV)
+              if (fIdx > FUNC_INDEX_P2P)
                 sprintf(line+offset, "ERROR bad function index %d", fIdx);
               else if (fIdx == FUNC_INDEX_P2P)
                 sprintf(line+offset, "nt %d dt %d", td->p2p.nThreads, td->p2p.delta);
@@ -318,8 +316,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
 #ifdef ENABLE_COLLTRACE
   STORE(&comm->hostDevComm.collTraceExit, 1);
   if (comm->hostDevComm.collTraceThread) pthread_join(comm->hostDevComm.collTraceThread, NULL);
-  CUDACHECK(hipHostFree((void *)comm->hostDevComm.collTrace));
-  CUDACHECK(hipHostFree((void *)comm->hostDevComm.collTraceTail));
+  NCCLCHECK(ncclCudaHostFree((void *)comm->hostDevComm.collTrace));
+  NCCLCHECK(ncclCudaHostFree((void *)comm->hostDevComm.collTraceTail));
 #endif
 
   free(comm->peerInfo);
@@ -361,7 +359,6 @@ static ncclResult_t commFree(ncclComm_t comm) {
   return ncclSuccess;
 }
 
-RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1);
 RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);
 RCCL_PARAM(P2pNetDisable, "P2P_NET_DISABLE", 0);
 
@@ -413,8 +410,8 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 #endif
 
 #ifdef ENABLE_COLLTRACE
-  CUDACHECK(hipHostMalloc((void**) &comm->hostDevComm.collTraceTail, sizeof(uint32_t), hipHostMallocMapped));
-  CUDACHECK(hipHostMalloc((void**) &comm->hostDevComm.collTrace, sizeof(struct ncclCollTrace) * COLLTRACE_NUM_ITEMS, hipHostMallocMapped));
+  NCCLCHECK(ncclCudaHostCalloc(&comm->hostDevComm.collTraceTail, 1));
+  NCCLCHECK(ncclCudaHostCalloc(&comm->hostDevComm.collTrace, COLLTRACE_NUM_ITEMS));
   memset(comm->hostDevComm.collTrace, 0, sizeof(struct ncclCollTrace) * COLLTRACE_NUM_ITEMS);
   comm->hostDevComm.collTraceExit = comm->hostDevComm.collTraceHead = *comm->hostDevComm.collTraceTail = 0;
   if ((ncclDebugLevel >= NCCL_LOG_INFO) && (ncclDebugMask & NCCL_COLL))
@@ -440,9 +437,6 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
   // Mark channels as non initialized.
   for (int c=0; c<MAXCHANNELS; c++) comm->channels[c].id = -1;
 
-  comm->alltoallDisable = true;
-  if (rcclParamAllToAllDisable() == 0) comm->alltoallDisable = false;
-
   *comret = comm;
   return ncclSuccess;
 }
@@ -929,7 +923,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     int fullCudaCompCap;
     int nChannels;
     int gcn;
-    int alltoallDisable;
     struct ncclGraphInfo tree;
     struct ncclGraphInfo ring;
     struct ncclGraphInfo collNet;
@@ -941,7 +934,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
   allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
-  allGather3Data[rank].alltoallDisable = comm->topo->nodes[GPU].count == comm->topo->nRanks ? 1 : comm->alltoallDisable;
 
   allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
     std::min(treeGraph.nChannels, ringGraph.nChannels);
@@ -991,11 +983,9 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   struct ncclTopoRanks** allTopoRanks;
   NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
   int gcn = allGather3Data[0].gcn;
-  int alltoallDisable = 0;
   for (int i=0; i<nranks; i++) {
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     gcn = std::min(allGather3Data[i].gcn, gcn);
-    alltoallDisable = std::max(allGather3Data[i].alltoallDisable, alltoallDisable);
     // Make sure we align all ranks so that the tuning is consistent across ranks
     treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
     treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -1015,11 +1005,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
     collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
   }
 
-  if (comm->alltoallDisable != alltoallDisable) {
-    comm->alltoallDisable = alltoallDisable;
-  }
-  INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
-
   // count NETs used by ring
   int nNets = 0;
   int nets[MAXCHANNELS*2];
@@ -1134,29 +1119,6 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
-  if (!alltoallDisable) {
-    int nc = comm->p2pnChannels;
-    for (int c=0; c<nc; c++) {
-      const int peersPerChan = DIVUP(nranks, nc);
-      for (int p=0; p<peersPerChan; p++) {
-        // first channel is reserved for self copy
-        if ((c*peersPerChan+p)%nranks == 0)
-          continue;
-        int peerSend = (rank+c*peersPerChan+p)%nranks;
-        int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
-        if (comm->channels[c].peers[peerSend].send.connected == 0) {
-          comm->connectSend[peerSend] |= (1<<c);
-          comm->connect = 1;
-        }
-        if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
-          comm->connectRecv[peerRecv] |= (1<<c);
-          comm->connect = 1;
-        }
-      }
-    }
-    NCCLCHECK(ncclTransportP2pSetup(comm, NULL));
-  }
-
   NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));
diff --git a/src/misc/argcheck.cc b/src/misc/argcheck.cc
index 741a3d6141..716cd53408 100644
--- a/src/misc/argcheck.cc
+++ b/src/misc/argcheck.cc
@@ -46,16 +46,11 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
   }
   // Type is OK, compute nbytes. Convert Allgather/Broadcast/P2P calls to chars.
   info->nBytes = info->count * ncclTypeSize(info->datatype);
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast || info->coll == ncclFuncAllToAll) {
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncBroadcast) {
     info->count = info->nBytes;
     info->datatype = ncclInt8;
   }
-  if (info->coll == ncclFuncAllToAllv) {
-    // Use count to store data type size for alltoallv
-    info->count = ncclTypeSize(info->datatype);
-    info->datatype = ncclInt8;
-  }
-  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter || info->coll == ncclFuncAllToAll) info->nBytes *= info->comm->nRanks; // count is per rank
+  if (info->coll == ncclFuncAllGather || info->coll == ncclFuncReduceScatter) info->nBytes *= info->comm->nRanks; // count is per rank
 
   if (info->op < 0 || info->op >= ncclNumOps) {
     WARN("%s : invalid reduction operation %d", info->opName, info->op);
diff --git a/test/test_AllToAll.cpp b/test/test_AllToAll.cpp
index a1b0c00510..17ab932978 100644
--- a/test/test_AllToAll.cpp
+++ b/test/test_AllToAll.cpp
@@ -62,6 +62,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_AllToAllMultiProcess.cpp b/test/test_AllToAllMultiProcess.cpp
index 842c1ee9e7..8b4ae6d417 100644
--- a/test/test_AllToAllMultiProcess.cpp
+++ b/test/test_AllToAllMultiProcess.cpp
@@ -55,6 +55,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_AllToAllv.cpp b/test/test_AllToAllv.cpp
index fe78b0f352..1d204fb30e 100644
--- a/test/test_AllToAllv.cpp
+++ b/test/test_AllToAllv.cpp
@@ -70,6 +70,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index 9e71e75b8a..ac980b2bef 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -115,6 +115,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1", "RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_Gather.cpp b/test/test_Gather.cpp
index bf917c739d..837ec30ea7 100644
--- a/test/test_Gather.cpp
+++ b/test/test_Gather.cpp
@@ -66,6 +66,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_GatherMultiProcess.cpp b/test/test_GatherMultiProcess.cpp
index 2ba477e168..c95f0ee21d 100644
--- a/test/test_GatherMultiProcess.cpp
+++ b/test/test_GatherMultiProcess.cpp
@@ -55,6 +55,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_Scatter.cpp b/test/test_Scatter.cpp
index aaca748217..82f4f9088b 100644
--- a/test/test_Scatter.cpp
+++ b/test/test_Scatter.cpp
@@ -66,6 +66,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,5,6,7,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/test/test_ScatterMultiProcess.cpp b/test/test_ScatterMultiProcess.cpp
index 0f12c84308..c732e4e91d 100644
--- a/test/test_ScatterMultiProcess.cpp
+++ b/test/test_ScatterMultiProcess.cpp
@@ -55,6 +55,6 @@ namespace CorrectnessTests
                                 testing::Values(2,3,4,8),
                                 // In-place or not
                                 testing::Values(false),
-                                testing::Values("RCCL_ALLTOALL_KERNEL_DISABLE=0", "RCCL_ALLTOALL_KERNEL_DISABLE=1")),
+                                testing::Values("")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
diff --git a/tools/topo_expl/utils.cpp b/tools/topo_expl/utils.cpp
index 0a82949eaf..d8a438bea6 100644
--- a/tools/topo_expl/utils.cpp
+++ b/tools/topo_expl/utils.cpp
@@ -30,7 +30,7 @@
 #include "model.h"
 #include "utils.h"
 
-const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+3] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv", "AllToAll", "AllToAllv" };
+const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+1] = { "Broadcast", "Reduce", "AllGather", "ReduceScatter", "AllReduce", "SendRecv" };
 const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS] = { "Tree", "Ring", "CollNet" };
 const char* ncclProtoStr[NCCL_NUM_PROTOCOLS] = { "LL", "LL128", "Simple" };
 
@@ -270,7 +270,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
     int fullCudaCompCap;
     int nChannels;
     int gcn;
-    int alltoallDisable;
     struct ncclGraphInfo tree;
     struct ncclGraphInfo ring;
     struct ncclGraphInfo collNet;
@@ -283,7 +282,6 @@ ncclResult_t initTransportsRank_1(struct ncclComm* comm, struct allGather1Data_t
   NCCLCHECK(ncclTopoIdToIndex(comm->topo, GPU, myInfo->busId, &idx));
   allGather3Data[rank].cudaCompCap = comm->topo->nodes[GPU].nodes[idx].gpu.cudaCompCap;
   allGather3Data[rank].gcn = comm->topo->nodes[GPU].nodes[idx].gpu.gcn;
-  allGather3Data[rank].alltoallDisable = comm->topo->nodes[NET].count? 1 : comm->alltoallDisable;
 
   allGather3Data[rank].nChannels = comm->nChannels = treeGraph.nChannels = ringGraph.nChannels =
     std::min(treeGraph.nChannels, ringGraph.nChannels);
@@ -450,8 +448,6 @@ ncclResult_t ncclTransportP2pSetup(struct ncclComm* comm, struct ncclTopoGraph*
 }
 
 
-RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 0);
-
 ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t *allGather3Data,
   struct ncclTopoGraph& treeGraph, struct ncclTopoGraph& ringGraph, struct ncclTopoGraph& collNetGraph) {
   int rank = comm->rank;
@@ -481,11 +477,9 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   struct ncclTopoRanks** allTopoRanks;
   NCCLCHECK(ncclCalloc(&allTopoRanks, comm->nRanks));
   int gcn = allGather3Data[0].gcn;
-  int alltoallDisable = 0;
   for (int i=0; i<nranks; i++) {
     allTopoRanks[i] = &allGather3Data[i].topoRanks;
     gcn = std::min(allGather3Data[i].gcn, gcn);
-    alltoallDisable = std::max(allGather3Data[i].alltoallDisable, alltoallDisable);
     // Make sure we align all ranks so that the tuning is consistent across ranks
     treeGraph.nChannels = ringGraph.nChannels = comm->nChannels = std::min(allGather3Data[i].nChannels, comm->nChannels);
     treeGraph.sameChannels = std::min(allGather3Data[i].tree.sameChannels, treeGraph.sameChannels);
@@ -505,11 +499,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
     collNetGraph.typeInter = std::min(allGather3Data[i].collNet.typeInter, collNetGraph.typeInter);
   }
 
-  if (comm->alltoallDisable != alltoallDisable) {
-    comm->alltoallDisable = alltoallDisable;
-  }
-  INFO(NCCL_INIT, "RCCL AllToAll(v)/Scatter/Gather kernels %s", comm->alltoallDisable ? "disabled" : "enabled");
-
   // count NETs used by ring
   int nNets = 0;
   int nets[MAXCHANNELS*2];
@@ -625,29 +614,6 @@ ncclResult_t initTransportsRank_3(struct ncclComm* comm, struct allGather3Data_t
   // Compute nChannels per peer for p2p
   NCCLCHECK(ncclTopoComputeP2pChannels(comm));
 
-  if (!alltoallDisable) {
-    int nc = comm->nChannels;
-    for (int c=0; c<nc; c++) {
-      const int peersPerChan = DIVUP(nranks, nc);
-      for (int p=0; p<peersPerChan; p++) {
-        // first channel is reserved for self copy
-        if ((c*peersPerChan+p)%nranks == 0)
-          continue;
-        int peerSend = (rank+c*peersPerChan+p)%nranks;
-        int peerRecv = (2*nranks+rank-(c*peersPerChan)%nranks-p)%nranks;
-        if (comm->channels[c].peers[peerSend].send.connected == 0) {
-          comm->connectSend[peerSend] |= (1<<c);
-          comm->connect = 1;
-        }
-        if (comm->channels[c].peers[peerRecv].recv.connected == 0) {
-          comm->connectRecv[peerRecv] |= (1<<c);
-          comm->connect = 1;
-        }
-      }
-    }
-    NCCLCHECK(ncclTransportP2pSetup(comm, NULL));
-  }
-
   //NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, intraRank0Comm));
 
   //if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));