Merge pull request #299 from ROCmSoftwarePlatform/develop

Enable target id build [ROCm/rccl commit: 377b43470b]
2020-11-10 15:47:42 -07:00
@@ -5,11 +5,10 @@ def runCompileCommand(platform, project, jobName)
 {
    project.paths.construct_build_prefix()

-    String hipclangArgs = jobName.contains('hipclang') ? '--hip-clang' : ''
    def command = """#!/usr/bin/env bash
                set -x
                cd ${project.paths.project_build_prefix}
-                LD_LIBRARY_PATH=/opt/rocm/hcc/lib ${project.paths.build_command} ${hipclangArgs}
+                LD_LIBRARY_PATH=/opt/rocm/hcc/lib ${project.paths.build_command}
            """

    platform.runCommand(this,command)
@@ -22,7 +21,7 @@ def runTestCommand (platform, project)
    def command = """#!/usr/bin/env bash
                set -x
                cd ${project.paths.project_build_prefix}/build/release/test
-                NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
+                ${sudo} NCCL_DEBUG=INFO HSA_FORCE_FINE_GRAIN_PCIE=1 ./UnitTests --gtest_output=xml --gtest_color=yes
            """

   platform.runCommand(this, command)
@@ -55,7 +55,7 @@ ci: {

    propertyList = auxiliary.appendPropertyList(propertyList)

-    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['4gfx906','4gfx908']])]
+    def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([sles15sp1:['4gfx906'],centos8:['4gfx908'],centos7:['4gfx906'],ubuntu18:['4gfx906']])]
    
    jobNameList = auxiliary.appendJobNameList(jobNameList)
    
@@ -80,7 +80,7 @@ ci: {
    {
        properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])]))
        stage(urlJobName) {
-            runCI([ubuntu16:['rccl906']], urlJobName)
+            runCI([ubuntu18:['4gfx906']], urlJobName)
        }
    }
-}
+}
@@ -0,0 +1,53 @@
+# Change Log for RCCL
+
+Full documentation for RCCL is available at [https://rccl.readthedocs.io](https://rccl.readthedocs.io)
+
+## [Unreleased]
+### Added
+- Experimental support for clique-based kernels (opt in with RCCL_ENABLE_CLIQUE=1)
+- Clique-based kernels may offer better performance for smaller input sizes
+- Clique-based kernels are currently only enabled for AllReduce under a certain byte limit (controlled via RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT)
+### Optimizations
+- Performance improvements for Rome-based systems
+### Known issues
+- Clique-based kernels are currently experimental and have not been fully tested on all topologies.  By default, clique-based kernels are disabled if the detected topology is not supported (override with RCCL_FORCE_ENABLE_CLIQUE)
+- Clique-based kernels may hang if there are differences between environment variables set across ranks.
+- Clique-based kernels may fail if the input / output device pointers are not the base device pointers returned by hipMalloc.
+
+
+## [RCCL-2.7.8 for ROCm 3.9.0]
+### Added
+- Adding support for alltoallv RCCL kernel
+### Optimizations
+- Modifications to topology based on XGMI links
+### Known issues
+- None
+
+## [RCCL-2.7.6 for ROCm 3.8.0]
+### Added
+- Support for static library builds
+### Known issues
+- None
+
+## [RCCL-2.7.6 for ROCm 3.7.0]
+### Added
+- Updated to RCCL API version of 2.7.6
+- Added gather, scatter and all-to-all collectives
+
+## [RCCL-2.7.0 for ROCm 3.6.0]
+### Added
+- Updated to RCCL API version of 2.6.4
+
+## [RCCL-2.7.0 for ROCm 3.5.0]
+### Added
+- Compatibility with NCCL 2.6
+- Network interface improvements with API v3
+### Optimizations
+- Fixing issues and build time improvements for hip-clang
+- Network topology detection
+- Improved CPU type detection
+- Infiniband adaptive routing support
+### Changed
+- Switched to hip-clang as default compiler
+### Deprecated
+- Deprecated hcc build
@@ -11,7 +11,7 @@ set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")

 project(rccl CXX)

-set(AMDGPU_TARGETS gfx803;gfx900;gfx906;gfx908 CACHE STRING "List of specific machine types for library to target")
+set(AMDGPU_TARGETS gfx803;gfx900;gfx906:xnack-;gfx908:xnack- CACHE STRING "List of specific machine types for library to target")

 option(BUILD_TESTS "Build test programs" OFF)
 option(INSTALL_DEPENDENCIES "Force install dependencies" OFF)
@@ -126,6 +126,12 @@ set(CC_SOURCES
    src/collectives/all_to_all_api.cc
    src/collectives/all_to_allv_api.cc
    src/channel.cc
+    src/clique/CliqueManager.cc     # RCCL
+    src/clique/HandleCache.cc       # RCCL
+    src/clique/HandleShm.cc         # RCCL
+    src/clique/Hash.cc              # RCCL
+    src/clique/MsgQueue.cc          # RCCL
+    src/clique/ShmObject.cc         # RCCL
    src/misc/argcheck.cc
    src/misc/nvmlwrap_stub.cc
    src/misc/utils.cc
@@ -169,7 +175,7 @@ endforeach()

 if("${HIP_COMPILER}" MATCHES "clang")
  foreach(target ${AMDGPU_TARGETS})
-    target_compile_options(rccl PRIVATE --cuda-gpu-arch=${target} PRIVATE -fgpu-rdc PRIVATE -mno-xnack -Xarch_gfx906 -msram-ecc -Xarch_gfx908 -mno-sram-ecc)
+    target_compile_options(rccl PRIVATE --cuda-gpu-arch=${target} PRIVATE -fgpu-rdc)
  endforeach()
  target_link_libraries(rccl PRIVATE -fgpu-rdc)
  target_include_directories(rccl PRIVATE /opt/rocm/hsa/include)
@@ -223,17 +229,47 @@ rocm_export_targets(NAMESPACE
                    hip)

 set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip-rocclr (>= 3.5.0)")
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
 set(CPACK_RPM_PACKAGE_REQUIRES "hip-rocclr >= 3.5.0")
-
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")

+# Debian packaging extras.  The presence of /etc/debian_version or /etc/debconf.conf
+# is used as the signal that the build host is Debian-based; only then are the
+# copyright and compressed changelog files generated and installed under
+# /usr/share/doc/rccl, and the extended package description set.
+find_file (DEBIAN debian_version debconf.conf PATHS /etc)
+if(DEBIAN)
+  # Write copyright file
+  file(WRITE "${CMAKE_BINARY_DIR}/copyright"
+  "Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/
+Upstream-Name: rccl
+Source: https://github.com/ROCmSoftwarePlatform/rccl
+
+Files: *
+Copyright: (c) 2016-2020, NVIDIA CORPORATION. All rights reserved.
+Modifications Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+License: See LICENSE.txt for license information\n")
+  install(FILES "${CMAKE_BINARY_DIR}/copyright" DESTINATION /usr/share/doc/rccl)
+  # Write changelog file
+  find_program( date_executable date )
+  # `date -R` terminates its output with a newline; strip it so the trailer line
+  # written below ends with exactly one newline instead of a stray blank line.
+  execute_process(COMMAND "${date_executable}" -R
+    OUTPUT_VARIABLE TIMESTAMP
+    OUTPUT_STRIP_TRAILING_WHITESPACE)
+  file(WRITE "${CMAKE_BINARY_DIR}/changelog"
+  "rccl (${VERSION_STRING}-1) unstable; urgency=medium
+
+  * Initial release.
+
+ -- RCCL Maintainer <rccl-maintainer@amd.com>  ${TIMESTAMP}\n")
+  find_program( gzip_executable gzip )
+  # Run gzip directly rather than through `bash -c "<string>"`: arguments are then
+  # passed verbatim, so a build directory containing spaces no longer breaks the
+  # command, and bash is not required.  OUTPUT_FILE captures gzip's stdout.
+  execute_process(COMMAND "${gzip_executable}" -9 -c "${CMAKE_BINARY_DIR}/changelog"
+    WORKING_DIRECTORY ${CMAKE_BINARY_DIR} OUTPUT_FILE "${CMAKE_BINARY_DIR}/changelog.Debian.gz")
+  install(FILES "${CMAKE_BINARY_DIR}/changelog.Debian.gz" DESTINATION /usr/share/doc/rccl)
+  set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "ROCm Communication Collectives Library
+  Optimized primitives for collective multi-GPU communication")
+endif()
+
 rocm_create_package(
  NAME
  rccl
  DESCRIPTION
-  "Optimized primitives for collective multi-GPU communication"
+  "ROCm Communication Collectives Library"
  MAINTAINER
-  "<rccl-maintainer@amd.com>"
+  "RCCL Maintainer <rccl-maintainer@amd.com>"
  LDCONFIG)

 rocm_install_symlink_subdir(rccl)
@@ -56,7 +56,7 @@ master_doc = 'index'

 # General information about the project.
 project = u'RCCL'
-copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019 Advanced Mirco Devices'
+copyright = u'2015-2018, NVIDIA CORPORATION; Modifications Copyright 2019-2020 Advanced Mirco Devices'
 author = u'Advanced Mirco Devices'

 # The version info for the project you're documenting, acts as replacement for
@@ -12,6 +12,11 @@
 #include "socket.h"
 #include <unistd.h>
 #include <sys/types.h>
+// [RCCL]
+#include "clique/CliqueManager.h"
+#include "clique/CliqueShmNames.h"
+#include "clique/Hash.h"
+// [/RCCL]

 struct bootstrapNetComm {
  int fd;
@@ -163,7 +168,14 @@ static ncclResult_t setFilesLimit() {
  return ncclSuccess;
 }

-static void *bootstrapRoot(void* listenComm) {
+static void *bootstrapRoot(void* bootstrapRootStruct) { // [RCCL] Modified to include hash argument)
+  // [RCCL] Unpack bootstrapRootStruct
+  struct bootstrapRootStruct* rootStruct = (struct bootstrapRootStruct*) bootstrapRootStruct;
+  void* listenComm = rootStruct->listenComm;
+  unsigned long hash = rootStruct->hash;
+  int pid = getpid(); // sharing PID to other ranks for creating shared memory files for CliqueManager
+  // [/RCCL]
+
  struct extInfo info;
  ncclNetHandle_t *rankHandles = NULL;
  ncclNetHandle_t *rankHandlesRoot = NULL; // for initial rank <-> root information exchange
@@ -205,12 +217,19 @@ static void *bootstrapRoot(void* listenComm) {
  } while (c < nranks);
  TRACE(NCCL_INIT, "COLLECTED ALL %d HANDLES", nranks);

+  { // [RCCL] Initialize message queues / shared memory files
+    NCCLCHECKGOTO(CliqueManager::BootstrapRootInit(pid, hash), res, out);
+  } // [/RCCL]
+
  // Send the connect handle for the next rank in the AllGather ring
  for (int r=0; r<nranks; ++r) {
    int next = (r+1) % nranks;
    void *tmpSendComm;
    NCCLCHECKGOTO(bootstrapNetConnect(0, rankHandlesRoot+r, &tmpSendComm), res, out);
    NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, rankHandles+next, sizeof(ncclNetHandle_t)), res, out);
+    { // [RCCL] Send the root pid for shared file naming
+      NCCLCHECKGOTO(bootstrapNetSend(tmpSendComm, &pid, sizeof(int)), res, out);
+    } // [/RCCL]
    NCCLCHECKGOTO(bootstrapNetCloseSend(tmpSendComm), res, out);
  }
  TRACE(NCCL_INIT, "SENT OUT ALL %d HANDLES", nranks);
@@ -229,7 +248,14 @@ ncclResult_t bootstrapCreateRoot(ncclUniqueId* id, bool idFromEnv) {
  void* listenComm;
  NCCLCHECK(bootstrapNetListen(idFromEnv ? dontCareIf : 0, netHandle, &listenComm));
  pthread_t thread;
-  pthread_create(&thread, NULL, bootstrapRoot, listenComm);
+
+  // [RCCL] Use the ncclUniqueId to get a hash for bootstrap
+  struct bootstrapRootStruct* rootStruct = new bootstrapRootStruct;
+  rootStruct->hash = djb2Hash(id->internal);
+  rootStruct->listenComm = listenComm;
+  pthread_create(&thread, NULL, bootstrapRoot, (void *)rootStruct);
+  // [/RCCL]
+
  return ncclSuccess;
 }

@@ -267,9 +293,10 @@ struct extState {
  int rank;
  int nranks;
  int dev;
+  int rootPid;  // [RCCL] PID of root
 };

-ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState) {
+ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commState, int* rootPid) { // [RCCL] Adding rootPid
  ncclNetHandle_t* netHandle = (ncclNetHandle_t*) id;
  bool idFromEnv = getenv("NCCL_COMM_ID") != NULL;
  struct extState* state;
@@ -314,6 +341,9 @@ ncclResult_t bootstrapInit(ncclUniqueId * id, int rank, int nranks, void** commS
  ncclNetHandle_t extHandleNext;
  NCCLCHECK(bootstrapNetAccept(extBstrapListenCommRoot, &tmpRecvComm));
  NCCLCHECK(bootstrapNetRecv(tmpRecvComm, &extHandleNext, sizeof(extHandleNext)));
+  { // [RCCL] Receive PID from root
+    NCCLCHECK(bootstrapNetRecv(tmpRecvComm, rootPid, sizeof(int)));
+  } // [/RCCL]
  NCCLCHECK(bootstrapNetCloseRecv(tmpRecvComm));
  NCCLCHECK(bootstrapNetCloseListen(extBstrapListenCommRoot));

@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef ALLREDUCECLIQUEKERNEL_H
+#define ALLREDUCECLIQUEKERNEL_H
+
+#include "CliqueCommon.h"
+#include "devcomm.h"
+#include "reduce_kernel.h"
+#include "common_kernel.h"
+
+// Clique-based all-reduce device function.
+//
+// Template parameters:
+//   FUNC      - reduction functor forwarded to ReduceOrCopyMulti
+//   T         - element type
+//   NUM_RANKS - number of ranks in the clique (compile-time constant)
+//
+// The N input elements are partitioned into (NUM_RANKS * nBlocks) contiguous slices of
+// perBlockN elements.  The threadblock identified by (rank, blockId) handles slice number
+// (rank * nBlocks + blockId): it passes all NUM_RANKS input pointers as sources and all
+// NUM_RANKS output pointers as destinations for that slice to ReduceOrCopyMulti
+// (presumably an element-wise reduction with FUNC written to every destination - confirm
+// against common_kernel.h).  Afterwards block 0 takes part in the cross-GPU barrier.
+template <class FUNC, typename T, int NUM_RANKS>
+__device__ void AllReduceCliqueSplitKernel(struct CollectiveArgs* args)
+{
+  // Clique-specific kernel arguments
+  cliqueDevicePtrs_t* cliquePtrs = args->clique.ptrs;      // Collection of all input/output pointers across ranks in clique
+  size_t const N                 = args->clique.count;     // Total number of elements to reduce
+  int    const nBlocks           = args->clique.nChannels; // Total number of blocks assigned to this kernel (may be different than gridDim.x)
+  int    const blockId           = args->clique.bid;       // 0-indexed blockIdx for this threadblock (may be different than blockIdx.x)
+  int    const rank              = args->comm->rank;       // Current rank
+
+  // Each threadblock works independently of others on a subsection of the input
+  // First split evenly across ranks, while maintaining multiples of blocksize
+  size_t const perRankN       = RoundUp((N + NUM_RANKS - 1) / NUM_RANKS, blockDim.x);
+  size_t const perBlockN      = RoundUp((perRankN + nBlocks - 1) / nBlocks, blockDim.x);
+  // Both bounds are clamped to N, so slices past the end of the data are empty (blockN == 0)
+  size_t const currBlockStart = min((rank * nBlocks + blockId) * perBlockN, N);
+  size_t const currBlockStop  = min(currBlockStart + perBlockN, N);
+  size_t const blockN         = currBlockStop - currBlockStart;
+
+  if (blockN > 0)
+  {
+    // Prepare input / output subarrays
+    T const** inputs  = (T const**)cliquePtrs->inputs;
+    T**       outputs = (T      **)cliquePtrs->outputs;
+    T const* srcs[NUM_RANKS];
+    T*       dsts[NUM_RANKS];
+
+    // Offset every rank's input/output pointer to the start of this block's slice
+    #pragma unroll
+    for (int r = 0; r < NUM_RANKS; r++)
+    {
+      srcs[r] = inputs[r]  + currBlockStart;
+      dsts[r] = outputs[r] + currBlockStart;
+    }
+
+    // Perform the reduction
+    // NOTE: this #define is not scoped to the function; it stays defined for the rest of the translation unit
+    #define ALL_REDUCE_CLIQUE_UNROLL 2
+    ReduceOrCopyMulti<ALL_REDUCE_CLIQUE_UNROLL, FUNC, T, NUM_RANKS, NUM_RANKS, NUM_RANKS, NUM_RANKS>(
+      threadIdx.x, blockDim.x, NUM_RANKS, srcs, NUM_RANKS, dsts, blockN);
+  }
+
+  // Even if there was nothing for this GPU to do, it must participate in a barrier
+  // because other GPUs may be modifying this GPUs output buffer still
+  // Only block 0 joins the barrier, i.e. one arrival per rank (WaitForBarrier expects NUM_RANKS arrivals)
+  if (blockId == 0) WaitForBarrier<NUM_RANKS>(cliquePtrs->barrier);
+}
+
+#endif
@@ -0,0 +1,93 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef CLIQUE_COMMON_H
+#define CLIQUE_COMMON_H
+
+#include "nccl.h"
+#include <cstdint>
+
+#define MIN_CLIQUE_SIZE 2
+#define MAX_CLIQUE_SIZE 8
+
+// Pointers to the three integers that make up one sense-inversion barrier
+// (consumed by WaitForBarrier).
+typedef struct
+{
+  int* globalCount;   // Shared across GPUs: arrival counter, atomically incremented by each participant
+  int* globalSense;   // Shared across GPUs: sense value published by the last participant to arrive
+  int* localSense;    // Local to this GPU: flipped (0 <-> 1) on every barrier entry
+} gpuBarrier_t;
+
+// Per-collective bundle handed to clique kernels: every participating rank's
+// input/output pointer (indexed by rank, up to MAX_CLIQUE_SIZE entries) plus the
+// barrier used to synchronize the ranks.
+typedef struct
+{
+  // Input/output pointers from participating ranks
+  void const* inputs[MAX_CLIQUE_SIZE];
+  void*       outputs[MAX_CLIQUE_SIZE];
+
+  // Barrier variable
+  gpuBarrier_t barrier;
+} cliqueDevicePtrs_t;
+
+// Helper macro to launch an appropriate kernel by converting the runtime rank count
+// (args->comm->nRanks) into the NUM_RANKS template argument.
+//   kernelname - function template taking <FUNC, T, NUM_RANKS>
+//   FUNC, T    - forwarded unchanged as the first two template arguments
+//   args       - pointer expression; parenthesized where it is dereferenced so that
+//                any pointer-valued expression can be passed safely
+// Rank counts outside 2..8 match no case, so nothing is invoked for them.
+#define LAUNCH_CLIQUE_KERNEL(kernelname, FUNC, T, args)  \
+  {                                                      \
+    switch ((args)->comm->nRanks){                       \
+    case 2: kernelname<FUNC, T, 2>(args); break;         \
+    case 3: kernelname<FUNC, T, 3>(args); break;         \
+    case 4: kernelname<FUNC, T, 4>(args); break;         \
+    case 5: kernelname<FUNC, T, 5>(args); break;         \
+    case 6: kernelname<FUNC, T, 6>(args); break;         \
+    case 7: kernelname<FUNC, T, 7>(args); break;         \
+    case 8: kernelname<FUNC, T, 8>(args); break;         \
+    }                                                    \
+  }
+
+// Multi-GPU (on same node) barrier.  One thread per grid per GPU updates barrier / waits
+//
+// Only the thread with threadIdx.x == 0 of the calling block takes part; the other
+// threads return immediately and are not synchronized by this function.
+// The barrier completes once NUM_RANKS arrivals have incremented globalCount, so the
+// caller must ensure exactly one arrival per rank for each use of the barrier.
+template <int NUM_RANKS>
+__forceinline__ __device__ void WaitForBarrier(gpuBarrier_t const& barrier)
+{
+  if (threadIdx.x == 0)
+  {
+    // Sense inversion barrier
+    // Flip this GPU's sense; the barrier is released when globalSense equals the new value
+    *barrier.localSense = 1 - *barrier.localSense;
+    int localSense = *barrier.localSense;
+
+    // Register arrival; val is the number of arrivals so far, including this one
+    int val = __atomic_add_fetch(barrier.globalCount, 1, __ATOMIC_SEQ_CST);
+    if (val == NUM_RANKS)
+    {
+      // Last arrival resets barrier
+      // The count is cleared before the sense is published, so waiters released by
+      // the sense change observe a zeroed counter for the next use
+      __atomic_store_n(barrier.globalCount, 0, __ATOMIC_SEQ_CST);
+      __atomic_store_n(barrier.globalSense, localSense, __ATOMIC_SEQ_CST);
+    }
+    else
+    {
+      // Wait for all ranks to reach barrier
+      // Busy-wait (empty loop body) until the last arrival publishes the matching sense
+      while (__atomic_load_n(barrier.globalSense, __ATOMIC_SEQ_CST) != localSense);
+    }
+  }
+}
+
+// Rounds X up to the nearest multiple of Y (Y must be non-zero).
+__forceinline__ __host__ __device__ size_t RoundUp(size_t X, size_t Y)
+{
+  size_t const numChunks = (X + Y - 1) / Y;   // number of Y-sized chunks needed to cover X
+  return numChunks * Y;
+}
+
+#endif
@@ -0,0 +1,519 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "CliqueManager.h"
+#include "CliqueShmNames.h"
+#include "MsgQueue.h"
+
+#include "nccl.h"
+#include "core.h"
+
+#include "Hash.h"
+
+#include "AllReduceCliqueKernel.h"
+
+#include <hip/hip_runtime.h>
+#include <hsa/hsa_ext_amd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <chrono>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <thread>
+
+// Static storage shared by every CliqueManager instance in this process.
+// In CLIQUE_SINGLE_PROCESS mode each rank records its input/output pointers in
+// m_staticCliquePtrs (one entry per in-flight op slot), and rank 0 allocates
+// m_staticGpuBarrierMem to back the per-op barrier count/sense integers.
+cliqueDevicePtrs_t CliqueManager::m_staticCliquePtrs[NCCL_MAX_OPS]  = {};
+int*               CliqueManager::m_staticGpuBarrierMem             = NULL;
+
+// Define some environment variables that affect clique-based kernels
+// (each RCCL_PARAM provides the rcclParam<Name>() accessor used in this file)
+RCCL_PARAM(EnableClique, "ENABLE_CLIQUE", 0);                                  // Opt-in environment variable for clique-based kernels
+RCCL_PARAM(AllReduceCliqueByteLimit, "CLIQUE_ALLREDUCE_BYTE_LIMIT", 2097152);  // Max number of bytes to use clique-based kernels for all reduce
+RCCL_PARAM(AllReduceNumChannels,     "CLIQUE_ALLREDUCE_NCHANNELS", 4);         // Number of channels to use for all-reduce
+
+// Records the rank, clique size and mode.  No resources are acquired here; all
+// allocation is deferred to Init().
+//   rank       - index of this rank within the clique
+//   numRanks   - total number of ranks in the clique
+//   cliqueMode - CLIQUE_DISABLED / CLIQUE_SINGLE_PROCESS / CLIQUE_SINGLE_NODE
+CliqueManager::CliqueManager(int          const  rank,
+                             int          const  numRanks,
+                             cliqueMode_t const  cliqueMode) :
+  m_rank(rank),
+  m_numRanks(numRanks),
+  m_cliqueMode(cliqueMode),
+  m_init(false),
+  m_pinnedCliquePtrs(NULL),
+  m_fineGrainBarrierMem(NULL)
+{
+  // CleanUp() tests these pointers before freeing them and can run from Init()'s
+  // failure path before they have been assigned, so they must start out as NULL.
+  // (Assigned in the body rather than the initializer list so the result does not
+  // depend on the member declaration order in the header.)
+  m_gpuBarrierLocalSense = NULL;
+  m_ipcHandleSendCache   = NULL;
+  m_ipcHandleRecvCache   = NULL;
+}
+
+// Releases clique resources, but only if Init() marked this manager as initialized.
+CliqueManager::~CliqueManager()
+{
+  if (!m_init) return;
+  CleanUp();
+}
+
+// Releases everything acquired by Init() / DeclarePointers() and marks the manager
+// as uninitialized.  Does nothing when clique kernels are disabled.
+// Every released pointer is reset to NULL so that a repeated call cannot free or
+// close the same resource twice.
+void CliqueManager::CleanUp()
+{
+  if (m_cliqueMode == CLIQUE_DISABLED) return;
+
+  // Free variables that are shared between SINGLE_PROCESS / SINGLE_NODE
+  if (m_pinnedCliquePtrs)
+  {
+    hipHostFree(m_pinnedCliquePtrs);
+    m_pinnedCliquePtrs = NULL;
+  }
+  if (m_gpuBarrierLocalSense)
+  {
+    hipFree(m_gpuBarrierLocalSense);
+    m_gpuBarrierLocalSense = NULL;
+  }
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Release caches.  Each cache is guarded by its own pointer: previously the
+    // receive cache was deleted under a check of the send cache pointer.
+    if (m_ipcHandleSendCache)
+    {
+      delete m_ipcHandleSendCache;
+      m_ipcHandleSendCache = NULL;
+    }
+    if (m_ipcHandleRecvCache)
+    {
+      delete m_ipcHandleRecvCache;
+      m_ipcHandleRecvCache = NULL;
+    }
+
+    // Close shared memory
+    m_shmHandles.Close();
+    m_sharedCpuMemory.Close();
+    m_sharedIpcHandle.Close();
+
+    if (m_fineGrainBarrierMem)
+    {
+      // Rank 0 allocated the barrier memory; the other ranks only mapped it through IPC
+      if (m_rank == 0)
+        hipFree(m_fineGrainBarrierMem);
+      else
+        hipIpcCloseMemHandle(m_fineGrainBarrierMem);
+      m_fineGrainBarrierMem = NULL;
+    }
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    // The barrier memory is a static shared by all ranks in the process; rank 0 owns it
+    if (m_rank == 0 && m_staticGpuBarrierMem)
+    {
+      hipFree(m_staticGpuBarrierMem);
+      m_staticGpuBarrierMem = NULL;
+    }
+  }
+  m_init = false;
+}
+
+// One-time setup of the resources needed by clique-based kernels.
+//
+//   commId - communicator unique id; its `internal` bytes are hashed to derive the
+//            names of the shared-memory objects.  Must not be NULL.
+//   suffix - integer appended to the hash to form the shared-memory name suffix
+//
+// Returns ncclInvalidUsage for an out-of-range rank or a NULL commId.  Every other
+// outcome returns ncclSuccess: when a resource cannot be set up, clique mode is
+// switched to CLIQUE_DISABLED instead of reporting an error.
+// Calling Init() again after it has run is a no-op.
+ncclResult_t CliqueManager::Init(ncclUniqueId const* commId, int suffix)
+{
+  ncclResult_t res;
+
+  if (m_init) return ncclSuccess;
+  // NOTE(review): m_init is set before validation, so it stays true even when one of
+  // the early returns below reports ncclInvalidUsage
+  m_init = true;
+
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  // Check parameters
+  if (m_rank < 0 || m_rank >= m_numRanks)
+  {
+    WARN("Invalid rank specified.  Expected 0 <= %d < %d for CliqueManager", m_rank, m_numRanks);
+    return ncclInvalidUsage;
+  }
+  if (commId == NULL)
+  {
+    WARN("CommId should not be empty");
+    return ncclInvalidUsage;
+  }
+
+  // For now, opt-into clique based kernels via RCCL_ENABLE_CLIQUE env var
+  if (!rcclParamEnableClique())
+  {
+    INFO(NCCL_INIT, "Disabling clique-based kernels (did not find env var RCCL_ENABLE_CLIQUE)");
+    m_cliqueMode = CLIQUE_DISABLED;
+    return ncclSuccess;
+  }
+
+  // Allocate pinned CPU memory for holding clique pointers, which kernels will have access to
+  // (one cliqueDevicePtrs_t per op slot; slots are indexed by opCount % NCCL_MAX_OPS)
+  if (hipHostMalloc(&m_pinnedCliquePtrs, sizeof(cliqueDevicePtrs_t) * NCCL_MAX_OPS) != hipSuccess)
+  {
+    WARN("Unable to allocated pinned host memory for clique pointers.  Disabling clique-based kernels");
+    m_cliqueMode = CLIQUE_DISABLED;
+    m_init = true;
+    return ncclSuccess;
+  }
+
+  // Name suffix for shared-memory objects: "<hash of commId>_<suffix>"
+  unsigned long hash = djb2Hash(commId->internal);
+  std::string shmSuffix = std::to_string(hash) + "_" + std::to_string(suffix);
+
+  // Allocate sense barrier variable on local GPU
+  NCCLCHECKGOTO(ncclCudaCalloc(&m_gpuBarrierLocalSense, NCCL_MAX_OPS * sizeof(int)), res, dropback);
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Initialize shared memory file for IPC handles (based on commId hash)
+    m_shmHandles = NcclIpcHandleShm(m_rank, m_numRanks, hash, NUM_HANDLES_PER_RANK, NCCL_MAX_OPS, shmSuffix);
+    NCCLCHECKGOTO(m_shmHandles.Open(), res, dropback);
+
+    // Initialize IPC caches
+    m_ipcHandleSendCache = new NcclIpcHandleSendCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS);
+    m_ipcHandleRecvCache = new NcclIpcHandleRecvCache(m_numRanks * NUM_HANDLES_PER_RANK * NCCL_MAX_OPS,
+                                                      100,
+                                                      hipIpcMemHandleHash,
+                                                      hipIpcMemHandleEqual);
+
+    // Initialize shared object for GPU barrier IPC handle
+    m_sharedIpcHandle = ShmObject<hipIpcMemHandle_t>(std::max(4096LU, sizeof(hipIpcMemHandle_t)),
+                                                     CliqueShmNames["Barriers"] + shmSuffix,
+                                                     m_rank,
+                                                     m_numRanks,
+                                                     hash);
+    NCCLCHECKGOTO(m_sharedIpcHandle.Open(), res, dropback);
+
+    if (m_rank == 0)
+    {
+      hipIpcMemHandle_t handle;
+      // Allocate fine-grained device memory on rank 0 and get IPC handle for it
+      // Re-usable barrier consists of (globalCount / globalSense) pair of integers
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_fineGrainBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
+      if (hipIpcGetMemHandle(&handle, m_fineGrainBarrierMem) != hipSuccess)
+      {
+        WARN("Unable to get IPC handle for barrier memory");
+        goto dropback;
+      }
+      // Write IPC handle to shared memory for other ranks to receive
+      // (the other ranks open it lazily in DeclarePointers)
+      *m_sharedIpcHandle.Get() = handle;
+
+      // Set up global count/sense for first rank
+      // Layout: counts occupy [0, NCCL_MAX_OPS), senses occupy [NCCL_MAX_OPS, 2 * NCCL_MAX_OPS)
+      m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
+      m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
+    }
+
+    // Initialize shared CPU memory to be used for barrier variables
+    m_sharedCpuMemory = ShmObject<int32_t>(2 * sizeof(int32_t),
+                                           CliqueShmNames["SharedCounters"] + shmSuffix,
+                                           m_rank,
+                                           m_numRanks,
+                                           hash);
+    NCCLCHECKGOTO(m_sharedCpuMemory.Open(), res, dropback);
+
+    // Split up the shared CPU memory for barrier counters / global sense
+    m_cpuBarrierGlobalCount = &m_sharedCpuMemory.Get()[0];
+    m_cpuBarrierGlobalSense = &m_sharedCpuMemory.Get()[1];
+
+    // Initialize CPU barriers
+    // (only rank 0 writes the shared values; every rank resets its own local sense)
+    if (m_rank == 0)
+    {
+      *m_cpuBarrierGlobalCount = 0;
+      *m_cpuBarrierGlobalSense = 0;
+    }
+    m_cpuBarrierLocalSense = 0;
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    // First rank prepares fine-grained memory shared across ranks used for the two barrier variables
+    if (m_rank == 0)
+    {
+      NCCLCHECKGOTO(ncclCudaCalloc(&m_staticGpuBarrierMem, NCCL_MAX_OPS * 2 * sizeof(int), true), res, dropback);
+
+      // Prepare all barriers
+      // (same count/sense layout as above, recorded in the process-wide static table)
+      for (int opIndex = 0; opIndex < NCCL_MAX_OPS; opIndex++)
+      {
+        m_staticCliquePtrs[opIndex].barrier.globalCount = &m_staticGpuBarrierMem[opIndex];
+        m_staticCliquePtrs[opIndex].barrier.globalSense = &m_staticGpuBarrierMem[opIndex + NCCL_MAX_OPS];;
+      }
+    }
+  }
+
+
+  m_init = true;
+  INFO(NCCL_INIT, "Clique-based kernels enabled (mode %d)", m_cliqueMode);
+  return ncclSuccess;
+
+dropback:
+  // Failure path: release whatever was acquired, then fall back to non-clique kernels.
+  // CleanUp() must run before the mode is changed because it returns early when the
+  // mode is CLIQUE_DISABLED.
+  // NOTE: This currently assumes that all ranks will fail the same way
+  //       Additional support is required to handle cases when some processes succeed while others fail
+  WARN("Unable to initialize shared memory. Disabling clique-based kernels");
+  CleanUp();
+  m_cliqueMode = CLIQUE_DISABLED;
+  return ncclSuccess;
+}
+
+// Reports whether a collective should be routed to the clique-based kernels.
+// Only AllReduce qualifies, and only when its total payload (count * element size)
+// does not exceed the RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT parameter.  The reduction
+// operator `op` is not consulted.
+bool CliqueManager::IsSupported(ncclFunc_t const coll,
+                                size_t const count,
+                                ncclDataType_t const datatype,
+                                ncclRedOp_t const op) const
+{
+  if (m_cliqueMode != CLIQUE_DISABLED)
+  {
+    // Size filter applied per collective type
+    size_t const payloadBytes = count * ncclTypeSize(datatype);
+    bool   const isAllReduce  = (coll == ncclCollAllReduce);
+    return isAllReduce && (payloadBytes <= rcclParamAllReduceCliqueByteLimit());
+  }
+
+  return false;
+}
+
+// Publishes this rank's input/output pointers for the collective identified by opCount
+// so that the other ranks in the clique can pick them up in WaitForPointers().
+//
+//   opCount   - collective sequence number; mapped onto a slot via opCount % NCCL_MAX_OPS
+//   inputPtr  - this rank's input device pointer
+//   outputPtr - this rank's output device pointer
+//
+// Returns ncclSuccess when disabled or on success, ncclInvalidUsage when called before
+// Init(); errors from the IPC / shared-memory helpers are propagated by the CHECK macros.
+ncclResult_t CliqueManager::DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr)
+{
+  // Do nothing if disabled
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  if (!m_init)
+  {
+    WARN("CliqueManager must be initialized before use");
+    return ncclInvalidUsage;
+  }
+
+  int const opIndex = opCount % NCCL_MAX_OPS;
+
+  // Add opIndex to queue of in-progress collectives
+  // NOTE(review): the slot is queued before the fallible calls below, so it stays
+  // queued even if one of them returns early with an error
+  m_inProgress.push(opIndex);
+
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Get fine-grained device memory if not already done
+    // (rank 0 allocates it in Init(); a rank that has not mapped it yet opens the
+    //  IPC handle that rank 0 wrote to shared memory)
+    if (m_fineGrainBarrierMem == NULL)
+    {
+      hipIpcMemHandle_t handle = *m_sharedIpcHandle.Get();
+      CUDACHECK(hipIpcOpenMemHandle((void**)&m_fineGrainBarrierMem, handle, hipIpcMemLazyEnablePeerAccess));
+
+      // Prepare global count/sense barrier variables used the ipc-shared gpu device memory
+      m_gpuBarrierGlobalCount = &m_fineGrainBarrierMem[0];
+      m_gpuBarrierGlobalSense = &m_fineGrainBarrierMem[NCCL_MAX_OPS];
+    }
+
+    // One (IPC handle, size_t) pair per pointer; the code below fills index 0 for the
+    // input and index 1 for the output
+    std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(NUM_HANDLES_PER_RANK);
+
+    // Get IPC handles for input/output pointers from cache
+    NCCLCHECK(CheckCacheForPtr(const_cast<void*>(inputPtr), m_ipcHandleSendCache, m_rank, &handles[0]));
+    NCCLCHECK(CheckCacheForPtr(outputPtr                  , m_ipcHandleSendCache, m_rank, &handles[1]));
+
+    // Prepare barrier pointers (done after the IpcOpenMemory)
+    m_pinnedCliquePtrs[opIndex].barrier.globalCount = &m_gpuBarrierGlobalCount[opIndex];
+    m_pinnedCliquePtrs[opIndex].barrier.globalSense = &m_gpuBarrierGlobalSense[opIndex];
+    m_pinnedCliquePtrs[opIndex].barrier.localSense  = &m_gpuBarrierLocalSense[opIndex];
+
+    // Write IPC handles to shared memory for given rank / opCount
+    NCCLCHECK(m_shmHandles.WriteHandles(opIndex, handles));
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    // Store this rank's input/output pointers into static member
+    // (all ranks live in the same process, so raw pointers can be shared directly)
+    m_staticCliquePtrs[opIndex].inputs[m_rank]  = inputPtr;
+    m_staticCliquePtrs[opIndex].outputs[m_rank] = outputPtr;
+  }
+
+  return ncclSuccess;
+}
+
+// Chooses how many channels a clique-based collective should use.
+//
+//   coll             - collective type
+//   count, datatype,
+//   op               - currently not consulted; kept for interface stability
+//   totalNumChannels - upper bound on the number of channels available
+//   numChannelstoUse - [out] AllReduce: the smaller of the
+//                      RCCL_CLIQUE_ALLREDUCE_NCHANNELS parameter and totalNumChannels;
+//                      any other collective: 1
+//
+// Always returns ncclSuccess.
+ncclResult_t CliqueManager::GetNumChannelsToUse(ncclFunc_t const coll,
+                                                size_t const count,
+                                                ncclDataType_t const datatype,
+                                                ncclRedOp_t const op,
+                                                int const totalNumChannels,
+                                                uint8_t* numChannelstoUse)
+{
+  if (coll == ncclCollAllReduce) {
+    *numChannelstoUse = std::min((int)rcclParamAllReduceNumChannels(), totalNumChannels);
+  } else {
+    *numChannelstoUse = 1;
+  }
+
+  return ncclSuccess;
+}
+
+
+
+// Fills in the clique-specific fields of a collective's kernel arguments:
+// the pinned pointer slot for this op and the channel count.
+// Returns ncclSuccess (also when disabled) or ncclInvalidUsage if Init() has not run.
+ncclResult_t CliqueManager::SetCliqueCollectiveArgs(CollectiveArgs* args)
+{
+  if (m_cliqueMode != CLIQUE_DISABLED)
+  {
+    if (!m_init)
+    {
+      WARN("CliqueManager must be initialized before use");
+      return ncclInvalidUsage;
+    }
+
+    // Point the kernel at this op's slot; the slot's contents are only filled in
+    // later (the pointers are not ready at this stage)
+    int const slot = args->opCount % NCCL_MAX_OPS;
+    args->clique.ptrs = m_pinnedCliquePtrs + slot;
+
+    // Channel count comes straight from the RCCL_CLIQUE_ALLREDUCE_NCHANNELS parameter
+    args->clique.nChannels = rcclParamAllReduceNumChannels();
+  }
+
+  return ncclSuccess;
+}
+
+// Resolves the pointers of every collective queued by DeclarePointers() and stores
+// them in the pinned host slots that the kernels read.  The in-progress queue is
+// drained in FIFO order.
+//
+// Returns ncclSuccess when disabled, when nothing is queued, or on success;
+// ncclInvalidUsage when called before Init(); helper errors are propagated by NCCLCHECK.
+ncclResult_t CliqueManager::WaitForPointers()
+{
+  // Do nothing if disabled
+  if (m_cliqueMode == CLIQUE_DISABLED) return ncclSuccess;
+
+  if (!m_init)
+  {
+    WARN("CliqueManager must be initialized before use");
+    return ncclInvalidUsage;
+  }
+
+  // Do nothing if there are no outstanding clique-kernels
+  if (m_inProgress.empty()) return ncclSuccess;
+
+  // Copy clique device pointers to pinned device memory
+  if (m_cliqueMode == CLIQUE_SINGLE_NODE)
+  {
+    // Wait for all ranks to arrive
+    // (zero-argument call: presumably the CPU-side barrier member built on the shared
+    //  counters set up in Init(), not the device-side WaitForBarrier template - confirm)
+    WaitForBarrier();
+
+    // Scratch buffer reused for every op: NUM_HANDLES_PER_RANK handles for each rank
+    int numHandles = m_numRanks * NUM_HANDLES_PER_RANK;
+    std::vector<std::pair<hipIpcMemHandle_t,size_t>> handles(numHandles);
+
+    while (!m_inProgress.empty())
+    {
+      int const opIndex = m_inProgress.front();
+      m_inProgress.pop();
+
+      // Collect the ready handles from shared memory and convert them to device pointers
+      NCCLCHECK(m_shmHandles.ReadHandles(opIndex, handles));
+      for (int i = 0; i < m_numRanks; i++)
+      {
+        // For rank i: entry (i * NUM_HANDLES_PER_RANK) is the input handle, the next one the output
+        void *input;
+        NCCLCHECK(CheckCacheForHandle(handles[i * NUM_HANDLES_PER_RANK],
+                                      m_ipcHandleRecvCache, &input));
+        m_pinnedCliquePtrs[opIndex].inputs[i] = const_cast<const void *>(input);
+
+        NCCLCHECK(CheckCacheForHandle(handles[(i * NUM_HANDLES_PER_RANK) + 1],
+                                      m_ipcHandleRecvCache, &m_pinnedCliquePtrs[opIndex].outputs[i]));
+      }
+    }
+  }
+  else if (m_cliqueMode == CLIQUE_SINGLE_PROCESS)
+  {
+    while (!m_inProgress.empty())
+    {
+      int const opIndex = m_inProgress.front();
+      m_inProgress.pop();
+
+      // Copy from static memory to pinned host memory and set local sense
+      // (the static entry carries the shared pointers and global barrier addresses;
+      //  localSense is per-rank, so it is patched in after the copy)
+      memcpy(&m_pinnedCliquePtrs[opIndex], &m_staticCliquePtrs[opIndex], sizeof(cliqueDevicePtrs_t));
+      m_pinnedCliquePtrs[opIndex].barrier.localSense = &m_gpuBarrierLocalSense[opIndex];
+    }
+  }
+  return ncclSuccess;
+}
+
+// Renders the first four bytes of an IPC handle as eight upper-case hex digits.
+std::string HandleToString(hipIpcMemHandle_t handle)
+{
+  static char const kHexDigits[] = "0123456789ABCDEF";
+  std::string text;
+  for (int byteIdx = 0; byteIdx < 4; ++byteIdx)
+  {
+    unsigned char const byte = static_cast<unsigned char>(handle.reserved[byteIdx]);
+    text.push_back(kHexDigits[byte >> 4]);    // high nibble
+    text.push_back(kHexDigits[byte & 0x0F]);  // low nibble
+  }
+  return text;
+}
+
+
+// Produces the (IPC handle, offset) pair describing a device pointer.
+//
+// devPtr     - device pointer to describe
+// cache      - send-side cache mapping allocation base addresses to IPC handles
+// rank       - not used by this function
+// handlePair - output: first = IPC handle of the allocation's base address,
+//              second = byte offset of devPtr from that base address
+//
+// Returns ncclInvalidArgument when the pointer information cannot be queried;
+// a failure of hipIpcGetMemHandle is reported through CUDACHECK.
+ncclResult_t CliqueManager::CheckCacheForPtr(void* devPtr,
+                                             NcclIpcHandleSendCache* cache,
+                                             int rank,
+                                             std::pair<hipIpcMemHandle_t, size_t>* handlePair)
+{
+  // Get the base address for this device allocation
+  hsa_status_t status;
+  hsa_amd_pointer_info_t info;
+  info.size = sizeof(hsa_amd_pointer_info_t);
+  status = hsa_amd_pointer_info(devPtr, &info, NULL, NULL, NULL);
+  if (status != HSA_STATUS_SUCCESS) {
+    WARN("Unable to get pointer information for %p", devPtr);
+    return ncclInvalidArgument;
+  }
+
+  // Compute the offset between the device address and the base address
+  uint64_t baseAddr = (uint64_t)info.agentBaseAddress;
+  uint64_t realAddr = (uint64_t)devPtr;
+  handlePair->second = realAddr - baseAddr;
+
+  // IPC handles are only supported for base address pointers, so the cache
+  // is keyed by the base address rather than by devPtr itself
+  NcclIpcHandleSendCache::iterator it = cache->find(baseAddr);
+
+  if (it == cache->end())
+  {
+    // First time this allocation is seen: export a handle and remember it
+    CUDACHECK(hipIpcGetMemHandle(&handlePair->first, (void*)baseAddr));
+    cache->insert(baseAddr, handlePair->first);
+  }
+  else
+  {
+    handlePair->first = (it->second).first;
+  }
+  return ncclSuccess;
+}
+
+// Converts a received (IPC handle, offset) pair into a usable pointer.
+// The handle is opened at most once per cache entry; the offset stored in
+// handlePair.second is then added to the opened base address and the result
+// is written to *ptr.
+ncclResult_t CliqueManager::CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
+                                                NcclIpcHandleRecvCache* cache,
+                                                void** ptr)
+{
+  void* basePtr = nullptr;
+
+  NcclIpcHandleRecvCache::iterator entry = cache->find(handlePair.first);
+  if (entry != cache->end())
+  {
+    // Handle was opened before: reuse the cached base address
+    basePtr = (entry->second).first;
+  }
+  else
+  {
+    // Unknown handle: open it and remember the resulting base address
+    CUDACHECK(hipIpcOpenMemHandle(&basePtr, handlePair.first, hipIpcMemLazyEnablePeerAccess));
+    cache->insert(handlePair.first, basePtr);
+  }
+
+  // Apply the offset to the base address
+  *ptr = (void*)((uint64_t)basePtr + handlePair.second);
+  return ncclSuccess;
+}
+
+// Blocks the calling rank until m_numRanks ranks have entered the barrier.
+// Each call flips this rank's local sense; the rank whose increment brings the
+// shared count to m_numRanks resets the count and publishes the new sense,
+// while every other rank spins until the shared sense equals its local sense.
+// NOTE(review): STORE/LOAD are defined outside this chunk - presumably atomic
+// store/load helpers; confirm their memory-ordering guarantees.
+void CliqueManager::WaitForBarrier()
+{
+  // Sense inversion barrier
+  // (1 - x toggles between 0 and 1, assuming the local sense starts as 0 or 1)
+  m_cpuBarrierLocalSense = 1 - m_cpuBarrierLocalSense;
+
+  // Atomically count this rank's arrival; the last rank to arrive releases the others
+  if (__sync_add_and_fetch(m_cpuBarrierGlobalCount, 1) == m_numRanks)
+  {
+    // Reset the barrier
+    // (the count is cleared before the sense is published, so waiting ranks
+    //  are only released once the count is ready for the next barrier)
+    STORE(m_cpuBarrierGlobalCount, 0);
+    STORE(m_cpuBarrierGlobalSense, m_cpuBarrierLocalSense);
+  } else {
+    // Busy-wait until the last-arriving rank publishes the matching sense
+    while (LOAD(m_cpuBarrierGlobalSense) != m_cpuBarrierLocalSense);
+  }
+}
+
+// Prepares the per-job IPC resources named in CliqueShmNames: for every entry a
+// backing file under /tmp is created and a message queue is requested
+// exclusively for it, then any existing shared-memory file of the same name
+// under /dev/shm is unlinked.
+ncclResult_t CliqueManager::BootstrapRootInit(int pid, unsigned long hash)
+{
+  // Every resource name ends with "<hash>_<pid>"
+  std::string const suffix = std::to_string(hash) + "_" + std::to_string(pid);
+
+  // Pass 1: message queues (the file is what the queue key is derived from)
+  for (auto const& entry : CliqueShmNames)
+  {
+    std::string const queueFile = "/tmp/" + entry.second + suffix;
+    int fd;
+    int msgid;
+    SYSCHECKVAL(open(queueFile.c_str(), O_CREAT | O_RDWR, 0606), "open", fd);
+    NCCLCHECK(MsgQueueGetId(queueFile, hash, true, msgid));
+    SYSCHECK(close(fd), "close");
+  }
+
+  // Pass 2: remove shared-memory files that already exist
+  for (auto const& entry : CliqueShmNames)
+  {
+    std::string const shmFile = entry.second + suffix;
+    std::string const shmPath = "/dev/shm/" + shmFile;
+
+    struct stat st;
+    if (stat(shmPath.c_str(), &st) != 0)
+    {
+      continue;  // nothing to remove for this name
+    }
+    NCCLCHECK(shmUnlink(shmFile.c_str()));
+  }
+  return ncclSuccess;
+}
@@ -0,0 +1,128 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_CLIQUE_MANAGER_HPP_
+#define RCCL_CLIQUE_MANAGER_HPP_
+
+#include <semaphore.h>
+#include <mutex>
+#include <queue>
+
+#include "nccl.h"
+#include "devcomm.h"
+#include "CliqueCommon.h"
+#include "HandleCache.h"
+#include "HandleShm.h"
+
+#define NUM_HANDLES_PER_RANK 2
+
+// Manages the exchange of device pointers between the ranks of a clique so that
+// clique-based collective kernels can access every rank's input/output buffers.
+// Two exchange mechanisms are declared: static memory shared inside a single
+// process, and IPC handles passed through shared memory on a single node.
+class CliqueManager
+{
+public:
+  // Operating mode of the manager
+  typedef enum
+  {
+    CLIQUE_DISABLED       = 0,
+    CLIQUE_SINGLE_PROCESS = 1,
+    CLIQUE_SINGLE_NODE    = 2
+  } cliqueMode_t;
+
+  CliqueManager(int const rank, int const numRanks, cliqueMode_t const cliqueMode);
+
+  ~CliqueManager();
+
+  void CleanUp();
+
+  // NOTE(review): implementation is outside this chunk; several methods reject
+  // calls with ncclInvalidUsage while m_init is false - presumably Init() sets it.
+  ncclResult_t Init(ncclUniqueId const* commId, int suffix);
+
+  // Returns true if the collective is supported via a clique-based kernel
+  bool IsSupported(ncclFunc_t const coll,
+                   size_t const count,
+                   ncclDataType_t const datatype,
+                   ncclRedOp_t const op) const;
+
+  // Provide the pointers to be exchanged across the clique for the given rank / opCount
+  ncclResult_t DeclarePointers(uint64_t opCount, void const* inputPtr, void* outputPtr);
+
+  // Determine the number of channels / CUs to use for this call
+  ncclResult_t GetNumChannelsToUse(ncclFunc_t const coll,
+                                   size_t const count,
+                                   ncclDataType_t const datatype,
+                                   ncclRedOp_t const op,
+                                   int const totalNumChannels,
+                                   uint8_t* numChannelstoUse);
+
+  // Set pointers for where clique-related arguments will be found
+  // This sets pointers to device-accessible memory where the arguments will eventually reside
+  ncclResult_t SetCliqueCollectiveArgs(CollectiveArgs* args);
+
+  // Blocking call that only returns after all out-standing clique pointers are ready
+  ncclResult_t WaitForPointers();
+
+  // Prepares shared memory files upon initialization
+  static ncclResult_t BootstrapRootInit(int pid, unsigned long hash);
+
+protected:
+  // Looks up (or creates and caches) the IPC handle for the allocation containing devPtr,
+  // and reports devPtr's offset from the allocation's base address
+  ncclResult_t CheckCacheForPtr(void* devPtr,
+                                NcclIpcHandleSendCache* cache,
+                                int rank,
+                                std::pair<hipIpcMemHandle_t, size_t>* handlePair);
+
+  // Looks up (or opens and caches) the base pointer for a received IPC handle,
+  // and returns that base pointer advanced by the offset in handlePair
+  ncclResult_t CheckCacheForHandle(std::pair<hipIpcMemHandle_t, size_t> const& handlePair,
+                                   NcclIpcHandleRecvCache* cache,
+                                   void** ptr);
+
+  // Race-condition helper functions
+  void WaitForBarrier();
+
+  int                          m_rank;                               // Associated rank
+  int                          m_numRanks;                           // Total number of ranks
+  cliqueMode_t                 m_cliqueMode;                         // Clique mode (off/single process/single node)
+  bool                         m_init;                               // Whether CliqueManager has been initialized
+  cliqueDevicePtrs_t*          m_pinnedCliquePtrs;                   // Pinned-host-memory (device accessible) containing device pointers
+  int*                         m_gpuBarrierGlobalCount;              // Part of GPU barrier (count variable shared across ranks)
+  int*                         m_gpuBarrierGlobalSense;              // Part of GPU barrier (reset variable shared across ranks)
+  int*                         m_gpuBarrierLocalSense;               // Part of GPU barrier (reset variable local to this rank)
+  std::queue<int>              m_inProgress;                         // Queue of clique-based collectives waiting for pointers
+
+  // IPC-related (CLIQUE_SINGLE_NODE)
+  NcclIpcHandleShm             m_shmHandles;                         // Used to exchange IPC handles between ranks
+  NcclIpcHandleSendCache*      m_ipcHandleSendCache;                 // Caches pointers to IPC handles (to send to other processes)
+  NcclIpcHandleRecvCache*      m_ipcHandleRecvCache;                 // Caches IPC handles to pointers (received from other processes)
+  ShmObject<int32_t>           m_sharedCpuMemory;                    // Used to pass shared memory used for CPU barrier
+  ShmObject<hipIpcMemHandle_t> m_sharedIpcHandle;                    // Used to pass fine-grained device memory buffer IPC handle
+  int*                         m_fineGrainBarrierMem;                // Fine-grained GPU memory barrier (allocated only on 1st rank, shared on others)
+  int*                         m_cpuBarrierGlobalCount;              // Part of CPU barrier (count variable shared across ranks)
+  int*                         m_cpuBarrierGlobalSense;              // Part of CPU barrier (reset variable shared across ranks)
+  int                          m_cpuBarrierLocalSense;               // Part of CPU barrier (reset variable local to this rank)
+
+  // Single-process (CLIQUE_SINGLE_PROCESS)
+  static cliqueDevicePtrs_t    m_staticCliquePtrs[NCCL_MAX_OPS];     // Use shared static memory to exchange pointer info
+  static int*                  m_staticGpuBarrierMem;                // Static storage backing for fine-grained gpu barrier
+};
+
+// For use in bootstrapping code
+// Plain aggregate bundling two values for the bootstrap code.
+struct bootstrapRootStruct {
+    void* listenComm;      // opaque pointer; NOTE(review): presumably the bootstrap listen communicator - confirm
+    unsigned long hash;    // NOTE(review): presumably the hash later given to CliqueManager::BootstrapRootInit - confirm
+};
+
+#endif
@@ -0,0 +1,37 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_CLIQUE_SHM_NAMES_H_
+#define NCCL_CLIQUE_SHM_NAMES_H_
+
+#include <string>
+#include <map>
+
+// Maps a logical resource name (key) to the file-name prefix (value) used for
+// that resource. Because the map is declared static in a header, every
+// translation unit that includes this header gets its own copy.
+static std::map<std::string, std::string> CliqueShmNames =
+{
+    {"SharedCounters", "RcclCounters"  },
+    {"Mutexes"       , "RcclMutexes"   },
+    {"IpcHandles"    , "RcclIpcHandles"},
+    {"Barriers"      , "RcclBarriers"  }
+};
+
+#endif
@@ -0,0 +1,31 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "HandleCache.h"
+
+#include "Hash.h"
+
+// djb2 hash over the raw bytes of a hipIpcMemHandle_t.
+// The handle payload is binary data, not a C string: it may contain zero bytes
+// anywhere and is not guaranteed to end with one. The hash therefore walks
+// exactly sizeof(handle.reserved) bytes instead of stopping at the first NUL,
+// which would either ignore part of the handle or read past the array.
+unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle)
+{
+    unsigned long hash = 5381;
+    for (size_t i = 0; i < sizeof(handle.reserved); i++)
+    {
+        hash = ((hash << 5) + hash) + (unsigned char)handle.reserved[i]; /* hash * 33 + byte */
+    }
+    return hash;
+}
@@ -0,0 +1,142 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_HANDLE_CACHE_H_
+#define NCCL_HANDLE_CACHE_H_
+
+#include <list>
+#include <unordered_map>
+#include <functional>
+
+#include "core.h"
+
+//#include "llvm/ADT/DenseMap.h"
+
+// Fixed-capacity key/value cache with least-recently-used eviction.
+//
+// Values are stored in an unordered_map together with an iterator into a list
+// that records usage order (front = least recently used, back = most recently
+// used). find() refreshes an entry's position; insert() evicts the front entry
+// when the cache is full.
+//
+// NOTE(review): evicted values are simply dropped; any resource they refer to
+// (for example an opened IPC handle) is not released by this class.
+template <
+    class Key,
+    class Value,
+    class Hash,
+    class KeyEqual,
+    class Allocator
+>
+class NcclIpcHandleCache
+{
+public:
+    typedef std::pair<Value, typename std::list<Key>::iterator> NcclIpcHandleCacheValueType;
+    typedef std::unordered_map<Key, NcclIpcHandleCacheValueType, Hash, KeyEqual, Allocator> LRUCache;
+    using iterator = typename LRUCache::iterator;
+
+    // size         - maximum number of entries kept before eviction starts
+    // bucket_count - initial bucket count handed to the underlying unordered_map
+    // hash/eql/alloc - forwarded to the underlying unordered_map
+    NcclIpcHandleCache(size_t size,
+                       size_t bucket_count = 100,
+                       const Hash& hash = Hash(),
+                       const KeyEqual& eql = KeyEqual(),
+                       const Allocator& alloc = Allocator() ) : m_capacity(size), m_cache(bucket_count, hash, eql, alloc)
+    {
+    }
+
+    ~NcclIpcHandleCache()
+    {
+        m_lruHistory.clear();
+        m_cache.clear();
+    }
+
+    iterator begin()
+    {
+        return m_cache.begin();
+    }
+
+    iterator end()
+    {
+        return m_cache.end();
+    }
+
+    // Returns an iterator to the entry for key (or end()), marking a found
+    // entry as most recently used.
+    iterator find(const Key& key)
+    {
+        iterator it = m_cache.find(key);
+        if (it != m_cache.end())
+        {
+            updateHistory(it);
+        }
+
+        return it;
+    }
+
+    // Inserts (key, value) unless key is already cached.
+    // Returns {iterator to the entry for key, true} when a new entry was added
+    // and {iterator to the existing entry, false} otherwise. An existing entry
+    // is left untouched: its value and its usage position do not change, and
+    // nothing is evicted. When a new entry would exceed the capacity, the
+    // least recently used entry is evicted first.
+    std::pair<iterator, bool> insert(const Key& key, const Value& value)
+    {
+        iterator existing = m_cache.find(key);
+        if (existing != m_cache.end())
+        {
+            return std::pair<iterator, bool>(existing, false);
+        }
+
+        if (m_cache.size() >= m_capacity)
+        {
+            // remove entry
+            pop();
+        }
+
+        typename std::list<Key>::iterator historyIt = m_lruHistory.insert(m_lruHistory.end(), key);
+        return m_cache.insert(std::make_pair(key, std::make_pair(value, historyIt)));
+    }
+
+private:
+    // Evicts the least recently used entry; does nothing when the cache is empty
+    // (which can happen on the first insert into a cache of capacity 0).
+    void pop()
+    {
+        if (m_lruHistory.empty())
+        {
+            return;
+        }
+        m_cache.erase(m_lruHistory.front());
+        m_lruHistory.pop_front();
+    }
+
+    // Moves the entry's key to the back of the usage list (most recently used)
+    void updateHistory(const iterator& it)
+    {
+        if (m_lruHistory.size() > 0)
+        {
+            m_lruHistory.splice(m_lruHistory.end(), m_lruHistory, (it->second).second);
+        }
+    }
+    size_t m_capacity;            // maximum number of cached entries
+    std::list<Key> m_lruHistory;  // keys ordered from least to most recently used
+    LRUCache m_cache;             // key -> (value, position in m_lruHistory)
+};
+
+// djb2 hash function for hashing char array in hipIpcMemHandle_t
+unsigned long hipIpcMemHandleHash(const hipIpcMemHandle_t& handle);
+
+// Equality predicate for hipIpcMemHandle_t keys (required for unordered_map)
+auto hipIpcMemHandleEqual = [](const hipIpcMemHandle_t& lhs, const hipIpcMemHandle_t& rhs)
+{
+    // Two handles match when every byte of their reserved payload matches
+    bool const sameBytes = (memcmp(lhs.reserved, rhs.reserved, sizeof(lhs.reserved)) == 0);
+    return sameBytes;
+};
+
+//typedef llvm::DenseMap<uint64_t, hipIpcMemHandle_t> SendCache;
+//typedef llvm::DenseMap<hipIpcMemHandle_t, void*, decltype(&HandleHash), decltype(HandleEqual)> RecvCache;
+
+// Send-side cache: keyed by a 64-bit address, stores the IPC handle for it
+using NcclIpcHandleSendCache = NcclIpcHandleCache<
+    uint64_t,
+    hipIpcMemHandle_t,
+    std::hash<uint64_t>,
+    std::equal_to<uint64_t>,
+    std::allocator<std::pair<const uint64_t,
+                             std::pair<hipIpcMemHandle_t, std::list<uint64_t>::iterator>>>>;
+
+// Receive-side cache: keyed by an IPC handle, stores the pointer obtained for it
+using NcclIpcHandleRecvCache = NcclIpcHandleCache<
+    hipIpcMemHandle_t,
+    void*,
+    decltype(&hipIpcMemHandleHash),
+    decltype(hipIpcMemHandleEqual),
+    std::allocator<std::pair<const hipIpcMemHandle_t,
+                             std::pair<void*, std::list<hipIpcMemHandle_t>::iterator>>>>;
+
+#endif
@@ -0,0 +1,67 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <hip/hip_runtime.h>
+
+#include "HandleShm.h"
+#include "CliqueShmNames.h"
+#include "core.h"
+#include "Hash.h"
+#include "shm.h"
+
+// Builds the shared-memory object used to exchange (IPC handle, offset) pairs.
+// The first base-class argument is sized for numRanks * numHandlesPerRank pairs
+// for each of `capacity` operation slots; the object name is the "IpcHandles"
+// prefix from CliqueShmNames followed by `suffix`.
+// NOTE(review): the first ShmObject argument looks like a size in bytes - confirm against ShmObject.h.
+NcclIpcHandleShm::NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix) :
+  ShmObject<std::pair<hipIpcMemHandle_t,size_t>>(numRanks * numHandlesPerRank * capacity * sizeof(std::pair<hipIpcMemHandle_t,size_t>),
+                                                 CliqueShmNames["IpcHandles"] + suffix,
+                                                 rank,
+                                                 numRanks,
+                                                 projid),
+  m_numHandlesPerRank(numHandlesPerRank),
+  m_numHandlesPerOpCount(numRanks * numHandlesPerRank)   // one operation slot holds the handles of every rank
+{
+}
+
+// Default constructor: creates an object with zero handles per rank and per
+// operation, so the counters never hold indeterminate values.
+NcclIpcHandleShm::NcclIpcHandleShm() :
+  m_numHandlesPerRank(0),
+  m_numHandlesPerOpCount(0)
+{
+}
+
+// Destructor: performs no work of its own.
+NcclIpcHandleShm::~NcclIpcHandleShm()
+{
+}
+
+// Opens the shared-memory object by delegating to ShmObject::Open() and
+// returning its result unchanged.
+ncclResult_t NcclIpcHandleShm::Open()
+{
+  return ShmObject::Open();
+}
+
+// Copies this rank's handles for one operation slot into shared memory.
+//
+// opCount     - index of the operation slot to write into
+// sendHandles - handles to publish; the first m_numHandlesPerRank entries are copied
+//
+// Returns ncclInvalidArgument when sendHandles holds fewer than
+// m_numHandlesPerRank entries (copying would read past the end of the vector).
+// NOTE(review): opCount is not range-checked here; the caller must keep it
+// within the capacity the object was constructed with.
+ncclResult_t NcclIpcHandleShm::WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles)
+{
+  if (sendHandles.size() < (size_t)m_numHandlesPerRank)
+  {
+    WARN("WriteHandles expects %d handles but only %zu were provided", m_numHandlesPerRank, sendHandles.size());
+    return ncclInvalidArgument;
+  }
+
+  // Slot layout: [opCount][rank][handle]
+  size_t idx = (opCount * m_numHandlesPerOpCount) + (m_rank *  m_numHandlesPerRank);
+  memcpy(m_shmPtr + idx, sendHandles.data(), sizeof(std::pair<hipIpcMemHandle_t,size_t>) * m_numHandlesPerRank);
+  return ncclSuccess;
+}
+
+// Copies the handles of every rank for one operation slot out of shared memory.
+//
+// opCount     - index of the operation slot to read
+// recvHandles - destination; receives m_numHandlesPerOpCount entries. A vector
+//               that is too small is grown first so the copy cannot write past
+//               its end; a larger vector keeps its size.
+//
+// NOTE(review): opCount is not range-checked here; the caller must keep it
+// within the capacity the object was constructed with.
+ncclResult_t NcclIpcHandleShm::ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles)
+{
+  size_t const numHandles = (size_t)m_numHandlesPerOpCount;
+  if (recvHandles.size() < numHandles)
+  {
+    recvHandles.resize(numHandles);
+  }
+
+  size_t idx = opCount * m_numHandlesPerOpCount;
+  memcpy(recvHandles.data(), m_shmPtr + idx, numHandles * sizeof(std::pair<hipIpcMemHandle_t,size_t>));
+  return ncclSuccess;
+}
@@ -0,0 +1,53 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_IPC_HANDLE_SHM_H_
+#define NCCL_IPC_HANDLE_SHM_H_
+
+#include <hip/hip_runtime.h>
+#include <vector>
+#include <string>
+
+#include "nccl.h"
+#include "ShmObject.h"
+
+// Shared-memory object whose elements are (IPC handle, offset) pairs, used to
+// pass handles between ranks. Handles are addressed by an operation index and,
+// when writing, by the rank of the writer.
+class NcclIpcHandleShm : public ShmObject<std::pair<hipIpcMemHandle_t,size_t>>
+{
+public:
+    // capacity is the number of operation slots the object is sized for;
+    // suffix is appended to the shared-memory name
+    NcclIpcHandleShm(int rank, int numRanks, int projid, int numHandlesPerRank, int capacity, std::string suffix);
+
+    NcclIpcHandleShm();
+
+    ~NcclIpcHandleShm();
+
+    // Forwards to ShmObject::Open()
+    ncclResult_t Open();
+
+    // Publishes this rank's handles for operation slot opCount
+    ncclResult_t WriteHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>> const& sendHandles);
+
+    // Retrieves the handles of all ranks for operation slot opCount
+    ncclResult_t ReadHandles(uint64_t opCount, std::vector<std::pair<hipIpcMemHandle_t,size_t>>& recvHandles);
+
+private:
+    int m_numHandlesPerRank;      // handles each rank contributes per operation
+    int m_numHandlesPerOpCount;   // handles stored per operation (numRanks * numHandlesPerRank)
+};
+
+#endif
@@ -0,0 +1,34 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "Hash.h"
+
+// Computes the djb2 hash of a NUL-terminated string: starting from 5381, each
+// character folds in as digest = digest * 33 + character. The terminating NUL
+// is not hashed. data must not be null.
+unsigned long djb2Hash(const char* data)
+{
+    unsigned long digest = 5381;
+
+    for (const char* cursor = data; *cursor != '\0'; ++cursor)
+    {
+        int const ch = *cursor;
+        digest = digest * 33 + ch;
+    }
+
+    return digest;
+}
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_HASH_H_
+#define NCCL_HASH_H_
+
+// djb2 string hash; data is read up to (not including) its first zero byte.
+unsigned long djb2Hash(const char* data);
+
+#endif
@@ -0,0 +1,72 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "MsgQueue.h"
+
+#include <sys/ipc.h>
+#include <sys/msg.h>
+
+#define MSG_QUEUE_PERM 0666
+
+// Obtains the id of the System V message queue keyed by (name, projid).
+//
+// name      - path of an existing file, used with projid to derive the key
+// exclusive - when true the queue must be newly created; a queue that already
+//             exists is removed and created again
+// msgid     - output: id of the queue
+//
+// Returns ncclSystemError when the queue cannot be obtained.
+ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid)
+{
+  key_t key;
+  SYSCHECKVAL(ftok(name.c_str(), projid), "ftok", key);
+
+  int const createFlags = exclusive ? (IPC_CREAT | IPC_EXCL) : IPC_CREAT;
+
+  msgid = msgget(key, MSG_QUEUE_PERM | createFlags);
+  if (msgid != -1)
+  {
+    return ncclSuccess;
+  }
+
+  // Exclusive creation hit an existing queue: delete it and create a fresh one
+  if (exclusive && errno == EEXIST)
+  {
+    NCCLCHECK(MsgQueueClose(name, projid));
+    SYSCHECKVAL(msgget(key, MSG_QUEUE_PERM | createFlags), "msgget", msgid);
+    return ncclSuccess;
+  }
+
+  WARN("Call to MsgQueueGetId failed : %s", strerror(errno));
+  return ncclSystemError;
+}
+
+// Sends a message on queue msgid by forwarding all arguments to msgsnd().
+// A failing msgsnd() is handled by the SYSCHECK macro (defined outside this file).
+ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg)
+{
+  SYSCHECK(msgsnd(msgid, msgp, msgsz, msgflg), "msgsnd");
+  return ncclSuccess;
+}
+
+// Receives a message of type msgtyp from queue msgid via msgrcv().
+// When wait is false the call is made with IPC_NOWAIT; otherwise no flag is set.
+ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait)
+{
+  int const recvFlags = wait ? 0 : IPC_NOWAIT;
+  SYSCHECK(msgrcv(msgid, msgp, msgsz, msgtyp, recvFlags), "msgrcv");
+  return ncclSuccess;
+}
+
+// Removes the System V message queue keyed by (name, projid).
+//
+// name   - path of an existing file, used with projid to derive the key
+// projid - project id passed to ftok()
+//
+// The key derivation is checked the same way as in MsgQueueGetId: ftok()
+// returns -1 on failure, and continuing with that value would look up (and
+// then remove) a queue unrelated to `name`.
+ncclResult_t MsgQueueClose(std::string name, int projid)
+{
+  key_t key;
+  int msgid;
+  SYSCHECKVAL(ftok(name.c_str(), projid), "ftok", key);
+  SYSCHECKVAL(msgget(key, IPC_CREAT), "msgget", msgid);
+  SYSCHECK(msgctl(msgid, IPC_RMID, NULL), "msgctl");
+  return ncclSuccess;
+}
@@ -0,0 +1,42 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef RCCL_MSG_QUEUE_HPP_
+#define RCCL_MSG_QUEUE_HPP_
+
+#include <string>
+
+#include "nccl.h"
+#include "core.h"
+
+// Message layout for the queue helpers: a long type field followed by a
+// one-byte text payload.
+struct MsgBuffer
+{
+  long msg_type;     // message type
+  char msg_text[1];  // message payload (a single byte)
+};
+
+// Obtains (creating if needed) the id of the queue keyed by (name, projid); result is written to msgid
+ncclResult_t MsgQueueGetId(std::string name, int projid, bool exclusive, int& msgid);
+// Sends msgsz bytes from msgp on queue msgid using the given msgsnd flags
+ncclResult_t MsgQueueSend(int msgid, const void* msgp, size_t msgsz, int msgflg);
+// Receives a message of type msgtyp into msgp; wait == false makes the call non-blocking
+ncclResult_t MsgQueueRecv(int msgid, void* msgp, size_t msgsz, long msgtyp, bool wait);
+// Removes the queue keyed by (name, projid)
+ncclResult_t MsgQueueClose(std::string name, int projid);
+
+#endif
@@ -0,0 +1,43 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef SHAREDMEMHELPER_H
+#define SHAREDMEMHELPER_H
+
+
+// Helper describing a shared-memory region used by a group of ranks.
+// NOTE(review): this header includes nothing itself, so std::string and
+// ncclResult_t must be made visible by the including file (<string>, "nccl.h").
+// NOTE(review): numEntries is accepted by the constructor but no member stores it.
+class SharedMemHelper
+{
+public:
+  SharedMemHelper(int rank, int numRanks, int numEntries);
+
+  // Initializes the helper from the given base file name
+  ncclResult_t Init(std::string const& baseFilename);
+
+protected:
+  bool m_initialized;   // whether Init() has completed - TODO confirm once Init() is implemented
+  int m_rank;           // rank of this participant
+  int m_numRanks;       // total number of participants
+};
+
+#endif
@@ -0,0 +1,45 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "ShmObject.h"
+#include <string>
+
+// Template specialization for sem_t objects, which require additional clean-up:
+// every semaphore stored in the mapping is destroyed before the shared-memory name is unlinked.
+// Returns ncclSuccess when nothing was opened or the object was released,
+// ncclSystemError when shm_unlink fails for a reason other than ENOENT.
+template<>
+ncclResult_t ShmObject<sem_t>::Close()
+{
+    // Same contract as the generic ShmObject<T>::Close(): nothing to release unless Open() succeeded
+    if (!m_alloc)
+    {
+        return ncclSuccess;
+    }
+
+    // Only touch the semaphores if a mapping actually exists (avoids dereferencing a null pointer)
+    if (m_shmPtr != nullptr)
+    {
+        size_t numMutexes = m_shmSize / sizeof(sem_t);
+
+        for (size_t i = 0; i < numMutexes; i++)
+        {
+            sem_destroy(&m_shmPtr[i]);
+        }
+    }
+
+    // A missing object (ENOENT) is tolerated: it may already have been unlinked
+    int retVal = shm_unlink(m_shmName.c_str());
+    if (retVal == -1 && errno != ENOENT)
+    {
+        WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
+        return ncclSystemError;
+    }
+
+    return ncclSuccess;
+}
@@ -0,0 +1,210 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef NCCL_SHM_OBJECT_H_
+#define NCCL_SHM_OBJECT_H_
+
+#include <string>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <type_traits>
+#include <semaphore.h>
+
+#include "MsgQueue.h"
+#include "nccl.h"
+#include "core.h"
+#include "shm.h"
+
+// ShmObject abstracts away the nitty-gritty when multiple processes need to handle opening a shared
+// memory object at the same time.
+
+// Allocates (when 'create' is non-zero) and maps an already opened shared-memory descriptor.
+// Split out of shmSetupExclusive so that the caller can release the descriptor and the object
+// when one of the SYSCHECKs returns early with an error.
+static ncclResult_t shmSetupExclusiveMap(int fd, const int shmsize, void** ptr, int create) {
+  if (create) SYSCHECK(shm_allocate(fd, shmsize), "posix_fallocate");
+  SYSCHECK(shm_map(fd, shmsize, ptr), "mmap");
+  return ncclSuccess;
+}
+
+// Exclusively creates the shared-memory object 'shmname' (O_CREAT | O_EXCL) and maps it.
+//   shmname : name handed to shm_open
+//   shmsize : size in bytes to allocate / map
+//   fd      : receives the descriptor while it is open; always -1 once the object has been
+//             opened (the descriptor is closed before returning)
+//   ptr     : receives the mapping on success; reset to nullptr if allocation/mapping fails
+//   create  : when non-zero, the object is sized with shm_allocate and zero-filled
+// Returns ncclSystemError if shm_open fails (errno is left as set by shm_open, e.g. EEXIST when
+// the object already exists) or if allocation/mapping fails. In the latter case the descriptor
+// is closed and the freshly created object is unlinked, so a retry does not trip over EEXIST.
+static ncclResult_t shmSetupExclusive(const char* shmname, const int shmsize, int* fd, void** ptr, int create) {
+  *fd = shm_open(shmname, O_CREAT | O_RDWR | O_EXCL, S_IRUSR | S_IWUSR);
+  if (*fd == -1) return ncclSystemError;
+
+  ncclResult_t result = shmSetupExclusiveMap(*fd, shmsize, ptr, create);
+  int const savedErrno = errno;  // close() / shm_unlink() below may overwrite errno
+
+  // The mapping (if any) stays valid after the descriptor is closed
+  close(*fd);
+  *fd = -1;
+
+  if (result != ncclSuccess) {
+    // This call created the object (O_EXCL), so it is safe to remove it again
+    shm_unlink(shmname);
+    *ptr = nullptr;
+    errno = savedErrno;
+    return result;
+  }
+
+  if (create) memset(*ptr, 0, shmsize);
+  return ncclSuccess;
+}
+
+// Handle to a named shared-memory object holding elements of type T that several ranks
+// open at the same time. Rank 0 creates the object and notifies the other ranks through a
+// message queue (see Open()).
+template <typename T>
+class ShmObject
+{
+public:
+  // size     : size in bytes of the shared-memory object
+  // fileName : name passed to shm_open / shm_unlink; "/tmp/" + fileName names the message queue
+  // rank     : rank of the caller (rank 0 creates the object)
+  // numRanks : number of ranks sharing the object
+  // projid   : project id forwarded to MsgQueueGetId
+  ShmObject(size_t size, std::string fileName, int rank, int numRanks, int projid) :
+    m_shmSize(size),
+    m_shmName(fileName),
+    m_rank(rank),
+    m_numRanks(numRanks),
+    m_projid(projid),
+    m_alloc(false),
+    m_shmPtr(nullptr) {}
+
+  // Default-constructed objects are empty and unopened. All members are given defined values
+  // so that Close() / Open() never read an indeterminate m_alloc or m_shmPtr.
+  ShmObject() :
+    m_shmSize(0),
+    m_shmName(),
+    m_rank(0),
+    m_numRanks(0),
+    m_projid(0),
+    m_alloc(false),
+    m_shmPtr(nullptr) {}
+
+  ~ShmObject() {}
+
+  // Creates (rank 0) or attaches to (other ranks) the shared-memory object.
+  ncclResult_t Open();
+
+  // Unlinks the shared-memory name if Open() succeeded; rank 0 additionally removes the
+  // "/tmp/<name>" file. A name that no longer exists (ENOENT) is not treated as an error.
+  // Returns ncclSystemError if shm_unlink fails for any other reason.
+  ncclResult_t Close()
+  {
+    if (m_alloc)
+    {
+      if (m_rank == 0)
+      {
+        std::string tmpFileName = "/tmp/" + m_shmName;
+        remove(tmpFileName.c_str());
+      }
+      int retVal = shm_unlink(m_shmName.c_str());
+      if (retVal == -1 && errno != ENOENT)
+      {
+        WARN("Call to shm_unlink in ShmObject failed : %s", strerror(errno));
+        return ncclSystemError;
+      }
+    }
+    return ncclSuccess;
+  }
+
+  // Returns a reference to the pointer to the mapped memory (nullptr until mapped).
+  T*& Get()
+  {
+    return m_shmPtr;
+  }
+protected:
+  // Sends a one-byte status ('P' when pass is true, 'F' otherwise) to every rank other than
+  // this one, using the destination rank as the message type.
+  // NOTE(review): if MsgQueueSend forwards to System V msgsnd, the message type must be > 0
+  // (fine while only rank 0 broadcasts) and the size argument is expected to be the payload
+  // size rather than sizeof(MsgBuffer) - confirm against the MsgQueue implementation.
+  ncclResult_t BroadcastMessage(int msgid, bool pass)
+  {
+    MsgBuffer msg;
+    msg.msg_text[0] = (pass ? 'P' : 'F');
+    for (int rank = 0; rank < m_numRanks; rank++)
+    {
+      if (rank == m_rank) continue;
+      msg.msg_type = rank;
+      NCCLCHECK(MsgQueueSend(msgid, &msg, sizeof(msg), 0));
+    }
+    return ncclSuccess;
+  }
+
+  // tag for dispatch
+  template<class U>
+  struct OpenTag{};
+
+  // Overloads selected by Open() through OpenTag<T>; only the sem_t overload does any work.
+  ncclResult_t InitIfSemaphore(OpenTag<int> tag);
+  ncclResult_t InitIfSemaphore(OpenTag<uint32_t> tag);
+  ncclResult_t InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag);
+  ncclResult_t InitIfSemaphore(OpenTag<sem_t> tag);
+  ncclResult_t InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag);
+
+  size_t      m_shmSize;   // Size of the shared-memory object in bytes
+  std::string m_shmName;   // Name of the shared-memory object
+  int         m_rank;      // Rank of this process
+  int         m_numRanks;  // Number of ranks sharing the object
+  int         m_projid;    // Project id used for the message queue
+  bool        m_alloc;     // True once Open() has succeeded
+  T*          m_shmPtr;    // Mapped memory (nullptr until mapped)
+};
+
+// ShmObject<sem_t>::Close() is explicitly specialized out-of-line in a separate translation
+// unit. The specialization has to be declared before any use that would implicitly
+// instantiate the generic Close(), otherwise the program is ill-formed (no diagnostic required).
+template <>
+ncclResult_t ShmObject<sem_t>::Close();
+
+// Opens the shared-memory object on this rank.
+//  - Rank 0 exclusively creates and maps the object, runs the type-specific initialization
+//    (InitIfSemaphore) and then broadcasts 'P' (pass) or 'F' (fail) to all other ranks.
+//  - Every other rank blocks until it receives that message and maps the object on 'P'.
+// Returns ncclInvalidUsage if the object was already opened, ncclSystemError on failure and
+// ncclSuccess otherwise.
+template <typename T>
+ncclResult_t ShmObject<T>::Open()
+{
+  if (m_alloc)
+  {
+    WARN("Cannot allocate ShmObject twice.\n");
+    return ncclInvalidUsage;
+  }
+
+  int shmFd;
+  int msgid;
+  std::string tmpFileName = "/tmp/" + m_shmName;
+  NCCLCHECK(MsgQueueGetId(tmpFileName, m_projid, false, msgid));
+
+  if (m_rank == 0)
+  {
+    ncclResult_t resultSetup = shmSetupExclusive(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 1);
+    // Snapshot errno right away: the calls below may overwrite it before it is inspected
+    int const setupErrno = errno;
+
+    // Only initialize the contents when the object was actually created and mapped;
+    // otherwise m_shmPtr does not point at a valid mapping.
+    ncclResult_t resultSemInit = ncclSuccess;
+    if (resultSetup == ncclSuccess)
+    {
+      resultSemInit = InitIfSemaphore(OpenTag<T>{});
+    }
+
+    // An already existing object (EEXIST) is tolerated, as before.
+    // NOTE(review): in that case nothing is mapped on this rank, so Get() keeps returning the
+    // previous (null) pointer - confirm whether rank 0 should attach to the existing object.
+    if ((resultSetup != ncclSuccess && setupErrno != EEXIST) || (resultSemInit != ncclSuccess))
+    {
+      int const failErrno = (resultSetup != ncclSuccess) ? setupErrno : errno;
+      NCCLCHECK(BroadcastMessage(msgid, false));
+      WARN("Call to ShmObject::Open in root rank failed : %s", strerror(failErrno));
+      return ncclSystemError;
+    }
+    NCCLCHECK(BroadcastMessage(msgid, true));
+  }
+  else
+  {
+    // Wait for the root rank to report whether the object is ready
+    MsgBuffer msg;
+    NCCLCHECK(MsgQueueRecv(msgid, &msg, sizeof(msg), m_rank, true));
+    if (msg.msg_text[0] == 'P')
+    {
+      NCCLCHECK(shmSetup(m_shmName.c_str(), m_shmSize, &shmFd, (void**)&m_shmPtr, 0));
+    }
+    else
+    {
+      WARN("Call to shm_open from non-root rank in ShmObject failed : %s", strerror(errno));
+      return ncclSystemError;
+    }
+  }
+
+  m_alloc = true;
+  return ncclSuccess;
+}
+
+// No-op InitIfSemaphore overloads: element types other than sem_t need no initialization
+// beyond the mapping performed in Open(). The tag parameter only selects the overload.
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<int> tag)
+{
+  return ncclSuccess;
+}
+
+// Spelled uint32_t to match the in-class declaration exactly.
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<uint32_t> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<hipIpcMemHandle_t> tag)
+{
+  return ncclSuccess;
+}
+
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<std::pair<hipIpcMemHandle_t,size_t>> tag)
+{
+  return ncclSuccess;
+}
+
+// Initializes every sem_t stored in the mapping as a process-shared semaphore with an
+// initial value of 1 (sem_init(..., pshared = 1, value = 1)).
+// Returns ncclSystemError if nothing is mapped or if sem_init fails.
+template<typename T>
+ncclResult_t ShmObject<T>::InitIfSemaphore(OpenTag<sem_t> tag)
+{
+  // Guard against being called when the shared memory could not be mapped
+  if (m_shmPtr == nullptr)
+  {
+    WARN("Cannot initialize semaphores in ShmObject: shared memory is not mapped");
+    return ncclSystemError;
+  }
+
+  size_t numMutexes = m_shmSize / sizeof(sem_t);
+
+  for (size_t i = 0; i < numMutexes; i++)
+  {
+    SYSCHECK(sem_init(static_cast<sem_t*>(&m_shmPtr[i]), 1, 1), "sem_init");
+  }
+  return ncclSuccess;
+}
+#endif
@@ -8,6 +8,7 @@
 #include "devcomm.h"
 #include "primitives.h"
 #include "collectives.h"
+#include "clique/AllReduceCliqueKernel.h" // [RCCL] AllReduce Clique-based kernel support

 template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
@@ -310,6 +311,7 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  const ssize_t loopSize = nChannels*chunkSize;
  const ssize_t size = args->coll.count;

+
  if (loopSize > size) {
    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
  }
@@ -417,76 +419,10 @@ __device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllReduceRingLL128Kernel(struct CollectiveArgs* args) {
-  const int tid = threadIdx.x;
-  const int nthreads = args->coll.nThreads;
-  const int bid = args->coll.bid;
-  const int nChannels = args->coll.nChannels;
-  struct ncclDevComm* comm = args->comm;
-  struct ncclChannel* channel = comm->channels+blockIdx.x;
-  struct ncclRing* ring = &channel->ring;
-  const int stepSize = comm->buffSizes[NCCL_PROTO_LL128] / (sizeof(uint64_t)*NCCL_STEPS);
-  ssize_t chunkSize = stepSize*NCCL_LL128_DATAELEMS*sizeof(uint64_t) / (NCCL_LL128_LINEELEMS*sizeof(T));
-  // We should not need the final /2 but it makes performance much, much smoother. Might be a bug somewhere.
-  const ssize_t minChunkSize = (NCCL_LL128_SHMEM_ELEMS_PER_THREAD*nthreads*NCCL_LL128_DATAELEMS*sizeof(uint64_t))/(NCCL_LL128_LINEELEMS*sizeof(T))/2;
-  const int nranks = comm->nRanks;
-  const ssize_t loopSize = nChannels*nranks*chunkSize;
-  const ssize_t size = args->coll.count;

-  ncclLL128Primitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &ring->prev, &ring->next, stepSize, channel, comm);
-
-  // Compute pointers
-  const T * __restrict__ thisInput = (const T*)args->sendbuff;
-  T * __restrict__ thisOutput = (T*)args->recvbuff;
-
-  for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
-    chunkSize = min(DIVUP(size-gridOffset, nChannels*nranks*minChunkSize)*minChunkSize, chunkSize);
-
-    /////////////// begin AllReduce steps ///////////////
-    ssize_t offset;
-    int nelem;
-    int chunk;
-
-    // step 0: push data to next GPU
-    chunk = ring->devUserRanks[nranks-1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    LLprims.send(thisInput+offset, nelem);
-
-    // k-2 steps: reduce and copy to next GPU
-    for (int j=2; j<nranks; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvReduceSend(thisInput+offset, nelem);
-    }
-
-    // step k-1: reduce this buffer and data, which will produce the final
-    // result that we store in this data and push to the next GPU
-    chunk = ring->devUserRanks[0];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    LLprims.recvReduceCopySend(thisInput+offset, thisOutput+offset, nelem);
-
-    // k-2 steps: copy to next GPU
-    for (int j=1; j<nranks-1; ++j) {
-      chunk = ring->devUserRanks[nranks-j];
-      offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-      nelem = min(chunkSize, size-offset);
-
-      LLprims.recvCopySend(thisOutput+offset, nelem);
-    }
-
-    // Make final copy from buffer to dest.
-    chunk = ring->devUserRanks[1];
-    offset = gridOffset + (chunk*nChannels+bid) * chunkSize;
-    nelem = min(chunkSize, size-offset);
-
-    // Here we need to copy from buffer to this output.
-    LLprims.recv(thisOutput+offset, nelem);
-  }
+  // [RCCL] RingLL128 is re-purposed as clique-based kernel
+  LAUNCH_CLIQUE_KERNEL(AllReduceCliqueSplitKernel, FUNC, T, args);
+  // [/RCCL]
 }

 template<int UNUSED, class FUNC, typename T>
@@ -507,6 +443,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
  int nthreadsSplit = NCCL_LL128_SPLIT(nthreads);
  const ssize_t size = args->coll.count;

+
  if (loopSize > size) {
    chunkSize = DIVUP(size, nChannels*minChunkSize)*minChunkSize;
  }
@@ -89,13 +89,45 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
  NCCL_FUNCS3B(coll, copy), \
  NCCL_FUNCS3B(coll, copy)

+// [RCCL] Adding clique-based kernels for AllReduce, in place of the unused RingLL128 kernels
+// Per-protocol entries for one (coll, op, dtype): the LL, LL128 and base (no suffix) kernel names.
+#define NCCL_FUNC5B(coll, op, dtype) \
+  NCCL_COLL_NAME(coll##LL, op, dtype), \
+  NCCL_COLL_NAME(coll##LL128, op, dtype), \
+  NCCL_COLL_NAME(coll, op, dtype)
+
+// Per-algorithm entries: Tree and CollNet keep NCCL_FUNC5, only Ring uses NCCL_FUNC5B.
+#define NCCL_FUNC4B(coll, op, dtype) \
+  NCCL_FUNC5(coll##Tree, op, dtype), \
+  NCCL_FUNC5B(coll##Ring, op, dtype), \
+  NCCL_FUNC5(coll##CollNet, op, dtype)
+
+// Expands NCCL_FUNC4B for each of the ten data types, in this fixed order.
+#define NCCL_FUNCS3C(coll, op)                  \
+  NCCL_FUNC4B(coll, op,  i8), \
+  NCCL_FUNC4B(coll, op,  u8), \
+  NCCL_FUNC4B(coll, op, i32), \
+  NCCL_FUNC4B(coll, op, u32), \
+  NCCL_FUNC4B(coll, op, i64), \
+  NCCL_FUNC4B(coll, op, u64), \
+  NCCL_FUNC4B(coll, op, f16), \
+  NCCL_FUNC4B(coll, op, f32), \
+  NCCL_FUNC4B(coll, op, f64), \
+  NCCL_FUNC4B(coll, op, b16)
+
+// Expands NCCL_FUNCS3C for each reduction op (sum, prod, max, min), in this fixed order.
+#define NCCL_FUNCS2C(coll) \
+  NCCL_FUNCS3C(coll, sum ), \
+  NCCL_FUNCS3C(coll, prod), \
+  NCCL_FUNCS3C(coll, max ), \
+  NCCL_FUNCS3C(coll, min )
+
+// [/RCCL]
+
+
 // Must be consistent with ncclFunc_t
 #define NCCL_FUNCS() { \
  NCCL_FUNCS2B(ncclBroadcast), \
  NCCL_FUNCS2A(ncclReduce), \
  NCCL_FUNCS2B(ncclAllGather), \
  NCCL_FUNCS2A(ncclReduceScatter), \
-  NCCL_FUNCS2A(ncclAllReduce), \
+  NCCL_FUNCS2C(ncclAllReduce), \
  NCCL_COLL_NAME(ncclGather, copy, i8), \
  NCCL_COLL_NAME(ncclScatter, copy, i8), \
  NCCL_COLL_NAME(ncclAllToAll, copy, i8), \
@@ -114,7 +146,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{
  NCCL_FUNCS2A(ncclReduce),
  NCCL_FUNCS2B(ncclAllGather),
  NCCL_FUNCS2A(ncclReduceScatter),
-  NCCL_FUNCS2A(ncclAllReduce),
+  NCCL_FUNCS2C(ncclAllReduce),
  NCCL_COLL_NAME(ncclGather, copy, i8),
  NCCL_COLL_NAME(ncclScatter, copy, i8),
  NCCL_COLL_NAME(ncclAllToAll, copy, i8),
@@ -350,9 +350,14 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(int32_t); }
 __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }
 #endif

+#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
+// Multiply UNROLL by 2 if single source/single destination
+// (every other MINSRCS/MINDSTS combination keeps UNROLL unchanged, so unlike the integer
+// division in the #else branch this factor is never smaller than 1)
+#define AUTOUNROLL (UNROLL*((MINSRCS==1 && MINDSTS==1) ? 2 : 1))
+#else
 // Try to limit consecutive load/stores to 8.
 // Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
 #define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
+#endif

 template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
 __device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
@@ -9,6 +9,8 @@
 #include "argcheck.h"
 #include "coll_net.h"
 #include "../graph/topo.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_ext.h>

 // Only generate inline kernels for LL
 #define NCCL_FUNC5(coll, op, dtype) \
@@ -116,6 +118,10 @@ ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
    STORE(&channel->collectives[(channel->collStart+channel->collCount-1)%NCCL_MAX_OPS].active, 2);
  }

+  { // [RCCL] Wait for any clique-based collectives
+    NCCLCHECK(comm->cliqueManager->WaitForPointers());
+  } // [/RCCL]
+
  // Find the first operation, choose the kernel accordingly and pass it
  // as the first argument.
  struct ncclColl* coll = comm->channels[0].collectives+comm->channels[0].collStart;
@@ -210,7 +216,8 @@ ncclResult_t ncclBarrierEnqueueWait(ncclComm_t comm) {
        (comm->launchMode == ncclComm::GROUP && comm->groupCudaStream) ? "/Stream" : "");
  }

-
+  hipEvent_t startEvent;
+  hipEvent_t stopEvent;
  if (comm->launchMode == ncclComm::PARALLEL) {
    hipLaunchKernelGGL(((void (*)(struct ncclDevComm*))params->func), params->gridDim, params->blockDim, params->sharedMem, params->stream, **((struct ncclDevComm ***)(params->args)));
  } else {
@@ -257,6 +264,7 @@ static ncclResult_t getAlgoInfo(struct ncclInfo* info) {
  info->algorithm = -1;
  info->protocol = -1;
  int nAlgos = NCCL_NUM_ALGORITHMS;
+
  // Check collNet support
  int collNetTypeSupport = 0;
  if (info->comm->collNetSupport)
@@ -373,6 +381,7 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo
 #endif
    return ncclSuccess;
  }
+
  // Set nstepsPerLoop and nchunksPerLoop
  NCCLCHECK(getAlgoInfo(info));
  NCCLCHECK(getPatternInfo(info));
@@ -391,6 +400,33 @@ static ncclResult_t computeColl(struct ncclInfo* info /* input */, struct ncclCo

  coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);

+  { // [RCCL] Check for clique-based kernel support
+    // When the clique manager supports this (collective, count, datatype, op) combination,
+    // the collective is redirected to the clique kernel and the function returns early,
+    // skipping the step/chunk computations that follow this scope.
+    if (info->comm->cliqueManager->IsSupported(info->coll,
+                                               info->count,
+                                               info->datatype,
+                                               info->op))
+    {
+      // Declare the input / output pointers being used (to exchange via IPC with other ranks)
+      NCCLCHECK(info->comm->cliqueManager->DeclarePointers(info->comm->opCount,
+                                                           info->sendbuff,
+                                                           info->recvbuff));
+
+
+      // Override the algorithm / protocol selected earlier in this function
+      info->algorithm = NCCL_ALGO_RING;
+      info->protocol = NCCL_PROTO_CLIQUE;
+      // Determine the number of channels to use for clique-kernel
+      NCCLCHECK(info->comm->cliqueManager->GetNumChannelsToUse(info->coll,
+							       info->count,
+							       info->datatype,
+							       info->op,
+							       info->comm->nChannels,
+							       &coll->args.clique.nChannels));
+      coll->args.clique.count = info->count;
+      // Recompute funcIndex, since it was derived from the previous algorithm / protocol
+      coll->funcIndex = FUNC_INDEX(info->coll, info->op, info->datatype, info->algorithm, info->protocol);
+      return ncclSuccess;
+    }
+  } // [/RCCL]
+
  int stepSize   = info->comm->buffSizes[info->protocol]/NCCL_STEPS;
  int chunkSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->chunkSteps : 1;
  int sliceSteps = (info->protocol == NCCL_PROTO_SIMPLE && info->algorithm == NCCL_ALGO_RING) ? info->sliceSteps : 1;
@@ -478,6 +514,7 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
  info->comm->myParams->blockDim.x = std::max<unsigned>(info->comm->myParams->blockDim.x, info->nThreads);

  int nChannels = info->coll == ncclCollSendRecv ? 1 : coll.args.coll.nChannels;
+  
  int nSubChannels = (info->pattern == ncclPatternCollTreeUp || info->pattern == ncclPatternCollTreeDown) ? 2 : 1;

  for (int bid=0; bid<nChannels*nSubChannels; bid++) {
@@ -519,8 +556,15 @@ ncclResult_t ncclSaveKernel(struct ncclInfo* info) {
      memcpy(c->args.a2av.extra+info->comm->nRanks*2, info->recvcounts, sizeof(size_t*)*(info->comm->nRanks));
      memcpy(c->args.a2av.extra+info->comm->nRanks*3, info->rdispls, sizeof(size_t*)*(info->comm->nRanks));
      c->args.a2av.bid = bid % coll.args.coll.nChannels;
-    } else if (info->coll != ncclCollSendRecv)
+    } else if (info->coll != ncclCollSendRecv) {
      c->args.coll.bid = bid % coll.args.coll.nChannels;
+    }
+
+    // [RCCL] Setup pointers to where all the input/output pointers will be
+    if (info->protocol == NCCL_PROTO_CLIQUE) {
+      NCCLCHECK(info->comm->cliqueManager->SetCliqueCollectiveArgs(&c->args));
+    }
+    // [/RCCL]

    STORE(&c->active, 1);
    opIndex = (opIndex+1)%NCCL_MAX_OPS;
@@ -599,6 +643,7 @@ ncclResult_t ncclEnqueueCheck(struct ncclInfo* info) {
    } else {
      NCCLCHECKGOTO(ncclSaveKernel(info), ret, end);
    }
+
 end:
    if (savedDev != -1) CUDACHECK(hipSetDevice(savedDev));
    ncclAsyncErrCheck(ret);
@@ -414,14 +414,15 @@ ncclResult_t ncclTopoCheckGdr(struct ncclTopoSystem* system, int64_t busId, int

  // Check if we are close enough that it makes sense to enable GDR
  int netGdrLevel = PATH_PXB;
-#ifdef TOPO_EXPL
-  int arch, vendor, model;
-  NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
-  if (arch == NCCL_TOPO_CPU_ARCH_X86 && vendor == NCCL_TOPO_CPU_VENDOR_AMD && model == NCCL_TOPO_CPU_TYPE_ROME)
-    netGdrLevel = PATH_PHB;
-#endif
  NCCLCHECK(ncclGetLevel(&ncclTopoUserGdrLevel, NULL, "NCCL_NET_GDR_LEVEL"));
  if (ncclTopoUserGdrLevel != -2) netGdrLevel = ncclTopoUserGdrLevel;
+  else {
+    int arch, vendor, model;
+    NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
+    if((system->nodes[GPU].nodes[g].id & 0xf0000) == (system->nodes[NET].nodes[n].net.busId & 0xf0000))
+      netGdrLevel = PATH_PHB;
+  }
+
  int distance = gpu->paths[NET][n].type;
  if (distance > netGdrLevel) {
    INFO(NCCL_NET,"GPU Direct RDMA Disabled for GPU %lx / HCA %d (distance %d > %d)", busId, netDev, distance, netGdrLevel);
@@ -29,6 +29,7 @@ struct rcclRomeModel {
  int nNics;
  int nLinks;
  int64_t gpuIds[MAX_ROME_GPUS];
+  int64_t nicIds[MAX_ROME_NICS];
  int64_t gpuNuma[MAX_ROME_GPUS];
  int64_t nicNuma[MAX_ROME_NICS];
  int connMatrix[MAX_ROME_GPUS*MAX_ROME_GPUS];
@@ -39,6 +40,7 @@ struct rcclRomeModel {
 static struct rcclRomeModel rome_model_22 = {
  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 2,
  .gpuIds = { 0x3000, 0x43000, 0x26000, 0xc3000, 0x83000, 0x23000, 0xc6000, 0xa3000, },
+  .nicIds = { 0xe1000, },
  .gpuNuma = { 1, 0, 1, 2, 3, 1, 2, 3, },
  .nicNuma = { 2, },
  .connMatrix = { 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, },
@@ -49,6 +51,7 @@ static struct rcclRomeModel rome_model_22 = {
 static struct rcclRomeModel rome_model_25 = {
  .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { 0x61000, 0xa1000, },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
  .nicNuma = { 0, 3, },
  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -59,6 +62,7 @@ static struct rcclRomeModel rome_model_25 = {
 static struct rcclRomeModel rome_model_27 = {
  .nGpus = 8, .nCpus = 4, .nNics = 2, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { 0x61000, 0xa1000, },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
  .nicNuma = { 0, 3, },
  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -69,6 +73,7 @@ static struct rcclRomeModel rome_model_27 = {
 static struct rcclRomeModel rome_model_29 = {
  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { 0xe1000, },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
  .nicNuma = { 2, },
  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
@@ -79,6 +84,7 @@ static struct rcclRomeModel rome_model_29 = {
 static struct rcclRomeModel rome_model_31 = {
  .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { 0x61000, 0xa1000, },
  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
  .nicNuma = { 0, 6, },
  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -89,6 +95,7 @@ static struct rcclRomeModel rome_model_31 = {
 static struct rcclRomeModel rome_model_33 = {
  .nGpus = 8, .nCpus = 8, .nNics = 2, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { 0x61000, 0xa1000, },
  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
  .nicNuma = { 0, 6, },
  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -99,6 +106,7 @@ static struct rcclRomeModel rome_model_33 = {
 static struct rcclRomeModel rome_model_30 = {
  .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
  .nicNuma = { },
  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -109,6 +117,7 @@ static struct rcclRomeModel rome_model_30 = {
 static struct rcclRomeModel rome_model_32 = {
  .nGpus = 8, .nCpus = 8, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 1, 2, 2, 3, 4, 5, 5, 7, },
  .nicNuma = { },
  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -119,6 +128,7 @@ static struct rcclRomeModel rome_model_32 = {
 static struct rcclRomeModel rome_model_24 = {
  .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
  .nicNuma = { },
  .connMatrix = { 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, },
@@ -129,6 +139,7 @@ static struct rcclRomeModel rome_model_24 = {
 static struct rcclRomeModel rome_model_26 = {
  .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xe3000, 0xc3000, 0xc6000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 2, 3, },
  .nicNuma = { },
  .connMatrix = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, },
@@ -139,6 +150,7 @@ static struct rcclRomeModel rome_model_26 = {
 static struct rcclRomeModel rome_model_23 = {
  .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
  .nicNuma = { },
  .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
@@ -149,6 +161,7 @@ static struct rcclRomeModel rome_model_23 = {
 static struct rcclRomeModel rome_model_38 = {
  .nGpus = 8, .nCpus = 7, .nNics = 0, .nLinks = 2,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
  .nicNuma = { },
  .connMatrix = { 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, },
@@ -159,6 +172,7 @@ static struct rcclRomeModel rome_model_38 = {
 static struct rcclRomeModel rome_model_28 = {
  .nGpus = 8, .nCpus = 4, .nNics = 0, .nLinks = 3,
  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { },
  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
  .nicNuma = { },
  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
@@ -166,6 +180,39 @@ static struct rcclRomeModel rome_model_28 = {
  .ringBase = "0 3 2 1 4 5 6 7|7 6 5 4 1 2 3 0|0 2 5 7 4 6 3 1|1 3 6 4 7 5 2 0",
 };

+// Reference topology model 40: 8 GPUs, 4 CPUs, 1 NIC, nLinks = 3.
+// connMatrix holds nGpus*nGpus (64) entries; ringBase lists two rings separated by '|'.
+static struct rcclRomeModel rome_model_40 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { 0xe1000, },
+  .gpuNuma = { 0, 1, 1, 1, 2, 2, 3, 3, },
+  .nicNuma = { 2, },
+  .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
+  .pattern = "10302120",
+  .ringBase = "6 7 1 4 0 5 3 2|7 6 4 1 0 2 3 5",
+};
+
+// Reference topology model 42: 8 GPUs, 7 CPUs, 1 NIC, nLinks = 3.
+// Uses the same gpuIds, nicIds and connMatrix as rome_model_40 but a different NUMA layout.
+static struct rcclRomeModel rome_model_42 = {
+  .nGpus = 8, .nCpus = 7, .nNics = 1, .nLinks = 3,
+  .gpuIds = { 0x43000, 0x23000, 0x26000, 0x3000, 0xc3000, 0xc6000, 0xa3000, 0x83000, },
+  .nicIds = { 0xe1000, },
+  .gpuNuma = { 1, 2, 2, 3, 5, 5, 6, 7, },
+  .nicNuma = { 4, },
+  .connMatrix = { 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, },
+  .pattern = "00102010012010",
+  .ringBase = "7 4 6 1 3 0 2 5|6 4 7 1 3 2 5 0",
+};
+
+// Reference topology model 44: 8 GPUs, 4 CPUs, 1 NIC, nLinks = 3.
+// connMatrix holds nGpus*nGpus (64) entries; ringBase lists two rings separated by '|'.
+static struct rcclRomeModel rome_model_44 = {
+  .nGpus = 8, .nCpus = 4, .nNics = 1, .nLinks = 3,
+  .gpuIds = { 0x63000, 0x43000, 0x27000, 0x3000, 0xe3000, 0xc3000, 0xa3000, 0x83000, },
+  .nicIds = { 0xc4000, },
+  .gpuNuma = { 0, 0, 1, 1, 2, 2, 3, 3, },
+  .nicNuma = { 2, },
+  .connMatrix = { 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, },
+  .pattern = "20202120",
+  .ringBase = "5 4 7 6 2 1 3 0|5 6 7 4 1 0 2 3",
+};
+
 static struct rcclRomeModel romeTopoModels[] = {
  rome_model_22,
  rome_model_25,
@@ -180,4 +227,7 @@ static struct rcclRomeModel romeTopoModels[] = {
  rome_model_23,
  rome_model_38,
  rome_model_28,
+  rome_model_40,
+  rome_model_42,
+  rome_model_44,
 };
@@ -905,6 +905,7 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
    }
    if (j >= romeTopo->nNics) {
      net_map[j] = i;
+      romeTopo->nicIds[romeTopo->nNics] = system->nodes[NET].nodes[i].net.busId;
      (romeTopo->nNics)++;
      if (romeTopo->nNics >= MAX_ROME_NICS) break;
    }
@@ -941,6 +942,9 @@ static ncclResult_t parseRomeSystem(struct ncclTopoSystem* system, struct rcclRo
    fprintf(file, "  .gpuIds = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "0x%lx, ", romeTopo->gpuIds[i]);
    fprintf(file, "},\n");
+    fprintf(file, "  .nicIds = { ");
+    for (int i = 0; i < romeTopo->nNics; i ++) fprintf(file, "0x%lx, ", romeTopo->nicIds[i]);
+    fprintf(file, "},\n");
    fprintf(file, "  .gpuNuma = { ");
    for (int i = 0; i < romeTopo->nGpus; i ++) fprintf(file, "%ld, ", romeTopo->gpuNuma[i]);
    fprintf(file, "},\n");
@@ -1038,13 +1042,18 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopo
  }

  char line[1024];
+#ifdef ENABLE_TRACE
  sprintf(line, "Found matching Rome model index %d in %.2fms (%d iter) with GPU mapping: ", i, t, time);
+#else
+  sprintf(line, "Found matching Rome model index %d with GPU mapping: ", i);
+#endif
  int offset = strlen(line);
  for (int k = 0; k < ngpus; k++) {
    sprintf(line+offset, "%d ", g[k]);
    offset = strlen(line);
  }
  INFO(NCCL_GRAPH, "%s", line);
+
  // create 4P2H based on reference and remapped ids
  NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, g, romeTopo.nNics, net_map));
  return ncclSuccess;
@@ -266,7 +266,7 @@ ncclResult_t ncclTopoSortSystem(struct ncclTopoSystem* system) {
  return ncclSuccess;
 }

-ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int64_t busId) {
  int dev;
  NCCLCHECK(xmlGetAttrInt(xmlNet, "dev", &dev));

@@ -286,6 +286,7 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
  if (xmlGetAttrInt(xmlNet, "gdr", &net->net.gdrSupport) != ncclSuccess) net->net.gdrSupport = 0;
  if (xmlGetAttrInt(xmlNet, "maxconn", &net->net.maxChannels) != ncclSuccess) net->net.maxChannels = MAXCHANNELS;
  if (ncclCollNet && xmlGetAttrInt(xmlNet, "coll", &net->net.collSupport) != ncclSuccess) net->net.collSupport = 0;
+  net->net.busId = busId;
  ncclDebugNoWarn = 0;

  NCCLCHECK(ncclTopoConnectNodes(nic, net, LINK_NET, net->net.width));
@@ -293,14 +294,14 @@ ncclResult_t ncclTopoAddNet(struct ncclXmlNode* xmlNet, struct ncclTopoSystem* s
  return ncclSuccess;
 }

-ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic) {
+ncclResult_t ncclTopoAddNic(struct ncclXmlNode* xmlNic, struct ncclTopoSystem* system, struct ncclTopoNode* nic, int64_t busId) {
  for (int s=0; s<xmlNic->nSubs; s++) {
    struct ncclXmlNode* xmlNet = xmlNic->subs[s];
    if (strcmp(xmlNet->name, "net") != 0) continue;
    int index;
    NCCLCHECK(xmlGetAttrIndex(xmlNet, "dev", &index));
    if (index == -1) continue;
-    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic));
+    NCCLCHECK(ncclTopoAddNet(xmlNet, system, nic, busId));
  }
  return ncclSuccess;
 }
@@ -354,7 +355,7 @@ ncclResult_t ncclTopoAddPci(struct ncclXmlNode* xmlPci, struct ncclTopoSystem* s
      NCCLCHECK(ncclTopoCreateNode(system, &nicNode, type, busId));
      node = nicNode; // Connect it to parent later on
    }
-    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode));
+    NCCLCHECK(ncclTopoAddNic(xmlNic, system, nicNode, busId));
  } else if (type == PCI) {
    NCCLCHECK(ncclTopoCreateNode(system, &node, type, busId));
    for (int s=0; s<xmlPci->nSubs; s++) {
@@ -421,7 +422,7 @@ ncclResult_t ncclTopoAddCpu(struct ncclXmlNode* xmlCpu, struct ncclTopoSystem* s
        NCCLCHECK(ncclTopoConnectNodes(cpu, nic, LINK_PCI, LOC_WIDTH));
        NCCLCHECK(ncclTopoConnectNodes(nic, cpu, LINK_PCI, LOC_WIDTH));
      }
-      NCCLCHECK(ncclTopoAddNic(node, system, nic));
+      NCCLCHECK(ncclTopoAddNic(node, system, nic, 0));
    }
  }
  return ncclSuccess;
@@ -100,6 +100,7 @@ struct ncclTopoNode {
      int gdrSupport;
      int collSupport;
      int maxChannels;
+      int64_t busId;
    }net;
    struct {
      int arch;
@@ -12,7 +12,7 @@
 ncclResult_t bootstrapNetInit();
 ncclResult_t bootstrapCreateRoot(ncclUniqueId* commId, bool idFromEnv);
 ncclResult_t bootstrapGetUniqueId(ncclUniqueId* out);
-ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState);
+ncclResult_t bootstrapInit(ncclUniqueId* id, int rank, int nranks, void** commState, int* rootPid); // [RCCL] Adding rootPid
 ncclResult_t bootstrapAllGather(void* commState, void* allData, int size);
 ncclResult_t bootstrapSend(void* commState, int peer, void* data, int size);
 ncclResult_t bootstrapRecv(void* commState, int peer, void* data, int size);
@@ -10,6 +10,9 @@

 #include "transport.h"
 #include "p2p.h"
+// [RCCL]
+#include "clique/CliqueManager.h"
+// [/RCCL]

 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
 #else
@@ -143,8 +146,12 @@ struct ncclComm {
  //list of async p2p operation queued in a group semantics
  struct ncclP2Plist p2plist;

-  // RCCL AllToAll/Scatter/Gather API
-  bool alltoallDisable;
+  // [RCCL]
+  bool alltoallDisable;            // RCCL AllToAll/Scatter/Gather API
+  CliqueManager* cliqueManager;    // CliqueManager handles pointer collection / distribution for clique-based kernels
+  int rootPid;                     // Process ID of root
+  // [/RCCL]
+
 };

 #endif
@@ -12,6 +12,9 @@
 #include "rccl_bfloat16.h"
 #include "align.h"
 #include <stdint.h>
+// [RCCL] Support for clique-based kernels
+#include "clique/CliqueCommon.h"
+// [/RCCL]

 // Convert volatile access to atomic
 #if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
@@ -22,6 +25,7 @@
 #define STORE(DST, SRC) *(DST) = (SRC)
 #endif

+
 #define NCCL_NUM_FUNCTIONS 5 // SendRecv not included for now
 typedef enum { ncclCollBroadcast, ncclCollReduce, ncclCollAllGather, ncclCollReduceScatter, ncclCollAllReduce, ncclCollGather, ncclCollScatter, ncclCollAllToAll, ncclCollAllToAllv, ncclCollSendRecv} ncclFunc_t;
 extern const char* ncclFuncStr[NCCL_NUM_FUNCTIONS+4];
@@ -35,6 +39,7 @@ extern const char* ncclAlgoStr[NCCL_NUM_ALGORITHMS];
 #define NCCL_NUM_PROTOCOLS 3 // Simple/LL/LL128
 #define NCCL_PROTO_LL 0
 #define NCCL_PROTO_LL128 1
+#define NCCL_PROTO_CLIQUE 1  // [RCCL] Clique takes up same protocol as unused LL128
 #define NCCL_PROTO_SIMPLE 2
 extern const char* ncclProtoStr[NCCL_NUM_PROTOCOLS];

@@ -190,8 +195,18 @@ struct CollectiveArgs {
      size_t count;
      size_t* extra;
    } a2av;
+    // [RCCL] Clique-based arguments
+    struct {
+      uint16_t nThreads;
+      uint8_t bid;
+      uint8_t nChannels;
+      size_t count;
+      cliqueDevicePtrs_t* ptrs;
+    } clique;
+    // [/RCCL]
  };
 };
+
 struct ncclColl {
  union {
    struct {
@@ -28,6 +28,10 @@
 #include <unistd.h>
 #include "graph/topo.h"

+// [RCCL]
+#include "clique/CliqueManager.h"
+// [/RCCL]
+
 #define STR2(v) #v
 #define STR(v) STR2(v)

@@ -299,7 +303,8 @@ static ncclResult_t commFree(ncclComm_t comm) {
  return ncclSuccess;
 }

-RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 0);
+RCCL_PARAM(AllToAllDisable, "ALLTOALL_KERNEL_DISABLE", 1);
+RCCL_PARAM(ForceEnableClique, "FORCE_ENABLE_CLIQUE", 0);

 static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
  if (ndev < 1) {
@@ -678,7 +683,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
  int nranks = comm->nRanks;
  uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
  TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
-  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
+  // [RCCL] Collect the PID of the root
+  int rootPid;
+  NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap, &rootPid));
+  // [/RCCL]

  // AllGather1 - begin
  struct {
@@ -996,36 +1004,84 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
      connect->nsend[c] = 0;
    }
  }
-  // We should have allocated all buffers, collective fifos, ... we can
-  // restore the affinity.
-affinity_restore:
-  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
-  if (ret != ncclSuccess) return ret;

  // Compute intra ranks (using AllGather1 data)
-  int intraRank0 = -1, intraRank = -1, intraRanks = 0;
-  for (int i = 0; i < nranks; i++) {
-    if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
-        (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
-      if (intraRanks == 0) intraRank0 = i;
-      if (i == rank) intraRank = intraRanks;
-      intraRanks++;
+  do {
+    int intraRank0 = -1, intraRank = -1, intraRanks = 0;
+    for (int i = 0; i < nranks; i++) {
+      if ((allGather1Data[i].peerInfo.hostHash == allGather1Data[rank].peerInfo.hostHash) &&
+          (allGather1Data[i].peerInfo.pidHash == allGather1Data[rank].peerInfo.pidHash)) {
+        if (intraRanks == 0) intraRank0 = i;
+        if (i == rank) intraRank = intraRanks;
+        intraRanks++;
+      }
    }
-  }
-  TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+    TRACE(NCCL_INIT,"hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
        rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-  if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
-    WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
-         rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
-    return ncclInternalError;
-  }
-  NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
+    if (intraRank == -1 || intraRank0 == -1 || allGather1Data[intraRank0].comm == NULL) {
+      WARN("Failed to determine intra ranks hostHash[%d] %lx intraRank %d intraRanks %d intraRank0 %d",
+          rank, allGather1Data[rank].peerInfo.hostHash, intraRank, intraRanks, intraRank0);
+      return ncclInternalError;
+    }
+    NCCLCHECK(ncclCommSetIntra(comm, intraRank, intraRanks, allGather1Data[intraRank0].comm));
+
+    { // [RCCL] Check if clique-based kernels can be enabled and initialize CliqueManager if so
+      CliqueManager::cliqueMode_t cliqueMode = CliqueManager::CLIQUE_DISABLED;
+      if (comm->localRanks == comm->nRanks)
+      {
+        // Check that all the GPUs have peer access to one another
+        bool hasPeerAccess = true;
+        for (int i = 0; i < nranks && hasPeerAccess; i++)
+        {
+          int cudaDev1 = allGather1Data[i].peerInfo.cudaDev;
+          for (int j = 0; j < nranks; j++)
+          {
+            if (i == j) continue;
+            int cudaDev2 = allGather1Data[j].peerInfo.cudaDev;
+            int p2p;
+            if (hipDeviceCanAccessPeer(&p2p, cudaDev1, cudaDev2) != hipSuccess || !p2p)
+            {
+              hasPeerAccess = false;
+              break;
+            }
+          }
+        }
+        if (hasPeerAccess)
+        {
+          if (intraRanks == nranks)
+            cliqueMode = CliqueManager::CLIQUE_SINGLE_PROCESS;
+          else
+            cliqueMode = CliqueManager::CLIQUE_SINGLE_NODE;
+        }
+
+        // For now, only enable clique-based kernels on CR8_G topologies, unless explicitly asked
+        if (!rcclParamForceEnableClique())
+        {
+          // Disable clique-kernel support if not on CR8 topology
+          if (!(comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G))
+          {
+            INFO(NCCL_INIT, "Disabling clique-based kernels due to topology (force enable with RCCL_FORCE_ENABLE_CLIQUE)");
+            cliqueMode = CliqueManager::CLIQUE_DISABLED;
+          }
+        }
+      }
+      comm->cliqueManager = new CliqueManager(rank, nranks, cliqueMode);
+      NCCLCHECK(comm->cliqueManager->Init(commId, rootPid));
+    } // [/RCCL]
+  } while(0);
+

  // Done with AllGather1 data
  free(allGather1Data);

  if (comm->nNodes) NCCLCHECK(ncclProxyCreate(comm));

+  // We should have allocated all buffers, collective fifos, ... we can
+  // restore the affinity.
+affinity_restore:
+  sched_setaffinity(0, sizeof(cpu_set_t), &affinitySave);
+  if (ret != ncclSuccess) return ret;
+
  TRACE(NCCL_INIT, "rank %d nranks %d - DONE", rank, nranks);
  return ncclSuccess;
 }
@@ -1144,6 +1200,10 @@ ncclResult_t ncclCommDestroy(ncclComm_t comm) {
    return ncclInvalidArgument;
  }

+  // [RCCL] Delete CliqueManager if it exists
+  if (comm->cliqueManager) delete comm->cliqueManager;
+  // [/RCCL]
+
  return commDestroy(comm);
 }

@@ -73,10 +73,10 @@ ncclResult_t ArgsCheck(struct ncclInfo* info) {
      }
    } else {
      // Check CUDA device pointers
-      if (info->coll != ncclCollBroadcast || info->comm->rank == info->root) {
+      if ((info->coll != ncclCollBroadcast && info->coll != ncclCollScatter) || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->sendbuff, info->comm, "sendbuff", info->opName));
      }
-      if (info->coll != ncclCollReduce || info->comm->rank == info->root) {
+      if ((info->coll != ncclCollReduce && info->coll != ncclCollGather) || info->comm->rank == info->root) {
        NCCLCHECK(CudaPtrCheck(info->recvbuff, info->comm, "recvbuff", info->opName));
      }
    }
@@ -418,23 +418,24 @@ namespace CorrectnessTests
                        switch (dataset.dataType)
                        {
                        case ncclInt8:
-                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI1[j], expectedI1[j], i, j); break;
+                            printf("Expected %d.  Output %d on device %d[%d]\n", expectedI1[j], outputI1[j], i, j);
+                            break;
                        case ncclUint8:
-                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU1[j], expectedU1[j], i, j); break;
+                            printf("Expected %u.  Output %u on device %d[%d]\n", expectedU1[j], outputU1[j], i, j); break;
                        case ncclInt32:
-                            printf("Expected %d.  Output %d on device %d[%d]\n", outputI4[j], expectedI4[j], i, j); break;
+                            printf("Expected %d.  Output %d on device %d[%d]\n", expectedI4[j], outputI4[j], i, j); break;
                        case ncclUint32:
-                            printf("Expected %u.  Output %u on device %d[%d]\n", outputU4[j], expectedU4[j], i, j); break;
+                            printf("Expected %u.  Output %u on device %d[%d]\n", expectedU4[j], outputU4[j], i, j); break;
                        case ncclInt64:
-                            printf("Expected %ld.  Output %ld on device %d[%d]\n", outputI8[j], expectedI8[j], i, j); break;
+                            printf("Expected %ld.  Output %ld on device %d[%d]\n", expectedI8[j], outputI8[j], i, j); break;
                        case ncclUint64:
-                            printf("Expected %lu.  Output %lu on device %d[%d]\n", outputU8[j], expectedU8[j], i, j); break;
+                            printf("Expected %lu.  Output %lu on device %d[%d]\n", expectedU8[j], outputU8[j], i, j); break;
                        case ncclFloat32:
-                            printf("Expected %f.  Output %f on device %d[%d]\n", outputF4[j], expectedF4[j], i, j); break;
+                            printf("Expected %f.  Output %f on device %d[%d]\n", expectedF4[j], outputF4[j], i, j); break;
                        case ncclFloat64:
-                            printf("Expected %lf.  Output %lf on device %d[%d]\n", outputF8[j], expectedF8[j], i, j); break;
+                            printf("Expected %lf.  Output %lf on device %d[%d]\n", expectedF8[j], outputF8[j], i, j); break;
                        case ncclBfloat16:
-                            printf("Expected %f.  Output %f on device %d[%d]\n", (float)outputB2[j], (float)expectedB2[j], i, j); break;
+                            printf("Expected %f.  Output %f on device %d[%d]\n", (float)expectedB2[j], (float)outputB2[j], i, j); break;
                        default:
                            fprintf(stderr, "[ERROR] Unsupported datatype\n");
                            exit(0);
@@ -58,6 +58,6 @@ namespace CorrectnessTests
                                testing::Values(2,3,4,5,6,7,8),
                                // In-place or not
                                testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                            CorrectnessTest::PrintToStringParamName());
 } // namespace
@@ -96,6 +96,6 @@ namespace CorrectnessTests
                                testing::Values(2,3,4,5,6,7,8),
                                // In-place or not
                                testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                            CorrectnessTest::PrintToStringParamName());
 } // namespace
@@ -116,6 +116,6 @@ namespace CorrectnessTests
                                testing::Values(2,3,4,5,6,7,8),
                                // In-place or not
                                testing::Values(false, true),
-                                testing::Values("")),
+                                testing::Values("RCCL_ENABLE_CLIQUE=0", "RCCL_ENABLE_CLIQUE=1")),
                             CorrectnessTest::PrintToStringParamName());
 } // namespace
@@ -0,0 +1,257 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <sys/socket.h>
+#include <ifaddrs.h>
+#include <netdb.h>
+#include <unistd.h>
+#include <cstdio>
+#include <string>
+#include <chrono>
+#include <hip/hip_runtime.h>
+#include <rccl.h>
+#include "HelloRccl.hpp"
+
+
+void Usage(char *argv0);
+void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm);
+
+/**
+ * Entry point for the HelloRccl AllReduce example.
+ *
+ * Multi-process mode  (NCCL_COMM_ID set, arguments: numRanks rank):
+ *   this process drives exactly one rank on device index `rank`.
+ * Single-process mode (arguments: numRanks):
+ *   this process drives all ranks through ncclCommInitAll.
+ *
+ * Returns 0 on success and 1 (after printing usage) when the arguments are
+ * missing or out of range.
+ */
+int main(int argc, char **argv)
+{
+  if (getenv("NCCL_COMM_ID") && argc == 3) // Run in multi-process mode
+  {
+    int nranks   = atoi(argv[1]);
+    int rank     = atoi(argv[2]);
+
+    // atoi() yields 0 for non-numeric text; reject anything that cannot be a
+    // valid (numRanks, rank) pair before touching the GPU or the communicator
+    if (nranks < 1 || rank < 0 || rank >= nranks)
+    {
+      Usage(argv[0]);
+      return 1;
+    }
+    if (rank == 0) printf("Running in multi-process mode\n");
+
+    // Create communicator for this rank
+    ncclUniqueId commId;
+    NCCL_CALL(ncclGetUniqueId(&commId));
+
+    // Initialize communicator
+    ncclComm_t comm;
+    HIP_CALL(hipSetDevice(rank));
+    NCCL_CALL(ncclCommInitRank(&comm, nranks, commId, rank));
+
+    // Run the test
+    ExecuteTest(1, rank, nranks, &comm);
+  }
+  else if (argc == 2) // Run in single-process mode
+  {
+    int nranks   = atoi(argv[1]);
+
+    // A non-positive count would give the communicator array below an invalid size
+    if (nranks < 1)
+    {
+      Usage(argv[0]);
+      return 1;
+    }
+    printf("Running in single-process mode\n");
+
+    // Initialize communicators for each rank
+    ncclComm_t comm[nranks];
+    NCCL_CALL(ncclCommInitAll(comm, nranks, NULL));
+
+    // Run the test
+    ExecuteTest(nranks, 0, nranks, comm);
+  }
+  else
+  {
+    Usage(argv[0]);
+    return 1;
+  }
+  return 0;
+}
+
+/**
+ * Runs a timed ncclAllReduce (sum of floats) sweep over power-of-two element
+ * counts for the ranks driven by this process, validating every result.
+ *
+ * numIntraRank     - number of ranks driven by this process
+ * intraRankStartId - device index of this process's first rank; local rank r
+ *                    uses device (intraRankStartId + r)
+ * numTotalRanks    - total number of ranks; every input element is 1.0f, so
+ *                    this is also the expected value of every output element
+ * comm             - array of numIntraRank communicators; each one is
+ *                    destroyed before this function returns
+ *
+ * Exits the process with status 1 if host memory cannot be allocated or if a
+ * result fails validation.
+ */
+void ExecuteTest(int numIntraRank, int intraRankStartId, int numTotalRanks, ncclComm_t* comm)
+{
+  // Test configuration settings
+  int minPow        = 10;      // Starting power of 2 input size
+  int maxPow        = 28;      // Ending power of 2 input size
+  int numWarmups    =  3;      // Number of untimed warmup iterations
+  int numIterations = 10;      // Number of timed iterations
+
+  // Allocate GPU resources for this process
+  hipStream_t stream[numIntraRank];
+  hipEvent_t  startEvent[numIntraRank];
+  hipEvent_t  stopEvent[numIntraRank];
+  for (int i = 0; i < numIntraRank; i++)
+  {
+    HIP_CALL(hipSetDevice(intraRankStartId + i));
+    HIP_CALL(hipStreamCreate(&stream[i]));
+    HIP_CALL(hipEventCreate(&startEvent[i]));
+    HIP_CALL(hipEventCreate(&stopEvent[i]));
+  }
+
+  // Only the process that owns device 0 prints the table
+  if (intraRankStartId == 0)
+  {
+    printf("AllReduce Performance (sum of floats):\n");
+    printf("%10s %10s %10s\n", "Bytes", "CpuTime(ms)", "GpuTime(ms)");
+  }
+
+  // Loop over power-of-two input sizes
+  for (int power = minPow; power <= maxPow; power++)
+  {
+    int N = 1 << power;
+    size_t const numBytes = N * sizeof(float);
+
+    // Allocate GPU memory
+    float *iputGpu[numIntraRank], *oputGpu[numIntraRank];
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipSetDevice(intraRankStartId + r));
+      HIP_CALL(hipMalloc((void **)&iputGpu[r], numBytes));
+      HIP_CALL(hipMalloc((void **)&oputGpu[r], numBytes));
+    }
+
+    // Allocate CPU memory for input/output
+    float *iputCpu = (float *)malloc(numBytes);
+    float *oputCpu = (float *)malloc(numBytes);
+    if (iputCpu == NULL || oputCpu == NULL)
+    {
+      // Both buffers are written immediately below, so a failed allocation must stop here
+      printf("[ERROR] Unable to allocate host memory for N = %d\n", N);
+      exit(1);
+    }
+
+    // Fill CPU memory with a simple pattern
+    for (int i = 0; i < N; i++)
+    {
+      iputCpu[i] = 1.0f;
+      oputCpu[i] = 0.0f;
+    }
+
+    // Copy the input from CPU memory to GPU memory
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipSetDevice(intraRankStartId + r));
+      HIP_CALL(hipMemcpy(iputGpu[r], iputCpu, numBytes, hipMemcpyHostToDevice));
+    }
+
+    // Perform some untimed initial warmup iterations
+    for (int iteration = 0; iteration < numWarmups; iteration++)
+    {
+      NCCL_CALL(ncclGroupStart());
+      for (int r = 0; r < numIntraRank; r++)
+      {
+        HIP_CALL(hipSetDevice(intraRankStartId + r));
+        NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
+      }
+      NCCL_CALL(ncclGroupEnd());
+    }
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipStreamSynchronize(stream[r]));
+
+    // Perform timed iterations
+    auto cpuStart = std::chrono::high_resolution_clock::now();
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipEventRecord(startEvent[r], stream[r]));
+
+    for (int iteration = 0; iteration < numIterations; iteration++)
+    {
+      NCCL_CALL(ncclGroupStart());
+      for (int r = 0; r < numIntraRank; r++)
+      {
+        HIP_CALL(hipSetDevice(intraRankStartId + r));
+        NCCL_CALL(ncclAllReduce(iputGpu[r], oputGpu[r], N, ncclFloat, ncclSum, comm[r], stream[r]));
+      }
+      NCCL_CALL(ncclGroupEnd());
+    }
+
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipEventRecord(stopEvent[r], stream[r]));
+
+    for (int r = 0; r < numIntraRank; r++)
+      HIP_CALL(hipStreamSynchronize(stream[r]));
+
+    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
+    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(cpuDelta).count();
+
+    // GPU time is taken from this process's first local rank only
+    float totalGpuTime;
+    HIP_CALL(hipEventElapsedTime(&totalGpuTime, startEvent[0], stopEvent[0]));
+
+    // numBytes is a size_t, so it is printed with %zu
+    if (intraRankStartId == 0) printf("%10zu %10.3f %10.3f\n", numBytes, (totalCpuTime / numIterations), (totalGpuTime / numIterations));
+
+    // Validate results
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipMemcpy(oputCpu, oputGpu[r], numBytes, hipMemcpyDeviceToHost));
+      bool isOK = true;
+      int expected = numTotalRanks;
+      for (int i = 0; i < N; i++)
+      {
+        isOK &= (oputCpu[i] == expected);
+      }
+      if (!isOK)
+      {
+        printf("[ERROR] Rank %d Incorrect results for N = %d\n", intraRankStartId + r, N);
+        NCCL_CALL(ncclCommDestroy(comm[r]));
+        exit(1);
+      }
+    }
+
+    // Release GPU resources
+    for (int r = 0; r < numIntraRank; r++)
+    {
+      HIP_CALL(hipFree(oputGpu[r]));
+      HIP_CALL(hipFree(iputGpu[r]));
+    }
+    free(iputCpu);
+    free(oputCpu);
+  }
+
+  // Tear down the per-rank streams, events and communicators
+  for (int r = 0; r < numIntraRank; r++)
+  {
+    HIP_CALL(hipStreamDestroy(stream[r]));
+    HIP_CALL(hipEventDestroy(startEvent[r]));
+    HIP_CALL(hipEventDestroy(stopEvent[r]));
+    NCCL_CALL(ncclCommDestroy(comm[r]));
+  }
+}
+
+/**
+ * Prints command-line usage, followed by candidate NCCL_COMM_ID settings for
+ * using this machine as the root process: one built from the hostname and one
+ * per IPv4 / non-loopback IPv6 interface address, all with port 12345.
+ *
+ * argv0 - program name shown in the usage lines
+ *
+ * Any hostname or interface lookup that fails is skipped rather than printed.
+ */
+void Usage(char *argv0)
+{
+  printf("Single Process Usage: %s numRanks\n", argv0);
+  printf("\n");
+  printf("Multi Process Usage: %s numRanks rank\n", argv0);
+  printf(" - NCCL_COMM_ID must be set in order to use this\n\n");
+  printf(" - To use this process as the root process you may use any of the following:\n");
+
+  char hostname[256];
+  if (gethostname(hostname, sizeof(hostname)) == 0)
+  {
+    // A truncated name is not guaranteed to be terminated, so terminate it explicitly
+    hostname[sizeof(hostname) - 1] = '\0';
+    printf("    export NCCL_COMM_ID=%s:12345\n", hostname);
+  }
+
+  // Loop over linked list of interfaces
+  struct ifaddrs *ifaddr = NULL;
+  if (getifaddrs(&ifaddr) != 0) return;  // No list was produced: nothing to walk or free
+  for (struct ifaddrs* ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next)
+  {
+    // Interfaces without an assigned address report a NULL ifa_addr
+    if (ifa->ifa_addr == NULL) continue;
+
+    // Skip anything not based on IPv4 / IPv6
+    int family = ifa->ifa_addr->sa_family;
+    if (family != AF_INET && family != AF_INET6) continue;
+
+    // Skip IPv6 loopback interface
+    if (family == AF_INET6)
+    {
+      struct sockaddr_in6* sa = (struct sockaddr_in6*)(ifa->ifa_addr);
+      if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
+    }
+
+    socklen_t saLen = (family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6));
+    char host[NI_MAXHOST];
+
+    // Only the numeric host text is needed; skip addresses that cannot be formatted
+    if (getnameinfo(ifa->ifa_addr, saLen, host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST) != 0) continue;
+
+    // NOTE(review): IPv6 literals are printed as "addr:12345"; confirm whether
+    // NCCL_COMM_ID parsing expects the bracketed "[addr]:12345" form instead
+    printf("    export NCCL_COMM_ID=%s:12345\n", host);
+  }
+  freeifaddrs(ifaddr);
+}
@@ -0,0 +1,49 @@
+/*
+Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HELLORCCL_HPP
+#define HELLORCCL_HPP
+#include <cstdlib>   // std::exit
+#include <iostream>
+
+// Evaluates a HIP runtime call once; if it does not return hipSuccess, reports
+// the error string with file/line on stderr and terminates via exit(-1).
+// The status variable carries a macro-specific name so that a `cmd` which
+// itself mentions a variable called `error` is not shadowed.
+#define HIP_CALL(cmd)                                                   \
+  do {                                                                  \
+    hipError_t hipCallStatus_ = (cmd);                                  \
+    if (hipCallStatus_ != hipSuccess)                                   \
+    {                                                                   \
+      std::cerr << "Encountered HIP error (" << hipGetErrorString(hipCallStatus_) << ") at line " \
+                << __LINE__ << " in file " << __FILE__ << "\n";         \
+      std::exit(-1);                                                    \
+    }                                                                   \
+  } while (0)
+
+// Same contract as HIP_CALL, for calls returning ncclResult_t.
+#define NCCL_CALL(cmd)                                                  \
+  do {                                                                  \
+    ncclResult_t ncclCallStatus_ = (cmd);                               \
+    if (ncclCallStatus_ != ncclSuccess)                                 \
+    {                                                                   \
+      std::cerr << "Encountered NCCL error (" << ncclGetErrorString(ncclCallStatus_) << ") at line " \
+                << __LINE__ << " in file " << __FILE__ << "\n";         \
+      std::exit(-1);                                                    \
+    }                                                                   \
+  } while (0)
+
+#endif
@@ -0,0 +1,21 @@
+# Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved.
+
+# Set to where RCCL is installed
+RCCL_INSTALL=../../build/release
+
+# Locate HIP: keep a caller-supplied HIP_PATH, otherwise use /opt/rocm/hip when
+# it exists, otherwise fall back to a location relative to this directory
+HIP_PATH?= $(wildcard /opt/rocm/hip)
+ifeq (,$(HIP_PATH))
+HIP_PATH=../../..
+endif
+HIPCC=$(HIP_PATH)/bin/hipcc
+
+EXE=HelloRccl
+# Compile flags are kept apart from link flags so the library can be named
+# after the source file on the link line
+CXXFLAGS = -std=c++11 -O3 -I../../src/include -I$(RCCL_INSTALL)
+LDFLAGS  = -L$(RCCL_INSTALL)
+LDLIBS   = -lrccl
+
+# Neither target produces a file with its own name
+.PHONY: all clean
+
+all: $(EXE)
+
+# Rebuild when the source or any header in this directory changes
+$(EXE): $(EXE).cpp $(wildcard *.hpp)
+	$(HIPCC) $(CXXFLAGS) $(LDFLAGS) $< -o $@ $(LDLIBS)
+
+clean:
+	rm -f *.o $(EXE)
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Runs HelloRccl with 4 ranks in single-process and multi-process mode, each
+# with and without the clique-based kernel environment settings.
+RCCL_INSTALL=../../build/release
+EXE=$PWD/HelloRccl
+# Append the RCCL build directory; the ${VAR:+...} form avoids a leading ':'
+# (an implicit current-directory entry) when LD_LIBRARY_PATH is unset or empty
+LDPATH=${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$RCCL_INSTALL
+
+NUM_RANKS=4
+COMM_ID=$HOSTNAME:12345
+CLIQUE_ENV=(RCCL_CLIQUE_ALLREDUCE_BYTE_LIMIT=1073741824 RCCL_FORCE_ENABLE_CLIQUE=1 NCCL_DEBUG=INFO RCCL_ENABLE_CLIQUE=1)
+
+# run_multi [VAR=value ...]
+# Launches one HelloRccl process per rank with the given extra environment,
+# then waits for all of them, so the next group cannot start while an earlier
+# rank is still running on the same NCCL_COMM_ID port.
+run_multi() {
+  local rank
+  for ((rank = 0; rank < NUM_RANKS; rank++)); do
+    env "$@" NCCL_COMM_ID="$COMM_ID" LD_LIBRARY_PATH="$LDPATH" "$EXE" "$NUM_RANKS" "$rank" &
+  done
+  wait
+}
+
+echo "Single process - With clique-based kernels:"
+env "${CLIQUE_ENV[@]}" LD_LIBRARY_PATH="$LDPATH" "$EXE" "$NUM_RANKS"
+
+echo "Single process - Without clique-based kernels:"
+NCCL_DEBUG=INFO LD_LIBRARY_PATH="$LDPATH" "$EXE" "$NUM_RANKS"
+
+echo "With clique-based kernels:"
+run_multi "${CLIQUE_ENV[@]}"
+
+echo "Without clique-based kernels:"
+run_multi
@@ -6,7 +6,7 @@ endif
 HIPCC=$(HIP_PATH)/bin/hipcc

 EXE=TransferBench
-CXXFLAGS = -O3 -fopenmp -I../../src/include -I.
+CXXFLAGS = -O3 -I../../src/include -I.

 all: $(EXE)

@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include "TransferBench.hpp"

 // Simple configuration parameters
-size_t const DEFAULT_BYTES_PER_LINK = (1<<28);
+size_t const DEFAULT_BYTES_PER_LINK = (1<<26);
 int    const DEFAULT_NUM_WARMUPS    = 3;
 int    const DEFAULT_NUM_ITERATIONS = 10;

@@ -40,6 +40,27 @@ int main(int argc, char **argv)
    exit(0);
  }

+  // If a negative value is listed for N, generate a comprehensive config file for this node
+  if (argc > 2 && atoi(argv[2]) < 0)
+  {
+    GenerateConfigFile(argv[1], -1*atoi(argv[2]));
+    exit(0);
+  }
+
+  // Collect environment variables / display current run configuration
+  bool useHipCall      = getenv("USE_HIP_CALL");       // Use hipMemcpy/hipMemset instead of custom shader kernels
+  bool useMemset       = getenv("USE_MEMSET");         // Perform a memset instead of a copy (ignores source memory)
+  bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM");  // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
+  bool useSingleSync   = getenv("USE_SINGLE_SYNC");    // Perform synchronization only once after all iterations instead of per iteration
+  bool useInteractive  = getenv("USE_INTERACTIVE");    // Pause for user-input before starting transfer loop
+  bool useSleep        = getenv("USE_SLEEP");          // Adds a 100ms sleep after each synchronization
+  bool reuseStreams    = getenv("REUSE_STREAMS");      // Re-use streams instead of creating / destroying per test
+  bool showAddr        = getenv("SHOW_ADDR");          // Print out memory addresses for each Link
+  bool outputToCsv     = getenv("OUTPUT_TO_CSV");      // Output in CSV format
+  int  byteOffset      = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
+  int  numWarmups      = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
+  int  numIterations   = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
+
  // Determine number of bytes to run per link
  // If a non-zero number of bytes is specified, use it
  // Otherwise generate array of bytes values to execute over
@@ -55,12 +76,10 @@ int main(int argc, char **argv)
  if (numBytesPerLink != 0)
  {
    size_t N = numBytesPerLink / sizeof(float);
-    printf("Operating on %zu bytes per link (%zu floats)\n", numBytesPerLink, N);
    valuesOfN.push_back(N);
  }
  else
  {
-    printf("Operating on range of sizes\n");
    for (int N = 256; N <= (1<<27); N *= 2)
    {
      int decimationFactor = 1;  // This can be modified to increase number of samples between powers of two
@@ -74,19 +93,6 @@ int main(int argc, char **argv)
    }
  }

-  // Collect environment variables / display current run configuration
-  bool useHipCall      = getenv("USE_HIP_CALL");       // Use hipMemcpy/hipMemset instead of custom shader kernels
-  bool useMemset       = getenv("USE_MEMSET");         // Perform a memset instead of a copy (ignores source memory)
-  bool useFineGrainMem = getenv("USE_FINEGRAIN_MEM");  // Allocate fine-grained GPU memory instead of coarse-grained GPU memory
-  bool useSingleSync   = getenv("USE_SINGLE_SYNC");    // Perform synchronization only once after all iterations instead of per iteration
-  bool useInteractive  = getenv("USE_INTERACTIVE");    // Pause for user-input before starting transfer loop
-  bool useSleep        = getenv("USE_SLEEP");          // Adds a 100ms sleep after each synchronization
-  bool reuseStreams    = getenv("REUSE_STREAMS");      // Re-use streams instead of creating / destroying per test
-  bool showAddr        = getenv("SHOW_ADDR");          // Print out memory addresses for each Link
-  int  byteOffset      = getenv("BYTE_OFFSET") ? atoi(getenv("BYTE_OFFSET")) : 0; // Byte-offset for memory allocations
-  int  numWarmups      = getenv("NUM_WARMUPS") ? atoi(getenv("NUM_WARMUPS")) : DEFAULT_NUM_WARMUPS;
-  int  numIterations   = getenv("NUM_ITERATIONS") ? atoi(getenv("NUM_ITERATIONS")) : DEFAULT_NUM_ITERATIONS;
-
  if (byteOffset % 4)
  {
    printf("[ERROR] byteOffset must be a multiple of 4\n");
@@ -95,49 +101,55 @@ int main(int argc, char **argv)
  int initOffset = byteOffset / sizeof(float);

  char *env;
-  printf("Run configuration\n");
-  printf("=====================================================\n");
-  printf("%-20s %8s: Using %s\n",
-         "USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
-         useHipCall ? "HIP functions" : "custom kernels");
-  printf("%-20s %8s: Performing %s\n",
-         "USE_MEMSET", useMemset ? "(set)" : "(unset)",
-         useMemset ? "memset" : "memcopy");
-  if (useHipCall && !useMemset)
+  if (!outputToCsv)
  {
-    env = getenv("HSA_ENABLE_SDMA");
+    printf("Run configuration\n");
+    printf("=====================================================\n");
+    printf("%-20s %8s: Using %s\n",
+           "USE_HIP_CALL", useHipCall ? "(set)" : "(unset)",
+           useHipCall ? "HIP functions" : "custom kernels");
+    printf("%-20s %8s: Performing %s\n",
+           "USE_MEMSET", useMemset ? "(set)" : "(unset)",
+           useMemset ? "memset" : "memcopy");
+    if (useHipCall && !useMemset)
+    {
+      env = getenv("HSA_ENABLE_SDMA");
+      printf("%-20s %8s: %s\n",
+             "HSA_ENABLE_SDMA", env ? env : "(unset)",
+             (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+    }
+    printf("%-20s %8s: GPU destination memory type: %s-grained\n",
+           "USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
+           useFineGrainMem ? "fine" : "coarse");
    printf("%-20s %8s: %s\n",
-           "HSA_ENABLE_SDMA", env ? env : "(unset)",
-           (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
+           "USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
+           useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
+    printf("%-20s %8s: Running in %s mode\n",
+           "USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
+           useInteractive ? "interactive" : "non-interactive");
+    printf("%-20s %8s: %s\n",
+           "USE_SLEEP", useSleep ? "(set)" : "(unset)",
+           useSleep ? "Add sleep after each sync" : "No sleep per sync");
+    printf("%-20s %8s: %s\n",
+           "REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
+           reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
+    printf("%-20s %8s: %s\n",
+           "SHOW_ADDR", showAddr ? "(set)" : "(unset)",
+           showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
+    env = getenv("OUTPUT_TO_CSV");
+    printf("%-20s %8s: Output to csv\n",
+           "OUTPUT_TO_CSV", env ? env : "(unset)");
+    env = getenv("BYTE_OFFSET");
+    printf("%-20s %8s: Using byte offset of %d\n",
+           "BYTE_OFFSET", env ? env : "(unset)", byteOffset);
+    env = getenv("NUM_WARMUPS");
+    printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
+           "NUM_WARMUPS", env ? env : "(unset)", numWarmups);
+    env = getenv("NUM_ITERATIONS");
+    printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
+           "NUM_ITERATIONS", env ? env : "(unset)", numIterations);
+    printf("\n");
  }
-  printf("%-20s %8s: GPU destination memory type: %s-grained\n",
-         "USE_FINEGRAIN_MEM", useFineGrainMem ? "(set)" : "(unset)",
-         useFineGrainMem ? "fine" : "coarse");
-  printf("%-20s %8s: %s\n",
-         "USE_SINGLE_SYNC", useSingleSync ? "(set)" : "(unset)",
-         useSingleSync ? "Synchronizing only once, after all iterations" : "Synchronizing per iteration");
-  printf("%-20s %8s: Running in %s mode\n",
-         "USE_INTERACTIVE", useInteractive ? "(set)" : "(unset)",
-         useInteractive ? "interactive" : "non-interactive");
-  printf("%-20s %8s: %s\n",
-         "USE_SLEEP", useSleep ? "(set)" : "(unset)",
-         useSleep ? "Add sleep after each sync" : "No sleep per sync");
-  printf("%-20s %8s: %s\n",
-         "REUSE_STREAMS", reuseStreams ? "(set)" : "(unset)",
-         reuseStreams ? "Re-using streams per topology" : "Creating/destroying streams per topology");
-  printf("%-20s %8s: %s\n",
-         "SHOW_ADDR", showAddr ? "(set)" : "(unset)",
-         showAddr ? "Displaying src/dst mem addresses" : "Not displaying src/dst mem addresses");
-  env = getenv("BYTE_OFFSET");
-  printf("%-20s %8s: Using byte offset of %d\n",
-         "BYTE_OFFSET", env ? env : "(unset)", byteOffset);
-  env = getenv("NUM_WARMUPS");
-  printf("%-20s %8s: Running %d warmup iteration(s) per topology\n",
-         "NUM_WARMUPS", env ? env : "(unset)", numWarmups);
-  env = getenv("NUM_ITERATIONS");
-  printf("%-20s %8s: Running %d timed iteration(s) per topology\n",
-         "NUM_ITERATIONS", env ? env : "(unset)", numIterations);
-  printf("\n");

  // Collect the number of available CPUs/GPUs on this machine
  int numGpuDevices;
@@ -160,8 +172,14 @@ int main(int argc, char **argv)
  std::map<std::pair<int, int>, int> linkMap;
  std::vector<std::vector<hipStream_t>> streamCache(numGpuDevices);

+  // Print CSV header
+  if (outputToCsv)
+  {
+    printf("Test,NumBytes,ExeGpu,SrcMem,DstMem,BW(GB/s),Time(ms),LinkDesc,SrcAddr,DstAddr,numWarmups,numIters,useHipCall,useMemSet,useFineGrain,useSingleSync,resuseStreams\n");
+  }
+
  // Loop over each line in the configuration file
-  int lineNum = 0;
+  int testNum = 0;
  char line[2048];
  while(fgets(line, 2048, fp))
  {
@@ -171,12 +189,12 @@ int main(int argc, char **argv)

    int const numLinks = links.size();
    if (numLinks == 0) continue;
-    lineNum++;
+    testNum++;

    // Loop over all the different number of bytes to use per Link
    for (auto N : valuesOfN)
    {
-      printf("Test %d: [%lu bytes]\n", lineNum, N * sizeof(float));
+      if (!outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
      float*                  linkSrcMem[numLinks];        // Source memory per Link
      float*                  linkDstMem[numLinks];        // Destination memory per Link
      hipStream_t             streams[numLinks];           // hipStream to use per Link
@@ -191,7 +209,6 @@ int main(int argc, char **argv)
      for (int i = 0; i < numGpuDevices; i++)
        linkCount[i] = 0;

-      char name[MAX_NAME_LEN+1] = {};                      // Used to describe the set of Links
      for (int i = 0; i < numLinks; i++)
      {
        MemType srcMemType  = links[i].srcMemType;
@@ -206,12 +223,10 @@ int main(int argc, char **argv)
            (dstIndex < 0 || dstIndex >= numGpuDevices) ||
            (exeIndex < 0 || exeIndex >= numGpuDevices))
        {
-          printf("[ERROR] Invalid link %d:(%c%d->%c%d). Total devices: %d\n",
-                 exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices);
+          printf("[ERROR] Invalid link %d:(%c%d->%c%d) GPU index must be between 0 and %d inclusively\n",
+                 exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, numGpuDevices-1);
          exit(1);
        }
-        snprintf(name + strlen(name), MAX_NAME_LEN, "%d:(%c%d->%c%d:%d)",
-                 exeIndex, MemTypeStr[srcMemType], srcIndex, MemTypeStr[dstMemType], dstIndex, blocksToUse);

        // Enable peer-to-peer access if this is the first time seeing this pair
        if (srcMemType == MEM_GPU && dstMemType == MEM_GPU)
@@ -304,8 +319,7 @@ int main(int argc, char **argv)
        // Start CPU timing for this iteration
        auto cpuStart = std::chrono::high_resolution_clock::now();

-        // Run all links in parallel (one thread per link)
-        #pragma omp parallel for num_threads(numLinks)
+        // Enqueue all links
        for (int i = 0; i < numLinks; i++)
        {
          HIP_CALL(hipSetDevice(links[i].exeIndex));
@@ -331,17 +345,13 @@ int main(int argc, char **argv)
          }
          else
          {
-            // Record start event
-            //if (recordStart) HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
            hipExtLaunchKernelGGL(useMemset ? MemsetKernel : CopyKernel,
                                  dim3(links[i].numBlocksToUse, 1, 1),
                                  dim3(BLOCKSIZE, 1, 1),
                                  0, streams[i],
-                                  recordStart ? startEvents[i] : dummyEvents[i],
-                                  recordStop  ?  stopEvents[i] : dummyEvents[i],
+                                  recordStart ? startEvents[i] : NULL,
+                                  recordStop  ?  stopEvents[i] : NULL,
                                  0, gpuBlockParams[i]);
-            // Record stop event
-            //if (recordStop) HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
          }
        }

@@ -393,19 +403,57 @@ int main(int argc, char **argv)
        CheckOrFill(MODE_CHECK, N, useMemset, useHipCall, linkDstMem[i] + initOffset);

      // Report timings
-
+      totalCpuTime = totalCpuTime / (1.0 * numIterations) * 1000;
+      double totalBandwidthGbs = 0.0;
      for (int i = 0; i < numLinks; i++)
      {
        double linkDurationMsec = totalGpuTime[i] / (1.0 * numIterations);
        double linkBandwidthGbs = (N * sizeof(float) / 1.0E9) / linkDurationMsec * 1000.0f;
-        printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms |",
-               i + 1,
-               MemTypeStr[links[i].srcMemType], links[i].srcIndex,
-               links[i].exeIndex, links[i].numBlocksToUse,
-               MemTypeStr[links[i].dstMemType], links[i].dstIndex,
-               linkBandwidthGbs, linkDurationMsec);
-        if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
-        printf("\n");
+        totalBandwidthGbs += linkBandwidthGbs;
+        if (!outputToCsv)
+        {
+          printf(" Link %02d: %c%02d -> [GPU %02d:%02d] -> %c%02d | %9.3f GB/s | %8.3f ms | %9s |",
+                 i + 1,
+                 MemTypeStr[links[i].srcMemType], links[i].srcIndex,
+                 links[i].exeIndex, links[i].numBlocksToUse,
+                 MemTypeStr[links[i].dstMemType], links[i].dstIndex,
+                 linkBandwidthGbs, linkDurationMsec,
+                 GetLinkDesc(links[i]).c_str());
+          if (showAddr) printf(" %16p | %16p |", linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset);
+          printf("\n");
+        }
+        else
+        {
+          printf("%d,%lu,%02d,%c%02d,%c%02d,%9.3f,%8.3f,%s,%p,%p,%d,%d,%s,%s,%s,%s,%s\n",
+                 testNum, N * sizeof(float), links[i].exeIndex,
+                 MemTypeStr[links[i].srcMemType], links[i].srcIndex,
+                 MemTypeStr[links[i].dstMemType], links[i].dstIndex,
+                 linkBandwidthGbs, linkDurationMsec,
+                 GetLinkDesc(links[i]).c_str(),
+                 linkSrcMem[i] + initOffset, linkDstMem[i] + initOffset,
+                 numWarmups, numIterations,
+                 useHipCall ? "true" : "false",
+                 useMemset ? "true" : "false",
+                 useFineGrainMem ? "true" : "false",
+                 useSingleSync ? "true" : "false",
+                 reuseStreams ? "true" : "false");
+        }
+      }
+
+      // Display aggregate statistics
+      if (!outputToCsv)
+      {
+        printf(" Aggregate Bandwidth                | %9.3f GB/s | %8.3f ms |\n", totalBandwidthGbs, totalCpuTime);
+      }
+      else
+      {
+        printf("%d,%lu,ALL,ALL,ALL,%9.3f,%8.3f,ALL,ALL,ALL,%d,%d,%s,%s,%s,%s,%s\n",
+               testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, numWarmups, numIterations,
+               useHipCall ? "true" : "false",
+               useMemset ? "true" : "false",
+               useFineGrainMem ? "true" : "false",
+               useSingleSync ? "true" : "false",
+               reuseStreams ? "true" : "false");
      }

      // Release GPU memory
@@ -431,23 +479,6 @@ int main(int argc, char **argv)
        HIP_CALL(hipStreamDestroy(stream));
  }

-  // Print link information
-  printf("Link topology:\n");
-  uint32_t linkType;
-  uint32_t hopCount;
-  for (auto mapPair : linkMap)
-  {
-    int src = mapPair.first.first;
-    int dst = mapPair.first.second;
-    HIP_CALL(hipExtGetLinkTypeAndHopCount(src, dst, &linkType, &hopCount));
-    printf("%d -> %d: %s [%d hop(s)]\n", src, dst,
-           linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? "HYPERTRANSPORT" :
-           linkType == HSA_AMD_LINK_INFO_TYPE_QPI            ? "QPI" :
-           linkType == HSA_AMD_LINK_INFO_TYPE_PCIE           ? "PCIE" :
-           linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND      ? "INFINIBAND" :
-           linkType == HSA_AMD_LINK_INFO_TYPE_XGMI           ? "XGMI" : "UNKNOWN",
-           hopCount);
-  }
  return 0;
 }

@@ -459,6 +490,7 @@ void DisplayUsage(char const* cmdName)
  printf("  N         : (Optional) Number of bytes to transfer per link.\n");
  printf("              If not specified, defaults to %lu bytes. Must be a multiple of 128 bytes\n", DEFAULT_BYTES_PER_LINK);
  printf("              If 0 is specified, a range of Ns will be benchmarked\n");
+  printf("              If a negative number is specified, a configFile gets generated with this number as default number of CUs per link\n");
  printf("\n");
  printf("Configfile Format:\n");
  printf("==================\n");
@@ -508,11 +540,115 @@ void DisplayUsage(char const* cmdName)
  printf(" USE_SLEEP          - Adds a 100ms sleep after each synchronization\n");
  printf(" REUSE_STREAMS      - Re-use streams instead of creating / destroying per test\n");
  printf(" SHOW_ADDR          - Print out memory addresses for each Link\n");
+  printf(" OUTPUT_TO_CSV      - Outputs to CSV format if set\n");
  printf(" BYTE_OFFSET        - Initial byte-offset for memory allocations.  Must be multiple of 4. Defaults to 0\n");
  printf(" NUM_WARMUPS=W      - Perform W untimed warmup iteration(s) per test\n");
  printf(" NUM_ITERATIONS=I   - Perform I timed iteration(s) per test\n");
 }

+void GenerateConfigFile(char const* cfgFile, int numBlocks)
+{
+  // Writes a sample config file to cfgFile for the detected GPUs, using numBlocks CUs per link.
+  // Exits with status 0 if fewer than 2 GPUs are detected, or status 1 if cfgFile can't be opened.
+  int numGpuDevices;
+  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
+  if (numGpuDevices < 2)
+  {
+    printf("Skipping. (Less than 2 GPUs detected)\n");
+    exit(0);
+  }
+
+  // Open config file for writing
+  FILE* fp = fopen(cfgFile, "w");
+  if (!fp)
+  {
+    printf("Unable to open [%s] for writing\n", cfgFile);
+    exit(1);
+  }
+
+  // CU testing (GPUs 0 and 1 are known to exist because of the device-count check above)
+  fprintf(fp, "# CU scaling tests\n");
+  for (int i = 1; i < 16; i++)
+    fprintf(fp, "1 %d (0 G0 G1)\n", i);
+  fprintf(fp, "\n");
+
+  // Pinned memory testing
+  fprintf(fp, "# Pinned CPU memory read tests\n");
+  for (int i = 0; i < numGpuDevices; i++)
+    fprintf(fp, "1 %d (%d C%d G%d)\n", numBlocks, i, i, i);
+  fprintf(fp, "\n");
+
+  fprintf(fp, "# Pinned CPU memory write tests\n");
+  for (int i = 0; i < numGpuDevices; i++)
+    fprintf(fp, "1 %d (%d G%d C%d)\n", numBlocks, i, i, i);
+  fprintf(fp, "\n");
+
+  // Single link GPU testing (one test per ordered pair of distinct GPUs)
+  fprintf(fp, "# Unidirectional link GPU tests\n");
+  for (int i = 0; i < numGpuDevices; i++)
+    for (int j = 0; j < numGpuDevices; j++)
+    {
+      if (i == j) continue;
+      fprintf(fp, "1 %d (%d G%d G%d)\n", numBlocks, i, i, j);
+    }
+  fprintf(fp, "\n");
+
+  // Bi-directional link testing (two opposing links per test)
+  fprintf(fp, "# Bi-directional link tests\n");
+  for (int i = 0; i < numGpuDevices; i++)
+    for (int j = 0; j < numGpuDevices; j++)
+    {
+      if (i == j) continue;
+      fprintf(fp, "2 %d (%d G%d G%d) (%d G%d G%d)\n", numBlocks, i, i, j, j, j, i);
+    }
+  fprintf(fp, "\n");
+
+  // Simple uni-directional ring (each GPU targets the next one, wrapping around)
+  fprintf(fp, "# Simple unidirectional ring\n");
+  fprintf(fp, "%d %d", numGpuDevices, numBlocks);
+  for (int i = 0; i < numGpuDevices; i++)
+    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+  fprintf(fp, "\n\n");
+
+  // Simple bi-directional ring (each GPU targets both its next and previous neighbour)
+  fprintf(fp, "# Simple bi-directional ring\n");
+  fprintf(fp, "%d %d", numGpuDevices * 2, numBlocks);
+  for (int i = 0; i < numGpuDevices; i++)
+    fprintf(fp, " (%d G%d G%d)", i, i, (i+1)%numGpuDevices);
+  for (int i = 0; i < numGpuDevices; i++)
+    fprintf(fp, " (%d G%d G%d)", i, i, (i+numGpuDevices-1)%numGpuDevices);
+  fprintf(fp, "\n\n");
+
+  // Broadcast from GPU 0
+  fprintf(fp, "# GPU 0 Broadcast\n");
+  fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
+  for (int i = 1; i < numGpuDevices; i++)
+    fprintf(fp, " (%d G%d G%d)", 0, 0, i);
+  fprintf(fp, "\n\n");
+
+  // Gather to GPU 0
+  fprintf(fp, "# GPU 0 Gather\n");
+  fprintf(fp, "%d %d", numGpuDevices-1, numBlocks);
+  for (int i = 1; i < numGpuDevices; i++)
+    fprintf(fp, " (%d G%d G%d)", 0, i, 0);
+  fprintf(fp, "\n\n");
+
+  // Full stress test (every ordered pair of distinct GPUs in a single test)
+  fprintf(fp, "# Full stress test\n");
+  fprintf(fp, "%d %d", numGpuDevices * (numGpuDevices-1), numBlocks);
+  for (int i = 0; i < numGpuDevices; i++)
+    for (int j = 0; j < numGpuDevices; j++)
+    {
+      if (i == j) continue;
+      fprintf(fp, " (%d G%d G%d)", i, i, j);
+    }
+  fprintf(fp, "\n\n");
+
+  fclose(fp);
+  // Announce success only after the file has actually been written
+  printf("Generated configFile %s for %d device(s) / %d CUs per link\n", cfgFile, numGpuDevices, numBlocks);
+}
+
 void DisplayTopology()
 {
  printf("\nDetected topology:\n");
@@ -700,3 +836,48 @@ void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr

  free(refBuffer);
 }
+
+std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount)
+{
+  // Returns "<type>-<hopCount>" (type left-padded with spaces to 4 chars), or "??????" if linkType is unrecognized
+  char result[16];  // 5-char prefix + up to 10 digits of a uint32_t + terminating NUL
+  switch (linkType)
+  {
+  case HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT: snprintf(result, sizeof(result), "  HT-%u", hopCount); break;
+  case HSA_AMD_LINK_INFO_TYPE_QPI           : snprintf(result, sizeof(result), " QPI-%u", hopCount); break;
+  case HSA_AMD_LINK_INFO_TYPE_PCIE          : snprintf(result, sizeof(result), "PCIE-%u", hopCount); break;
+  case HSA_AMD_LINK_INFO_TYPE_INFINBAND     : snprintf(result, sizeof(result), "INFB-%u", hopCount); break;
+  case HSA_AMD_LINK_INFO_TYPE_XGMI          : snprintf(result, sizeof(result), "XGMI-%u", hopCount); break;
+  default: snprintf(result, sizeof(result), "??????");
+  }
+  return result;
+}
+
+std::string GetLinkDesc(Link const& link)
+{
+  // Builds a short description of the path a Link's data takes: the src->exe hop and
+  // the exe->dst hop, joined by '+'.  Only GPU-to-GPU links are described; any other
+  // combination of memory types yields "???".
+  bool const gpuToGpu = (link.srcMemType == MEM_GPU) && (link.dstMemType == MEM_GPU);
+  if (!gpuToGpu) return "???";
+
+  std::string desc;
+  uint32_t type, hops;
+
+  // Source GPU -> executing GPU (omitted when both are the same device)
+  if (link.srcIndex != link.exeIndex)
+  {
+    HIP_CALL(hipExtGetLinkTypeAndHopCount(link.srcIndex, link.exeIndex, &type, &hops));
+    desc += GetLinkTypeDesc(type, hops);
+  }
+
+  // Executing GPU -> destination GPU (omitted when both are the same device)
+  if (link.dstIndex != link.exeIndex)
+  {
+    HIP_CALL(hipExtGetLinkTypeAndHopCount(link.exeIndex, link.dstIndex, &type, &hops));
+    if (!desc.empty()) desc += "+";
+    desc += GetLinkTypeDesc(type, hops);
+  }
+
+  return desc;
+}
@@ -81,12 +81,16 @@ struct BlockParam
    float* dst;
 };

-void DisplayUsage(char const* cmdName);                // Display usage instructions
-void DisplayTopology();                                // Display GPU topology
-void ParseLinks(char* line, std::vector<Link>& links); // Parse Link information
+void DisplayUsage(char const* cmdName);                      // Display usage instructions
+void GenerateConfigFile(char const* cfgFile, int numBlocks); // Generate a sample config file
+void DisplayTopology();                                      // Display GPU topology
+void ParseLinks(char* line, std::vector<Link>& links);       // Parse Link information
 void AllocateMemory(MemType memType, int devIndex, size_t numBytes, bool useFineGrainMem, float** memPtr);
 void DeallocateMemory(MemType memType, int devIndex, float* memPtr);
 void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, float* ptr);
+std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
+std::string GetLinkDesc(Link const& link);
+

 #define MAX_NAME_LEN 64
 #define BLOCKSIZE 256
@@ -21,7 +21,7 @@

 DIR="$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

-for i in {0..38}
+for i in {0..44}
 do
 	$DIR/../topo_expl/topo_expl -m $i > "topo_m$i.log"
 	$DIR/../TopoVisual/topo_visual.sh -i "topo_m$i.log"
@@ -0,0 +1,87 @@
+<system version="2">
+  <cpu numaid="0" affinity="00000000,00000000,00000000,ffffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,00000000,ffffffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,ffffffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="ffffffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,87 @@
+<system version="2">
+  <cpu numaid="0" affinity="00000000,00000000,00ffffff,00000000,00000000,00ffffff" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:61:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:63:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="1" affinity="00000000,0000ffff,ff000000,00000000,0000ffff,ff000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:25:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:27:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
+          <xgmi target="0000:63:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:27:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="000000ff,ffff0000,00000000,000000ff,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:e1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:e3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x22fd9f00039b0398" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="ffffff00,00000000,00000000,ffffff00,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
+          <xgmi target="0000:e3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+</system>
@@ -0,0 +1,93 @@
+<system version="2">
+  <cpu numaid="1" affinity="00000000,00000000,00000000,ffff0000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:41:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:43:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="0" sm="90" gcn="908" arch="38911" rank="0" gdr="1">
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="2" affinity="00000000,00000000,0000ffff,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:21:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:23:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="1" sm="90" gcn="908" arch="38911" rank="1" gdr="1">
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:24:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:26:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="2" sm="90" gcn="908" arch="38911" rank="2" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="3" affinity="00000000,00000000,ffff0000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:01:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:03:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="3" sm="90" gcn="908" arch="38911" rank="3" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c6:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="5" affinity="00000000,ffff0000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:c1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="4" sm="90" gcn="908" arch="38911" rank="4" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+    <pci busid="0000:c4:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:c6:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="5" sm="90" gcn="908" arch="38911" rank="5" gdr="1">
+          <xgmi target="0000:43:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:26:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:03:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="6" affinity="0000ffff,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:a1:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:a3:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="6" sm="90" gcn="908" arch="38911" rank="6" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:83:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="7" affinity="ffff0000,00000000,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:81:00.0" class="0x060400" link_speed="16 GT/s" link_width="16">
+      <pci busid="0000:83:00.0" class="0x038000" link_speed="16 GT/s" link_width="16">
+        <gpu dev="7" sm="90" gcn="908" arch="38911" rank="7" gdr="1">
+          <xgmi target="0000:23:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:c3:00.0" count="1" tclass="0x038000"/>
+          <xgmi target="0000:a3:00.0" count="1" tclass="0x038000"/>
+        </gpu>
+      </pci>
+    </pci>
+  </cpu>
+  <cpu numaid="4" affinity="00000000,0000ffff,00000000,00000000" arch="x86_64" vendor="AuthenticAMD" familyid="143" modelid="49">
+    <pci busid="0000:e1:00.0" class="0x020700" link_speed="16 GT/s" link_width="16">
+      <nic>
+        <net name="mlx5_0" dev="0" speed="200000" port="1" guid="0x70cd600003da341c" maxconn="262144" gdr="1"/>
+      </nic>
+    </pci>
+  </cpu>
+</system>
@@ -108,6 +108,12 @@ NodeModelDesc model_descs[] = {
  {4, "topo_8p_rome_n2_2.xml",  "4 nodes 8 VEGA20 Rome NPS=2 Alt. Model 2 NET/IF"},
  {4, "topo_8p_ts1_n4_2.xml",   "4 nodes 8 VEGA20 TS1 NPS=4 3 NET/IF"},
  {1, "topo_8p_rome_n4.xml",    "single node 8 VEGA20 Rome NPS=4"},
+  {1, "topo_4p3l_n2.xml",       "single node 8 gfx908 Rome"},
+  {4, "topo_4p3l_n2.xml",       "4 nodes 8 gfx908 Rome"},
+  {1, "topo_4p3l_n4.xml",       "single node 8 gfx908 Rome NPS=4"},
+  {4, "topo_4p3l_n4.xml",       "4 nodes 8 gfx908 Rome NPS=4"},
+  {1, "topo_4p3l_n2_1.xml",     "single node 8 gfx908 Rome"},
+  {4, "topo_4p3l_n2_1.xml",     "4 nodes 8 gfx908 Rome"},
 };

 int main(int argc,char* argv[])