diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e1fec1403..81cd32af5f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -181,6 +181,9 @@ else()
   target_link_libraries(rccl PUBLIC hip::hip_hcc ${hcc_LIBRARIES} numa)
 endif()
 
+#Setup librccl.so version
+rocm_set_soversion(rccl "1.0")
+
 rocm_install_targets(TARGETS
                      rccl
                      PREFIX
@@ -197,8 +200,8 @@ rocm_export_targets(NAMESPACE
                     DEPENDS
                     hip)
 
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc")
-set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "rocm-dev (>= 2.5.27)")
+set(CPACK_RPM_PACKAGE_REQUIRES "rocm-dev >= 2.5.27")
 
 set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt" "/opt/rocm")
 
@@ -208,7 +211,7 @@ rocm_create_package(
   DESCRIPTION
   "Optimized primitives for collective multi-GPU communication"
   MAINTAINER
-  "<no-reply@amd.com>"
+  "<rccl-maintainer@amd.com>"
   LDCONFIG)
 
 rocm_install_symlink_subdir(rccl)
diff --git a/README.md b/README.md
index 56eca69c52..405b69127b 100644
--- a/README.md
+++ b/README.md
@@ -37,10 +37,14 @@ $ git clone https://github.com/ROCmSoftwarePlatform/rccl.git
 $ cd rccl
 $ mkdir build
 $ cd build
-$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+$ CXX=/opt/rocm/bin/hcc cmake ..
 $ make -j 8
 ```
-You may substitute a path of your own choosing for CMAKE_INSTALL_PREFIX. Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
+You may substitute an installation path of your own choosing by passing CMAKE_INSTALL_PREFIX. For example:
+```shell
+$ CXX=/opt/rocm/bin/hcc cmake -DCMAKE_INSTALL_PREFIX=$PWD/rccl-install ..
+```
+Note: ensure rocm-cmake is installed, `apt install rocm-cmake`.
 
 #### To build the RCCL package and install package :
 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 66236c2f18..ee542902b6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -10,13 +10,6 @@ if(BUILD_TESTS)
       message(FATAL_ERROR "chrpath is required for UnitTests. Please install (e.g. sudo apt-get install chrpath)")
   endif()
 
-  # OpenMP is used to drive GPUs (one per thread)
-  if(EXISTS /etc/redhat-release)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread")
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread")
-  endif()    
-
   # Download and unpack googletest at configure time
   configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
   execute_process(
diff --git a/test/test_AllGather.cpp b/test/test_AllGather.cpp
index b28df48e87..df5aea23e3 100644
--- a/test/test_AllGather.cpp
+++ b/test/test_AllGather.cpp
@@ -4,7 +4,6 @@
  * See LICENSE.txt for license information
  ************************************************************************/
 #include "test_AllGather.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -23,13 +22,14 @@ namespace CorrectnessTests
         size_t const sendCount = dataset.numElements / dataset.numDevices;
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllGather((int8_t *)dataset.inputs[i] + (i * byteCount),
                           dataset.outputs[i], sendCount,
                           dataType, comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
@@ -68,13 +68,14 @@ namespace CorrectnessTests
                 size_t const sendCount = subDataset.numElements / subDataset.numDevices;
 
                 // Launch the reduction (1 thread per GPU)
-                #pragma omp parallel for num_threads(numDevices)
+                ncclGroupStart();
                 for (int i = 0; i < numDevices; i++)
                 {
                     ncclAllGather((int8_t *)subDataset.inputs[i] + (i * byteCount),
                                   subDataset.outputs[i], sendCount,
                                   dataType, comms[i], streams[i]);
                 }
+                ncclGroupEnd();
 
                 // Wait for reduction to complete
                 Synchronize();
diff --git a/test/test_AllReduce.cpp b/test/test_AllReduce.cpp
index 0fd5eedf91..86d4654b7b 100644
--- a/test/test_AllReduce.cpp
+++ b/test/test_AllReduce.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_AllReduce.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -20,12 +19,13 @@ namespace CorrectnessTests
         ComputeExpectedResults(dataset, op);
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllReduce(dataset.inputs[i], dataset.outputs[i],
                           numElements, dataType, op, comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
diff --git a/test/test_AllReduceAbort.cpp b/test/test_AllReduceAbort.cpp
index 7b329d249d..719bf323d4 100644
--- a/test/test_AllReduceAbort.cpp
+++ b/test/test_AllReduceAbort.cpp
@@ -6,7 +6,6 @@
 
 #include "test_AllReduceAbort.hpp"
 #include "../include/comm.h"
-#include <omp.h>
 
 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
diff --git a/test/test_Broadcast.cpp b/test/test_Broadcast.cpp
index 0e728b0153..a98d5884cb 100644
--- a/test/test_Broadcast.cpp
+++ b/test/test_Broadcast.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_Broadcast.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
             ComputeExpectedResults(dataset, root);
 
             // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
             for (int i = 0; i < numDevices; i++)
             {
                 ncclBroadcast(dataset.inputs[i],
@@ -33,7 +32,7 @@ namespace CorrectnessTests
                               numElements, dataType,
                               root, comms[i], streams[i]);
             }
-
+            ncclGroupEnd();
 
             // Wait for reduction to complete
             Synchronize();
diff --git a/test/test_Broadcast.hpp b/test/test_Broadcast.hpp
index 5ccf351592..358d8cf09a 100644
--- a/test/test_Broadcast.hpp
+++ b/test/test_Broadcast.hpp
@@ -7,7 +7,6 @@
 #define TEST_BROADCAST_HPP
 
 #include "CorrectnessTest.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
diff --git a/test/test_BroadcastAbort.cpp b/test/test_BroadcastAbort.cpp
index e49a872e0b..df0175002a 100644
--- a/test/test_BroadcastAbort.cpp
+++ b/test/test_BroadcastAbort.cpp
@@ -6,7 +6,6 @@
 
 #include "test_BroadcastAbort.hpp"
 #include "../include/comm.h"
-#include <omp.h>
 
 #define NUM_ITER 8
 #define FAKE_OP_COUNT NUM_ITER+1
diff --git a/test/test_CombinedCalls.cpp b/test/test_CombinedCalls.cpp
index 4b51ab5375..41e93c78a2 100644
--- a/test/test_CombinedCalls.cpp
+++ b/test/test_CombinedCalls.cpp
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"
 
-#include <omp.h>
-
 namespace CorrectnessTests
 {
     TEST_P(CombinedCallsCorrectnessTest, Correctness)
@@ -38,7 +36,7 @@ namespace CorrectnessTests
         size_t const byteCount = datasets[0].NumBytes() / numDevices;
         size_t const elemCount = numElements / numDevices;
 
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclAllGather((int8_t *)datasets[0].inputs[i] + (i * byteCount),
@@ -63,6 +61,7 @@ namespace CorrectnessTests
                               elemCount, dataType, op,
                               comms[i], streams[i]);
         }
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
diff --git a/test/test_GroupCalls.cpp b/test/test_GroupCalls.cpp
index 77780b633d..3cfd55c6f5 100644
--- a/test/test_GroupCalls.cpp
+++ b/test/test_GroupCalls.cpp
@@ -11,8 +11,6 @@
 #include "test_Reduce.hpp"
 #include "test_ReduceScatter.hpp"
 
-#include <omp.h>
-
 namespace CorrectnessTests
 {
     TEST_P(GroupCallsCorrectnessTest, Correctness)
diff --git a/test/test_Reduce.cpp b/test/test_Reduce.cpp
index 9844e928c1..8234f2ab05 100644
--- a/test/test_Reduce.cpp
+++ b/test/test_Reduce.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_Reduce.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -25,7 +24,7 @@ namespace CorrectnessTests
             ComputeExpectedResults(dataset, op, root);
 
             // Launch the reduction (1 thread per GPU)
-            #pragma omp parallel for num_threads(numDevices)
+            ncclGroupStart();
             for (int i = 0; i < numDevices; i++)
             {
                 ncclReduce(dataset.inputs[i],
@@ -33,6 +32,7 @@ namespace CorrectnessTests
                            numElements, dataType, op,
                            root, comms[i], streams[i]);
             }
+            ncclGroupEnd();
 
             // Wait for reduction to complete
             Synchronize();
diff --git a/test/test_ReduceScatter.cpp b/test/test_ReduceScatter.cpp
index 11007732b5..f6461f20b8 100644
--- a/test/test_ReduceScatter.cpp
+++ b/test/test_ReduceScatter.cpp
@@ -5,7 +5,6 @@
  ************************************************************************/
 
 #include "test_ReduceScatter.hpp"
-#include <omp.h>
 
 namespace CorrectnessTests
 {
@@ -24,7 +23,7 @@ namespace CorrectnessTests
         size_t const recvCount = dataset.numElements / dataset.numDevices;
 
         // Launch the reduction (1 thread per GPU)
-        #pragma omp parallel for num_threads(numDevices)
+        ncclGroupStart();
         for (int i = 0; i < numDevices; i++)
         {
             ncclReduceScatter(dataset.inputs[i],
@@ -32,7 +31,7 @@ namespace CorrectnessTests
                               recvCount, dataType, op,
                               comms[i], streams[i]);
         }
-
+        ncclGroupEnd();
 
         // Wait for reduction to complete
         Synchronize();
diff --git a/tools/TransferBench/TransferBench.cpp b/tools/TransferBench/TransferBench.cpp
index afb548ff58..5bdccdf807 100644
--- a/tools/TransferBench/TransferBench.cpp
+++ b/tools/TransferBench/TransferBench.cpp
@@ -27,7 +27,10 @@ THE SOFTWARE.
 #include <cstdio>
 #include <cstdlib>
 #include <set>
-
+#include <unistd.h>
+#include <map>
+#include <iostream>
+#include <sstream>
 #include <hip/hip_runtime.h>
 #include "copy_kernel.h"
 #include "TransferBench.hpp"
@@ -49,12 +52,13 @@ int main(int argc, char **argv)
     printf("\n");
     printf("Environment variables:\n");
     printf("======================\n");
-    printf(" USE_HIP_CALL    - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
-    printf(" USE_MEMSET      - Write constant value (instead of doing a copy)\n");
-    printf(" USE_COARSE_MEM  - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
-    printf(" USE_SINGLE_SYNC - Only synchronize once at end of iterations (disables GPU times)\n");
-    printf(" USE_INTERACTIVE - Waits for user-input prior to start and after transfer loop (for profiling)\n");
+    printf(" USE_HIP_CALL     - Use hip calls (hipMemcpyAsync/hipMemset) instead of kernel\n");
+    printf(" USE_MEMSET       - Write constant value (instead of doing a copy)\n");
+    printf(" USE_COARSE_MEM   - Use coarse-grained dst GPU memory (instead of fine-grained)\n");
+    printf(" USE_SINGLE_SYNC  - Only synchronize once at end of iterations (disables GPU times)\n");
+    printf(" USE_INTERACTIVE  - Waits for user-input prior to start and after transfer loop (for profiling)\n");
     printf(" USE_ITERATIONS=N - Sets number of iterations to run (default is 10)\n");
+    printf(" USE_SLEEP        - Adds a 100ms sleep after sync (for profiling)\n");
     exit(0);
   }
 
@@ -74,6 +78,7 @@ int main(int argc, char **argv)
   bool useCoarseMem = getenv("USE_COARSE_MEM");
   bool useSingleSync = getenv("USE_SINGLE_SYNC");
   bool useInteractive = getenv("USE_INTERACTIVE");
+  bool useSleep = getenv("USE_SLEEP");
 
   int numWarmups = 3;
   int numIterations = getenv("USE_ITERATIONS") ? atoi(getenv("USE_ITERATIONS")) : 10;
@@ -99,6 +104,10 @@ int main(int argc, char **argv)
     printf("Running in interactive mode (USE_INTERACTIVE)\n");
   else
     printf("Running in non-interactive mode (enable interactive mode via USE_INTERACTIVE)\n");
+  if (useSleep)
+    printf("Adding 100ms sleep after sync (USE_SLEEP)\n");
+  else
+    printf("No sleep per sync (enable sleep via USE_SLEEP)\n");
 
   printf("Executing %d warmup iteration(s), and %d timed iteration(s) (Set via USE_ITERATION=#)\n",
          numWarmups, numIterations);
@@ -265,7 +274,8 @@ int main(int argc, char **argv)
       {
         HIP_CALL(hipSetDevice(links[i].srcGpu));
 
-        HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
+        if (!useSingleSync || iteration == 0)
+          HIP_CALL(hipEventRecord(startEvents[i], streams[i]));
 
         if (useHipCall)
         {
@@ -301,7 +311,8 @@ int main(int argc, char **argv)
                                gpuBlockParams[i]);
           }
         }
-        HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
+        if (!useSingleSync || iteration == numIterations - 1)
+          HIP_CALL(hipEventRecord(stopEvents[i], streams[i]));
       }
 
       // Synchronize per iteration, unless in single sync mode, in which case
@@ -314,6 +325,7 @@ int main(int argc, char **argv)
 
       auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
       double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
+      if (useSleep) usleep(100000);
 
       if (iteration >= 0)
       {
@@ -370,8 +382,7 @@ int main(int argc, char **argv)
       }
       else
       {
-        if (!useSingleSync)
-          totalGpuTime[i] /= (1.0 * numIterations);
+        totalGpuTime[i] /= (1.0 * numIterations);
         printf("%8.3f", (linkCount[i] * numBytesPerLink / 1.0E9) / totalGpuTime[i]);
       }
     }
diff --git a/tools/TransferBench/TransferBench.hpp b/tools/TransferBench/TransferBench.hpp
index 5fde8f58fe..92c596d672 100644
--- a/tools/TransferBench/TransferBench.hpp
+++ b/tools/TransferBench/TransferBench.hpp
@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
+#include <sstream>
+
 // Helper macro for catching HIP errors
 #define HIP_CALL(cmd)                                                   \
     do {                                                                \