diff --git a/README.md b/README.md
index 7b9c7b528c..2128c4a67e 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@ ROCm Communication Collectives Library
 
 ## Introduction
 
-RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, and reduce-scatter. It has been optimized to achieve high bandwidth on platforms using PCIe, xGMI as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
+RCCL (pronounced "Rickle") is a stand-alone library of standard collective communication routines for GPUs, implementing all-reduce, all-gather, reduce, broadcast, reduce-scatter, gather, scatter, and all-to-all. There is also initial support for direct GPU-to-GPU send and receive operations. It has been optimized to achieve high bandwidth on platforms using PCIe and xGMI, as well as networking using InfiniBand Verbs or TCP/IP sockets. RCCL supports an arbitrary number of GPUs installed in a single node or multiple nodes, and can be used in either single- or multi-process (e.g., MPI) applications.
 
 The collective operations are implemented using ring and tree algorithms and have been optimized for throughput and latency. For best performance, small operations can be either batched into larger operations or aggregated through the API.
 
diff --git a/docs/source/api.rst b/docs/source/api.rst
index 1f15ee0208..0b4cdafed3 100644
--- a/docs/source/api.rst
+++ b/docs/source/api.rst
@@ -27,7 +27,7 @@ Communicator Functions
 
 .. doxygenfunction:: ncclCommUserRank
 
-Collection Communication Operations
+Collective Communication Operations
 -----------------------------------
 
 Collective communication operations must be called separately for each communicator in a communicator clique.
@@ -48,6 +48,15 @@ Since they may perform inter-CPU synchronization, each call has to be done from
 
 .. doxygenfunction:: ncclAllGather
 
+.. doxygenfunction:: ncclSend
+
+.. doxygenfunction:: ncclRecv
+
+.. doxygenfunction:: ncclGather
+
+.. doxygenfunction:: ncclScatter
+
+.. doxygenfunction:: ncclAllToAll
 
 Group Semantics
 ---------------
diff --git a/install.sh b/install.sh
index 76db967ef9..d7610559f5 100755
--- a/install.sh
+++ b/install.sh
@@ -29,6 +29,7 @@ build_release=true
 install_library=false
 build_hip_clang=true
 install_dependencies=false
+
 # #################################################
 # Parameter parsing
 # #################################################
diff --git a/src/nccl.h.in b/src/nccl.h.in
index 51c3d091fd..f83cebe0d8 100644
--- a/src/nccl.h.in
+++ b/src/nccl.h.in
@@ -238,11 +238,9 @@ ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcou
 ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, ncclComm_t comm, hipStream_t stream);
 
-/*
- * Send
- *
- * Send data from sendbuff to rank peer.
+/*! @brief Send
  *
+ * @details Send data from sendbuff to rank peer.
  * Rank peer needs to call ncclRecv with the same datatype and the same count from this
  * rank.
  *
@@ -255,11 +253,9 @@ ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t dataty
 ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, hipStream_t stream);
 
-/*
- * Receive
- *
- * Receive data from rank peer into recvbuff.
+/*! @brief Receive
  *
+ * @details Receive data from rank peer into recvbuff.
  * Rank peer needs to call ncclSend with the same datatype and the same count to this
  * rank.
  *
@@ -272,11 +268,11 @@ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, in
 ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
     ncclComm_t comm, hipStream_t stream);
 
-/*
- * Gather
+/*! @brief Gather
  *
- * Root device gathers sendcount values from other GPUs into recvbuff,
+ * @details Root device gathers sendcount values from other GPUs into recvbuff,
  * receiving data from rank i at offset i*sendcount.
+ *
  * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
  * should have a size of at least nranks*sendcount elements.
  *
@@ -287,11 +283,11 @@ ncclResult_t  ncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
 ncclResult_t pncclGather(const void* sendbuff, void* recvbuff, size_t sendcount,
     ncclDataType_t datatype, int root, ncclComm_t comm, hipStream_t stream);
 
-/*
- * Scatter
+/*! @brief Scatter
  *
- * Scattered over the devices so that recvbuff on rank i will contain the i-th
+ * @details Data is scattered over the devices so that recvbuff on rank i will contain the i-th
  * block of the data on root.
+ *
  * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
  * should have a size of at least nranks*recvcount elements.
  *
@@ -304,10 +300,9 @@ ncclResult_t pncclScatter(const void* sendbuff, void* recvbuff,
     size_t recvcount, ncclDataType_t datatype, int root, ncclComm_t comm,
     hipStream_t stream);
 
-/*
- * All-To-All
+/*! @brief All-To-All
  *
- * Device (i) send (j)th block of data to device (j) and be placed as (i)th
+ * @details Device (i) sends the (j)th block of data to device (j), where it is placed as the (i)th
  * block. Each block for sending/receiving has count elements, which means
  * that recvbuff and sendbuff should have a size of nranks*count elements.
  *