From c269577b89e79fcc319b0cf25c394cd7971213be Mon Sep 17 00:00:00 2001 From: Yiltan Date: Fri, 17 Oct 2025 12:10:37 -0400 Subject: [PATCH] Updated docs for ROCm 7.x.x (#239) Co-authored-by: Aurelien Bouteiller Co-authored-by: yugang-amd [ROCm/rocshmem commit: 9338c8448027e0a6c021e113643650e9cbdf2429] --- projects/rocshmem/CHANGELOG.md | 25 ++++++++-- projects/rocshmem/README.md | 47 ++++++++++++++----- projects/rocshmem/docs/api/ctx.rst | 19 +++++++- projects/rocshmem/docs/api/init.rst | 16 +++++++ .../rocshmem/include/rocshmem/rocshmem.hpp | 11 +++-- .../include/rocshmem/rocshmem_COLL.hpp | 22 --------- 6 files changed, 95 insertions(+), 45 deletions(-) diff --git a/projects/rocshmem/CHANGELOG.md b/projects/rocshmem/CHANGELOG.md index a268d061d7..3ecebc82bc 100644 --- a/projects/rocshmem/CHANGELOG.md +++ b/projects/rocshmem/CHANGELOG.md @@ -1,16 +1,35 @@ # Changelog for rocSHMEM -## rocSHMEM 3.x.x for ROCm 7.x.x +## Unreleased - rocSHMEM 3.x.x for ROCm 7.x.x +### Added +* Allow for IPC, RO, GDA backends to be selected at runtime +* Added the GDA conduit for different NIC vendors + * AMD Pensando IONIC + * Broadcom BNXT\_RE (Thor 2) + * Mellanox MLX5 (IB and RoCE ConnectX-7) +* Added new APIs: + * `rocshmem_get_device_ctx` ### Changed - * The following APIs have been deprecated: * `rocshmem_wg_init` * `rocshmem_wg_finalize` * `rocshmem_wg_init_thread` +* `rocshmem_ptr` can now return non-null pointer to + a shared memory region when the IPC transport is available to reach that region. + Previously, it would return a null pointer. +* `ROCSHMEM_RO_DISABLE_IPC` was renamed to `ROCSHMEM_DISABLE_MIXED_IPC`. + This enviroment variable was not documented for prior releases. + It is now documented to inform users who were using this undocumented feature. + +### Removed +* rocSHMEM no-longer requires rocPRIM and rocThrust as dependencies +* Removed MPI compile-time dependency + +### Known issues +* Only a subset of rocSHMEM APIs are implemented for the GDA conduit ## rocSHMEM 3.0.0 for ROCm 7.0.0 - ### Added * Added the Reverse Offload conduit diff --git a/projects/rocshmem/README.md b/projects/rocshmem/README.md index a5f8261a84..5b174817fa 100644 --- a/projects/rocshmem/README.md +++ b/projects/rocshmem/README.md @@ -5,10 +5,10 @@ initiative to provide GPU-centric networking through an OpenSHMEM-like interface This intra-kernel networking library simplifies application code complexity and enables more fine-grained communication/computation overlap than traditional host-driven networking. -rocSHMEM uses a single symmetric heap (SHEAP) that is allocated on GPU memories. +rocSHMEM uses a single symmetric heap that is allocated on GPU memories. There are currently three backends for rocSHMEM; -IPC, Reverse Offload (RO), and GPU-IB. +IPC, Reverse Offload (RO), and GDA. The backends primarily differ in their implementations of intra-kernel networking. The IPC backend implements communication primitives using load/store operations issued from the GPU. @@ -18,7 +18,14 @@ to the host-side runtime, which calls into a traditional MPI or OpenSHMEM implementation. This forwarding of requests is transparent to the programmer, who only sees the GPU-side interface. -The RO backend is provided as-is with limited support from AMD or AMD Research. +The GPU Direct Async (GDA) backend allows for rocSHMEM to issue communication operations to the NIC directly from the device-side code, without involving a CPU proxy. +within the GPU. +During initialization we prepare network resources for each NIC vendor using the vendor-appropriate +Direct Verbs APIs. +When calling the device-side rocSHMEM API, the device threads are used to construct Work Queue Entries (WQEs) and post the communication to the send queues of the NIC directly. +Completion Queues (CQs) are polled from the device-side code as well. + +The RO and GDA backend is provided as-is with limited support from AMD or AMD Research. ## Requirements @@ -60,9 +67,18 @@ cd build ../scripts/build_configs/ro_ipc ``` +To create an out-of-source build for the GDA backend, we do the following. +Ensure you select the correct NIC vendor script + +``` +mkdir build +cd build +../scripts/build_configs/gda_ +``` + The build script passes configuration options to CMake to setup canonical builds. There are other scripts in `./scripts/build_configs` -directory but currently, only `ipc_single` is supported. +directory but currently, only `ipc_single` and `ro_ipc` is supported. By default, the library is installed in `~/rocshmem`. You may provide a custom install path by supplying it as an argument. For example: @@ -183,6 +199,20 @@ rocSHMEM requires a ROCm-Aware Open MPI and UCX. Other MPI implementations, such as MPICH, _should_ be compatible with rocSHMEM but it has not been thoroughly tested. +## Building the Dependencies using our Helper Script +We have a script to install dependencies. +However, it is not guaranteed to work and perform optimally on all platforms. +Configuration options are platform dependent. + +``` +BUILD_DIR=/path/to/not_rocshmem_src_or_build/dependencies /path/to/rocshmem_src/sripts/install_dependencies.sh +``` + +After compiling and installing UCX and Open MPI, please update your `PATH` and `LD_LIBRARY_PATH` +to point to the installation locations. +The exact locations will be printed if the script ran successfully. + +## Building the Dependencies from Source To build and configure ROCm-Aware UCX (1.17.0 or later), you need to: ``` @@ -212,14 +242,5 @@ export PATH=/bin:$PATH export LD_LIBRARY_PATH=/lib:/lib:$LD_LIBRARY_PATH ``` - -Alternatively, we have script to install dependencies. -However, it is not guaranteed to work and perform optimally on all platforms. -Configuration options are platform dependent. - -``` -BUILD_DIR=/path/to/not_rocshmem_src_or_build/dependencies /path/to/rocshmem_src/sripts/install_dependencies.sh -``` - For more information on OpenMPI-UCX support, please visit: https://rocm.docs.amd.com/en/latest/how-to/gpu-enabled-mpi.html diff --git a/projects/rocshmem/docs/api/ctx.rst b/projects/rocshmem/docs/api/ctx.rst index 5cbbd324e5..b9aa2dd52f 100644 --- a/projects/rocshmem/docs/api/ctx.rst +++ b/projects/rocshmem/docs/api/ctx.rst @@ -13,11 +13,11 @@ ROCSHMEM_CTX_CREATE .. cpp:function:: __device__ int rocshmem_wg_ctx_create(int64_t options, rocshmem_ctx_t *ctx) .. cpp:function:: __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options, rocshmem_ctx_t *ctx) - + :param team: Team handle to derive the context from. :param options: Options for context creation. Ignored in current design; use the value ``0``. :param ctx: Context handle. - + :returns: All threads returns ``0`` if the context was created successfully. If any thread returns non-zero value, the operation fails and a higher number of ``ROCSHMEM_MAX_NUM_CONTEXTS`` is required. @@ -38,3 +38,18 @@ ROCSHMEM_CTX_DESTROY **Description:** This routine destroys an rocSHMEM context. It must be called collectively by all threads in the work-group. + +ROCSHMEM_GET_DEVICE_CTX +----------------------- + +.. cpp:function:: __host__ void * rocshmem_get_device_ctx() + + :param: None. + + :returns: Returns ``ROCSHMEM_CTX_DEFAULT`` device pointer that users. + can query from one instance of rocSHMEM host library and + use later for dynamic module initialization in + kernel bitcode device library in the same application. + +**Description:** +This routine queries rocSHMEM default device context from host API. diff --git a/projects/rocshmem/docs/api/init.rst b/projects/rocshmem/docs/api/init.rst index f605b729ad..ae358af2ad 100644 --- a/projects/rocshmem/docs/api/init.rst +++ b/projects/rocshmem/docs/api/init.rst @@ -143,3 +143,19 @@ It can be called before ``rocshmem_init``. **Description:** This routine queries the PE ID of the caller. It can be called per thread with no performance penalty. + +ROCSHMEM_PTR +-------------- + +.. cpp:function:: __host__ void* rocshmem_ptr(const void *dest, int pe); +.. cpp:function:: __device__ void* rocshmem_ptr(const void *dest, int pe); + + :param dest: Local symmetric heap allocation pointer for current PE. + :param pe: Remote PE. + :returns: Returns remote symmetric heap device pointer from host-side API. + ``NULL`` is returned if a valid device pointer cannot be provided. + This pointer can be used to issue load/store from custom kernels + instead of using rocshmem device side get/put APIs for RMA operations. + +**Description:** +This routine queries rocSHMEM remote symmetric heap pointer. diff --git a/projects/rocshmem/include/rocshmem/rocshmem.hpp b/projects/rocshmem/include/rocshmem/rocshmem.hpp index 259b23db97..5d4cd073c7 100644 --- a/projects/rocshmem/include/rocshmem/rocshmem.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem.hpp @@ -80,9 +80,9 @@ __host__ void rocshmem_init(void); /** * @brief Query rocSHMEM context from host API * - * @param[out] ctx Returns ROCSHMEM_CTX_DEFAULT device pointer that users + * @param[out] ctx Returns ROCSHMEM_CTX_DEFAULT device pointer that users * can query from one instance of rocshmem host library and - * use use later for dynamic module initialization in + * use use later for dynamic module initialization in * kernel bitcode device library in the same application */ __host__ void * rocshmem_get_device_ctx(); @@ -91,14 +91,15 @@ __host__ void * rocshmem_get_device_ctx(); * @brief Query rocSHMEM remote symmetric heap pointer * * @param[in] dest local symmetric heap allocation pointer for current pe/device - * + * * @param[in] pe remote PE - * + * * @param[out] ptr Returns remote symmetric heap device pointer from host-side API. * This can be used to issue load/store from custom kernels * instead of using rocshmem device side get/put APIs for RMA operations. */ -__host__ void *rocshmem_ptr(void *dest, int pe); +__host__ void* rocshmem_ptr(const void *dest, int pe); +__device__ ATTR_NO_INLINE void* rocshmem_ptr(const void *dest, int pe); #if defined(HAVE_EXTERNAL_MPI) /** diff --git a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp index 587da8f5c5..9d7f2e5437 100644 --- a/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp +++ b/projects/rocshmem/include/rocshmem/rocshmem_COLL.hpp @@ -775,28 +775,6 @@ __device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave( __device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg( rocshmem_ctx_t ctx, rocshmem_team_t team); -/** - * @brief Query a local pointer to a symmetric data object on the - * specified \pe . Returns an address that may be used to directly reference - * dest on the specified \pe. This address can be accesses with LD/ST ops. - * - * Can be called per thread with no performance penalty. - */ -__device__ ATTR_NO_INLINE void *rocshmem_ptr(const void *dest, int pe); - -/** - * @brief Make all uncacheable GPU data visible to other agents in the sytem. - * - * This only works for data that was explicitly allocated uncacheable on the - * GPU! - * - * Can be called per thread with no performance penalty. - * - * @param[in] GPU-side handle. - * - * @return void - */ - } // namespace rocshmem #endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP