Add 'projects/rocshmem/' from commit '0496586829058af5cfd7f23acda2a6d0040da584'

git-subtree-dir: projects/rocshmem
git-subtree-mainline: 5fd976da70
git-subtree-split: 0496586829
This commit is contained in:
Ameya Keshava Mallya
2026-01-21 20:25:37 +00:00
förälder 5fd976da70 0496586829
incheckning 12ab8df3bc
367 ändrade filer med 81890 tillägg och 0 borttagningar
+1
Visa fil
@@ -0,0 +1 @@
* @avinashkethineedi @akolliasAMD @Yiltan @BKP @abouteiller @edgargabriel @gaoikawa @omor1
+1
Visa fil
@@ -0,0 +1 @@
build/
+18
Visa fil
@@ -0,0 +1,18 @@
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
sphinx:
configuration: docs/conf.py
formats: []
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
tools:
python: "3.10"
+16
Visa fil
@@ -0,0 +1,16 @@
## This is the list of rocSHMEM's significant contributors.
#### This does not necessarily list everyone who has contributed code, especially since many employees of one corporation may be contributing. To see the full list of contributors, see the revision history in source control.
- Khaled Hamidouche
- Brandon Potter
- Michael LeBeane
- Rohit Zambre
- Kishore Punniyamurthy
- Ruchi Shah
- Muhammad A. Awad
- Edgar Gabriel
- Avinash Kethineedi
- Yiltan Temucin
- Aurelien Bouteiller
- Omri Mor
+96
Visa fil
@@ -0,0 +1,96 @@
# Changelog for rocSHMEM
## Unreleased - rocSHMEM 3.x.x for ROCm 7.x.x
### Added
* Added new APIs:
* `rocshmem_TYPENAME_alltoall_wg`
## Unreleased -- rocSHMEM 3.2.1 for ROCm x.x.x
### Added
### Changed
### Removed
### Resolved issues
### Known issues
## rocSHMEM 3.2.0 for ROCm 7.2.0
### Added
* Added the GDA conduit for AMD Pensando IONIC
### Changed
* Dependency libraries are loaded dynamically
* The following APIs now have an implementation for the GDA conduit
* `rocshmem_p`
* fetching atomics `rocshmem_<TYPE>_fetch_<op>`
* collective APIs
* The following APIs now have an implementation for the IPC conduit
* `rocshmem_<TYPE>_atomic_{and,or,xor,swap}`
* `rocshmem_<TYPE>_atomic_fetch_{and,or,xor,swap}`
### Known issues
* Only 64bit rocSHMEM atomic APIs are implemented for the GDA conduit
## rocSHMEM 3.1.0 for ROCm 7.1.1
### Added
* Allow for IPC, RO, GDA backends to be selected at runtime
* Added the GDA conduit for different NIC vendors
* Broadcom BNXT\_RE (Thor 2)
* Mellanox MLX5 (IB and RoCE ConnectX-7)
* Added new APIs:
* `rocshmem_get_device_ctx`
* `rocshmem_ctx_pe_quiet`
* `rocshmem_pe_quiet`
### Changed
* The following APIs have been deprecated:
* `rocshmem_wg_init`
* `rocshmem_wg_finalize`
* `rocshmem_wg_init_thread`
* `rocshmem_ptr` can now return non-null pointer to
a shared memory region when the IPC transport is available to reach that region.
Previously, it would return a null pointer.
* `ROCSHMEM_RO_DISABLE_IPC` was renamed to `ROCSHMEM_DISABLE_MIXED_IPC`.
This environment variable was not documented for prior releases.
It is now documented to inform users who were using this undocumented feature.
### Removed
* rocSHMEM no longer requires rocPRIM and rocThrust as dependencies
* Removed MPI compile-time dependency
### Known issues
* Only a subset of rocSHMEM APIs are implemented for the GDA conduit
## rocSHMEM 3.0.0 for ROCm 7.0.0
### Added
* Added the Reverse Offload conduit
* Added new APIs:
* `rocshmem_ctx_barrier`
* `rocshmem_ctx_barrier_wave`
* `rocshmem_ctx_barrier_wg`
* `rocshmem_barrier_all`
* `rocshmem_barrier_all_wave`
* `rocshmem_barrier_all_wg`
* `rocshmem_ctx_sync`
* `rocshmem_ctx_sync_wave`
* `rocshmem_ctx_sync_wg`
* `rocshmem_sync_all`
* `rocshmem_sync_all_wave`
* `rocshmem_sync_all_wg`
* `rocshmem_init_attr`
* `rocshmem_get_uniqueid`
* `rocshmem_set_attr_uniqueid_args`
* Added dlmalloc based allocator
* Added XNACK support
* Added support for initialization with MPI communicators other than `MPI_COMM_WORLD`
### Changed
* Changed collective APIs to use `_wg` suffix rather than `_wg_` infix
### Resolved Issues
* Resolved segfault in `rocshmem_wg_ctx_create`, now provides nullptr if ctx cannot be created
## rocSHMEM 2.0.1 for ROCm 6.4.2
### Resolved Issues
* Resolved incorrect output for `rocshmem_ctx_my_pe` and `rocshmem_ctx_n_pes`
* Resolved multi-team errors by providing team specific buffers in `rocshmem_ctx_wg_team_sync`
* Resolved missing implementation of `rocshmem_g` for IPC conduit
+278
Visa fil
@@ -0,0 +1,278 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
###############################################################################
# AVOID IN SOURCE BUILD
###############################################################################
# Abort configuration when the build tree is the source tree. The check only
# fires when this project is the top-level one (its own source directory is
# the top-level source directory).
if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR AND
   CMAKE_SOURCE_DIR STREQUAL CMAKE_BINARY_DIR)
  set(MSG "")
  # Explain how to recover before stopping with a fatal error.
  message(STATUS "Warning! Building from the source directory is not recommended")
  message(STATUS "If unintended, please remove 'CMakeCache.txt' and 'CMakeFiles'")
  message(STATUS "and build from a separate directory")
  message(FATAL_ERROR "In-source build")
endif()
###############################################################################
# CONFIGURATION OPTIONS
###############################################################################
# --- Diagnostics ---
option(DEBUG "Enable debug trace" OFF)
option(PROFILE "Enable statistics and timing support" OFF)
# --- Conduit / backend selection ---
option(USE_RO "Enable RO conduit" ON)
option(USE_IPC "Enable IPC support (using HIP)" OFF)
option(USE_GDA "Enable GDA conduit" OFF)
option(USE_THREADS "Enable workgroup threads to share network queues" OFF)
option(USE_WF_COAL "Enable wavefront message coalescing" OFF)
# --- Symmetric heap placement ---
option(USE_HEAP_DEVICE_FINEGRAIN "Heap uses GPU memory in finegrain mode" ON)
option(USE_HEAP_DEVICE_UNCACHED "Heap uses GPU memory in uncached mode" OFF)
option(USE_HEAP_DEVICE_COARSEGRAIN "Heap uses GPU memory in coarsegrain mode" OFF)
option(USE_HEAP_MANAGED "Heap uses managed memory" OFF)
option(USE_HEAP_HOST_HIP "Heap uses pinned host memory allocated with hip api" OFF)
option(USE_HEAP_HOST "Heap uses host memory allocated with malloc/free" OFF)
# --- Device memory allocator ---
option(USE_ALLOC_DLMALLOC "Enable dlmalloc device memory allocator" ON)
option(USE_ALLOC_POW2BINS "Enable legacy Pow2Bins device memory allocator" OFF)
# --- Miscellaneous runtime behaviour ---
option(USE_FUNC_CALL "Force compiler to use function calls on library API" OFF)
option(USE_SHARED_CTX "Request support for shared ctx between WG" OFF)
option(USE_SINGLE_NODE "Enable single node support only." OFF)
option(USE_HDP_FLUSH "Force flush the HDP cache." OFF)
option(USE_HDP_FLUSH_HOST_SIDE "Use a polling thread to flush the HDP cache on the host." OFF)
# --- What gets built ---
option(BUILD_FUNCTIONAL_TESTS "Build the functional tests (Requires MPI)" OFF)
option(BUILD_EXAMPLES "Build the examples" ON)
option(BUILD_UNIT_TESTS "Build the unit tests (Requires MPI)" OFF)
option(BUILD_TESTS_ONLY "Build only tests. Used to link against rocSHMEM in a ROCm Release" OFF)
option(BUILD_TOOLS "Build binary tools (e.g., rocshmem_info)" ON)
option(BUILD_LOCAL_GPU_TARGET_ONLY "Build only for GPUs detected on this machine" OFF)
option(BUILD_CODE_COVERAGE "Build with code coverage flags (gcc only)" OFF)
# --- GDA RDMA providers ---
option(GDA_IONIC "Build for AMD Pensando IONIC RDMA provider" OFF)
option(GDA_BNXT "Build for Broadcom RDMA provider" OFF)
option(GDA_MLX5 "Build for Mellanox MLX5 RDMA provider" OFF)
# Tri-state switch (AUTO/ON/OFF) rather than a boolean option; the STRINGS
# property lets cmake-gui/ccmake present the allowed values as a drop-down.
set(USE_EXTERNAL_MPI AUTO CACHE STRING "Link with an external MPI (required if used MPI is ABI incompatible with Open MPI v5)")
set_property(CACHE USE_EXTERNAL_MPI PROPERTY STRINGS AUTO ON OFF)
###############################################################################
# PROJECT
###############################################################################
# Load the shared project setup script. The path is anchored at
# CMAKE_CURRENT_SOURCE_DIR (not CMAKE_SOURCE_DIR) so that it still resolves
# when this directory is added to a larger build with add_subdirectory().
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/setup_project.cmake)
## Setup VERSION
# The version is parsed out of the public header: the regex looks for a line of
# the form  constexpr char VERSION[] = "MAJOR.MINOR.PATCH";
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/include/rocshmem/rocshmem.hpp" header_text)
if("${header_text}" MATCHES "constexpr char VERSION\\[\\] *= \"([0-9]+)\\.([0-9]+)\\.([0-9]+)\";")
  set(VERSION_STRING ${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3})
else()
  message(FATAL_ERROR "Failed to parse Version")
endif()
message(STATUS "rocSHMEM Version: " "${VERSION_STRING}")
project(rocshmem VERSION ${VERSION_STRING} LANGUAGES CXX)
# ROCm CMake helper modules used for install, packaging and GPU target checks.
find_package(ROCmCMakeBuildTools PATHS /opt/rocm)
include(ROCMCreatePackage)
include(ROCMInstallTargets)
include(ROCMCheckTargetIds)
rocm_setup_version(VERSION ${VERSION_STRING})
#############################################################################
# SET GPU ARCHITECTURES
#############################################################################
include(cmake/rocm_local_targets.cmake)
# Architectures compiled for when the user does not specify GPU_TARGETS.
set(DEFAULT_GPUS
  gfx90a:xnack-;
  gfx90a:xnack+;
  gfx1100;
  gfx1201;
  gfx942)
# gfx950 is only added for ROCm major versions above 6. The variable is named
# rather than expanded with ${...} so that an unset ROCM_MAJOR_VERSION makes
# the test false instead of producing a malformed if() expression.
if(ROCM_MAJOR_VERSION GREATER 6)
  list(APPEND DEFAULT_GPUS gfx950)
endif()
# The environment may also request a local-GPU-only build.
if($ENV{BUILD_LOCAL_GPU_TARGET_ONLY})
  set(BUILD_LOCAL_GPU_TARGET_ONLY ON)
endif()
if (BUILD_LOCAL_GPU_TARGET_ONLY)
  message(STATUS "Building only for local GPU target")
  if (COMMAND rocm_local_targets)
    # Detect into a scratch variable: rocm_local_targets() reports failure by
    # setting its output to "NOTFOUND", which must not replace the default list.
    rocm_local_targets(LOCAL_GPUS)
    if (LOCAL_GPUS)
      set(DEFAULT_GPUS "${LOCAL_GPUS}")
    else()
      message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
    endif()
  else()
    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
  endif()
endif()
set(DEFAULT_GPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING
  "Target default GPUs if GPU_TARGETS is not defined.")
# Filter the candidate list down to what the installed toolchain supports,
# when the ROCm helper for that is available.
if (COMMAND rocm_check_target_ids)
  message(STATUS "Checking for ROCm support for GPU targets: " "${DEFAULT_GPU_TARGETS}")
  rocm_check_target_ids(SUPPORTED_GPUS TARGETS ${DEFAULT_GPU_TARGETS})
else()
  message(WARNING "Unable to check for supported GPU targets.")
  set(SUPPORTED_GPUS ${DEFAULT_GPU_TARGETS})
endif()
set(GPU_TARGETS "${SUPPORTED_GPUS}" CACHE STRING "GPU architectures to compile for")
message(STATUS "Compiling for ${GPU_TARGETS}")
###############################################################################
# CREATE ROCSHMEM LIBRARY
###############################################################################
if (NOT BUILD_TESTS_ONLY)
add_library(${PROJECT_NAME})
add_library(roc::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
add_subdirectory(src)
#############################################################################
# PACKAGE DEPENDENCIES
#############################################################################
# USE_EXTERNAL_MPI is tri-state: OFF skips detection, AUTO uses MPI when it is
# found, ON additionally makes a missing MPI a hard error.
if (USE_EXTERNAL_MPI STREQUAL "OFF")
  message ("-- External MPI detection disabled by user")
else()
  find_package(MPI)
endif()
# Record the detection result; the unit tests are switched off without MPI.
if (NOT MPI_FOUND)
  set(HAVE_EXTERNAL_MPI OFF)
  set(BUILD_UNIT_TESTS OFF)
else()
  set(HAVE_EXTERNAL_MPI ON)
endif()
if (USE_EXTERNAL_MPI STREQUAL "ON" AND NOT HAVE_EXTERNAL_MPI)
  message(FATAL_ERROR "External MPI support requested but MPI support not found. Build Aborted")
endif()
# Mandatory ROCm dependencies: the HIP runtime and the HSA runtime.
find_package(hip REQUIRED PATHS /opt/rocm)
find_package(hsa-runtime64 REQUIRED)
# Ask FindThreads to prefer pthreads and to use the -pthread flag form.
set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
set(THREADS_PREFER_PTHREAD_FLAG TRUE)
find_package(Threads REQUIRED)
# Generate the configuration header from its template. Both paths are
# relative: the input to the current source directory, the output to the
# current binary directory.
configure_file(cmake/rocshmem_config.h.in include/rocshmem/rocshmem_config.h)
#############################################################################
# LINKING AND INCLUDE DIRECTORIES
#############################################################################
# -fgpu-rdc is PUBLIC, so it is also applied to targets linking against
# the library.
target_compile_options(
  ${PROJECT_NAME}
  PUBLIC
  -fgpu-rdc
)
# The generated rocshmem_config.h is written by configure_file() relative to
# CMAKE_CURRENT_BINARY_DIR, so the build-interface include paths use that same
# directory. CMAKE_BINARY_DIR only coincides with it for a top-level build.
target_include_directories(
  ${PROJECT_NAME}
  PUBLIC
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include> # rocshmem_config.h
  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include/rocshmem> # rocshmem_config.h from rocshmem.hpp
  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
  $<INSTALL_INTERFACE:include>
)
# MPI::MPI_CXX is linked only when HAVE_EXTERNAL_MPI is true; the generator
# expression expands to nothing otherwise. The trailing -fgpu-rdc is passed
# through as a link flag.
target_link_libraries(
  ${PROJECT_NAME}
  PUBLIC
  $<$<BOOL:${HAVE_EXTERNAL_MPI}>:MPI::MPI_CXX>
  Threads::Threads
  hip::device
  hip::host
  dl
  hsa-runtime64::hsa-runtime64
  -fgpu-rdc
)
# The variable is named rather than expanded with ${...}: if the ROCm version
# could not be determined, the test is simply false instead of turning into
# the malformed expression "if( LESS 7)".
if(ROCM_MAJOR_VERSION LESS 7)
  # ROCm 6.x requires us to explicitly enable warp sync builtins
  target_compile_definitions(${PROJECT_NAME} PRIVATE HIP_ENABLE_WARP_SYNC_BUILTINS=1)
endif()
#############################################################################
# INSTALL
#############################################################################
include(ROCMInstallTargets)
include(ROCMCreatePackage)
rocm_install(TARGETS rocshmem)
# Public headers shipped from the source tree. Paths are anchored at the
# CURRENT source/binary directories so the rules stay correct when this
# project is configured as a subdirectory of a larger build.
rocm_install(
  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)
# Generated configuration header; configure_file() places it under the
# current binary directory.
rocm_install(
  FILES "${CMAKE_CURRENT_BINARY_DIR}/include/rocshmem/rocshmem_config.h"
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/rocshmem
)
if (BUILD_TOOLS)
  rocm_install(
    PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/src/tools/rocshmem_info"
    DESTINATION ${CMAKE_INSTALL_BINDIR}
  )
endif()
# Packages the generated rocSHMEM package declares as dependencies.
rocm_package_add_dependencies(
DEPENDS
hsa-rocr
hip-runtime-amd
rocm-dev
)
# Export the library target under the roc:: namespace.
rocm_export_targets(
TARGETS roc::rocshmem
NAMESPACE roc::
)
include(ROCMPackageConfigHelpers)
include(ROCMClients)
# Package components for the client binaries and the tests.
rocm_package_setup_component(clients)
rocm_package_setup_client_component(tests PACKAGE_NAME tests)
# Package metadata.
rocm_create_package(
NAME "rocSHMEM"
DESCRIPTION "ROCm OpenSHMEM (rocSHMEM)"
MAINTAINER "rocSHMEM Maintainer <rocshmem-maintainer@amd.com>"
)
endif (NOT BUILD_TESTS_ONLY)
###############################################################################
# TEST SUBDIRECTORIES
###############################################################################
# The tests directory is always entered, regardless of the BUILD_*_TESTS
# options set in this file.
add_subdirectory(tests)
# Examples are built only when BUILD_EXAMPLES is enabled.
if (BUILD_EXAMPLES)
add_subdirectory(examples)
endif()
+59
Visa fil
@@ -0,0 +1,59 @@
## How to fork from us
To keep our development fast and conflict free, we recommend that you [fork](https://github.com/ROCm/rocSHMEM/fork) our repository and start your work from our `develop` branch in your private repository.
Afterwards, git clone your repository to your local machine. But that is not it! To keep track of the original develop repository, add it as another remote.
```
git remote add mainline https://github.com/ROCm/rocSHMEM.git
git checkout dev
```
As always in git, start a new branch with
```
git checkout -b topic-<yourFeatureName>
```
and apply your changes there.
## How to contribute to rocSHMEM
### Did you find a bug?
- Ensure the bug was not already reported by searching on GitHub under [Issues](https://github.com/ROCm/rocSHMEM/issues).
- If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/ROCm/rocSHMEM/issues/new).
### Did you write a patch that fixes a bug?
- Open a new GitHub [pull request](https://github.com/ROCm/rocSHMEM/compare) with the patch.
- Ensure the PR description clearly describes the problem and solution. If there is an existing GitHub issue open describing this bug, please include it in the description so we can close it.
- Ensure the PR is based on the `dev` branch of the rocSHMEM GitHub repository.
- rocSHMEM requires new commits to include a "Signed-off-by" token in the commit message (typically enabled via the `git commit -s` option), indicating your agreement to the project's [Developer's Certificate of Origin](https://developercertificate.org/) and compatibility with the project [LICENSE](https://github.com/ROCm/rocSHMEM/blob/main/LICENSE):
> (a) The contribution was created in whole or in part by me and I
> have the right to submit it under the open source license
> indicated in the file; or
>
> (b) The contribution is based upon previous work that, to the best
> of my knowledge, is covered under an appropriate open source
> license and I have the right under that license to submit that
> work with modifications, whether created in whole or in part
> by me, under the same open source license (unless I am
> permitted to submit under a different license), as indicated
> in the file; or
>
> (c) The contribution was provided directly to me by some other
> person who certified (a), (b) or (c) and I have not modified
> it.
>
> (d) I understand and agree that this project and the contribution
> are public and that a record of the contribution (including all
> personal information I submit with it, including my sign-off) is
> maintained indefinitely and may be redistributed consistent with
> this project or the open source license(s) involved.
+23
Visa fil
@@ -0,0 +1,23 @@
MIT License
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
SPDX-License-Identifier: MIT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
+33
Visa fil
@@ -0,0 +1,33 @@
# ROCm OpenSHMEM (rocSHMEM)
The ROCm OpenSHMEM (rocSHMEM) runtime is part of an AMD and AMD Research
initiative to provide GPU-centric networking through an OpenSHMEM-like interface.
This intra-kernel networking library simplifies application
code complexity and enables more fine-grained communication/computation
overlap than traditional host-driven networking.
rocSHMEM uses a single symmetric heap that is allocated on GPU memories.
There are currently three backends for rocSHMEM:
IPC, Reverse Offload (RO), and GDA.
The backends primarily differ in their implementations of intra-kernel networking.
The IPC backend implements communication primitives using load/store operations issued from the GPU.
The Reverse Offload (RO) backend has the GPU runtime forward rocSHMEM networking operations
to the host-side runtime, which calls into a traditional MPI or OpenSHMEM
implementation. This forwarding of requests is transparent to the
programmer, who only sees the GPU-side interface.
The GPU Direct Async (GDA) backend allows rocSHMEM to issue communication operations to the NIC directly from device-side code running within the GPU, without involving a CPU proxy.
During initialization we prepare network resources for each NIC vendor using the vendor-appropriate
Direct Verbs APIs.
When calling the device-side rocSHMEM API, the device threads are used to construct Work Queue Entries (WQEs) and post the communication to the send queues of the NIC directly.
Completion Queues (CQs) are polled from the device-side code as well.
The RO and GDA backends are provided as-is with limited support from AMD or AMD Research.
## Installation and using rocSHMEM
For information on how to install and use rocSHMEM,
[please see our documentation](https://rocm.docs.amd.com/projects/rocSHMEM/en/latest/).
+68
Visa fil
@@ -0,0 +1,68 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Find pmix installation.
# Different scenarios need to be covered:
# - pmix at user-provided location (i.e., in PMIX_ROOT)
# - pmix installed as part of Open MPI, i.e., in the MPI installation directories
# - pmix deployed with linux distros, Slurm, etc.
# find_package_handle_standard_args() is defined by this module. Include it
# explicitly instead of relying on another find module having loaded it.
include(FindPackageHandleStandardArgs)
find_package(PkgConfig QUIET)
if (PkgConfig_FOUND)
  # Figure out and prepend the install dir for MPI
  string(REGEX REPLACE "/include$" "" mpi_dir "${MPI_CXX_HEADER_DIR}")
  foreach (mpiroot "${MPI_ROOT}" "$ENV{MPI_ROOT}" "${mpi_dir}")
    if (mpiroot)
      set(ENV{PKG_CONFIG_PATH} "${mpiroot}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
    endif()
  endforeach()
  # prepend PMIX_ROOT
  # Each root is prepended in turn, so later entries in the list end up
  # earlier in PKG_CONFIG_PATH.
  foreach (pmixroot "${PMIX_ROOT}" "$ENV{PMIX_ROOT}" "${PMIx_ROOT}" "$ENV{PMIx_ROOT}")
    if (pmixroot)
      set(ENV{PKG_CONFIG_PATH} "${pmixroot}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
    endif()
  endforeach()
  pkg_check_modules(PC_PMIX QUIET pmix)
endif()
# Locate the header first; the library is then searched only beneath the
# prefix that owns the header (NO_DEFAULT_PATH), so the two always match.
find_path(PMIX_INCLUDE_DIR pmix.h
  HINTS ${PC_PMIX_INCLUDE_DIRS} ${MPI_CXX_HEADER_DIR} ${MPI_ROOT} $ENV{MPI_ROOT}
  PATH_SUFFIXES include)
if (PMIX_INCLUDE_DIR)
  string(REGEX REPLACE "/include$" "" pmix_dir ${PMIX_INCLUDE_DIR})
  find_library(PMIX_LIBRARY pmix PATHS ${pmix_dir} PATH_SUFFIXES lib lib64 NO_DEFAULT_PATH)
endif()
find_package_handle_standard_args(PMIx DEFAULT_MSG
  PMIX_LIBRARY PMIX_INCLUDE_DIR)
mark_as_advanced(PMIX_LIBRARY PMIX_INCLUDE_DIR)
# Create the imported target only once: add_library() on an existing target
# name is an error, which would break a second find_package(PMIx) call.
if (PMIx_FOUND AND NOT TARGET PMIx::pmix)
  add_library(PMIx::pmix UNKNOWN IMPORTED)
  set_target_properties(PMIx::pmix PROPERTIES
    IMPORTED_LOCATION "${PMIX_LIBRARY}"
    INTERFACE_COMPILE_OPTIONS "${PC_PMIX_CFLAGS_OTHER}"
    INTERFACE_INCLUDE_DIRECTORIES "${PMIX_INCLUDE_DIR}"
  )
endif()
@@ -0,0 +1,52 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Find available local ROCM targets
# NOTE: This will eventually be part of ROCm-CMake and should be removed at that time
# rocm_local_targets(<VARIABLE>)
#
# Runs rocm_agent_enumerator and stores the de-duplicated list of reported
# agents, excluding "gfx000", in <VARIABLE> in the caller's scope.
# <VARIABLE> is set to "NOTFOUND" when the tool is missing, exits with a
# non-zero status, or reports no usable agent.
function(rocm_local_targets VARIABLE)
  # Failure value by default; overwritten only on success.
  set(${VARIABLE} "NOTFOUND" PARENT_SCOPE)
  find_program(_rocm_agent_enumerator rocm_agent_enumerator HINTS /opt/rocm/bin ENV ROCM_PATH)
  if(NOT _rocm_agent_enumerator)
    return()
  endif()
  execute_process(
    COMMAND "${_rocm_agent_enumerator}"
    RESULT_VARIABLE _found_agents
    OUTPUT_VARIABLE _rocm_agents
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET
  )
  if(NOT _found_agents EQUAL 0)
    return()
  endif()
  # One agent per output line: turn the text into a CMake list.
  string(REPLACE "\n" ";" _rocm_agents "${_rocm_agents}")
  # Start from an empty list so a same-named variable in the caller's scope
  # cannot leak into the result.
  set(result "")
  foreach(agent IN LISTS _rocm_agents)
    string(STRIP "${agent}" agent)
    # Skip blank lines (they would become empty list elements) and the
    # "gfx000" entry.
    if(NOT agent STREQUAL "" AND NOT agent STREQUAL "gfx000")
      list(APPEND result "${agent}")
    endif()
  endforeach()
  if(result)
    list(REMOVE_DUPLICATES result)
    set(${VARIABLE} "${result}" PARENT_SCOPE)
  endif()
endfunction()
@@ -0,0 +1,48 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
 * Build-time feature switches for rocSHMEM.
 *
 * This is a template processed by CMake's configure_file(): each entry below
 * becomes a define of the same name when the corresponding CMake variable is
 * true at configure time, and is emitted as a commented-out undef otherwise.
 * NOTE(review): the names mirror the options declared in the top-level
 * CMakeLists.txt; HAVE_EXTERNAL_MPI is computed there from MPI detection.
 */
#cmakedefine DEBUG
#cmakedefine PROFILE
#cmakedefine USE_RO
#cmakedefine USE_IPC
#cmakedefine USE_GDA
#cmakedefine USE_THREADS
#cmakedefine USE_SHARED_CTX
#cmakedefine USE_WF_COAL
#cmakedefine USE_HEAP_DEVICE_FINEGRAIN
#cmakedefine USE_HEAP_DEVICE_UNCACHED
#cmakedefine USE_HEAP_DEVICE_COARSEGRAIN
#cmakedefine USE_HEAP_MANAGED
#cmakedefine USE_HEAP_HOST_HIP
#cmakedefine USE_HEAP_HOST
#cmakedefine USE_ALLOC_DLMALLOC
#cmakedefine USE_ALLOC_POW2BINS
#cmakedefine USE_FUNC_CALL
#cmakedefine USE_SINGLE_NODE
#cmakedefine USE_HDP_FLUSH
#cmakedefine USE_HDP_FLUSH_HOST_SIDE
#cmakedefine GDA_IONIC
#cmakedefine GDA_BNXT
#cmakedefine GDA_MLX5
#cmakedefine HAVE_EXTERNAL_MPI
@@ -0,0 +1,82 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
###############################################################################
# DEFAULT BUILD TYPE
###############################################################################
# Cache default only: a CACHE set without FORCE does not overwrite a value
# the user already supplied (e.g. -DCMAKE_BUILD_TYPE=Debug).
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"build type: Release, Debug, RelWithDebInfo, MinSizeRel")
###############################################################################
# DEPENDENCIES
###############################################################################
# Try to establish ROCM_PATH (for find_package)
#==================================================================================================
# Resolution order for ROCM_PATH:
#   1. the CMake variable (e.g. -DROCM_PATH=...),
#   2. the ROCM_PATH environment variable,
#   3. the conventional /opt/rocm location.
# Step 2 matters because the environment variable is overwritten below; an
# exported ROCM_PATH would otherwise be silently replaced by /opt/rocm.
if(DEFINED ROCM_PATH)
  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
elseif(NOT "$ENV{ROCM_PATH}" STREQUAL "")
  set(ROCM_PATH "$ENV{ROCM_PATH}")
  message(STATUS "ROCM_PATH found: ${ROCM_PATH}")
else()
  # Guess default location
  set(ROCM_PATH "/opt/rocm")
  message(WARNING "Unable to find ROCM_PATH: Falling back to ${ROCM_PATH}")
endif()
# Publish the resolved path through the environment for later lookups that
# read ENV ROCM_PATH.
set(ENV{ROCM_PATH} ${ROCM_PATH})
## Check for ROCm version
# The ROCm version is read from <ROCM_PATH>/.info/version. Check for the file
# up front so a wrong ROCM_PATH gives a clear error instead of a raw
# file(READ) failure.
set(_rocm_version_file "${ROCM_PATH}/.info/version")
if(ROCM_PATH AND EXISTS "${_rocm_version_file}")
  message(STATUS "Reading ROCM version from ${_rocm_version_file}")
  file(READ "${_rocm_version_file}" rocm_version_string)
else()
  message(FATAL_ERROR "Could not determine ROCM version: ${_rocm_version_file} not found (set ROCM_PATH to a valid installation)")
endif()
# The input is quoted so that an empty or multi-line version file is still
# passed to string() as exactly one argument.
string(REGEX MATCH "([0-9]+)\\.([0-9]+)\\.([0-9]+)" rocm_version_matches "${rocm_version_string}")
if (rocm_version_matches)
  set(ROCM_MAJOR_VERSION ${CMAKE_MATCH_1})
  set(ROCM_MINOR_VERSION ${CMAKE_MATCH_2})
  set(ROCM_PATCH_VERSION ${CMAKE_MATCH_3})
  message(STATUS "ROCm version: ${ROCM_MAJOR_VERSION}.${ROCM_MINOR_VERSION}.${ROCM_PATCH_VERSION}")
else()
  # Not fatal: the ROCM_*_VERSION variables simply remain unset.
  message(WARNING "Failed to extract ROCm version.")
endif()
# Prepend every candidate root that exists as a directory to
# CMAKE_PREFIX_PATH. Each hit is prepended, so roots later in the list end up
# earlier in the search path.
foreach (root ${hip_ROOT} $ENV{hip_ROOT} ${ROCM_ROOT} $ENV{ROCM_ROOT} ${ROCM_PATH} $ENV{ROCM_PATH})
if (IS_DIRECTORY ${root})
list(PREPEND CMAKE_PREFIX_PATH ${root})
endif()
endforeach()
# Default the C++ compiler to hipcc unless CMAKE_CXX_COMPILER is already
# defined.
if (NOT DEFINED CMAKE_CXX_COMPILER)
find_program(CMAKE_CXX_COMPILER hipcc PATHS /opt/rocm)
endif()
###############################################################################
# GLOBAL COMPILE FLAGS
###############################################################################
# Require C++20 with compiler-specific extensions disabled.
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
# Debug builds: no optimisation, gdb-oriented debug information.
set(CMAKE_CXX_FLAGS_DEBUG "-O0 -ggdb")
# Make the project's own cmake/ directory searchable by include() and
# find_package() in module mode.
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+5
Visa fil
@@ -0,0 +1,5 @@
_build/
_doxygen/
doxygen/html/
doxygen/xml/
sphinx/_toc.yml
+21
Visa fil
@@ -0,0 +1,21 @@
# Building the rocSHMEM documentation
## macOS
To build html documentation locally:
```
brew install doxygen sphinx-doc
pip3.10 install -r ./sphinx/requirements.txt
python3.10 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
open _build/html/index.html
```
To build pdf documentation we require a LaTeX installation on your machine.
Once LaTeX is installed, you may run the following:
```
pip3.10 install -r ./sphinx/requirements.txt
sphinx-build -M latexpdf . _build
open _build/latex/rocshmem.pdf
```
+418
Visa fil
@@ -0,0 +1,418 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-amo:
---------------------------
Atomic memory operations
---------------------------
You can call these functions from divergent control paths at the per-thread level.
ROCSHMEM_ATOMIC_FETCH
---------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch(TYPE *source, int pe)
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch(rocshmem_ctx_t ctx, TYPE *source, int pe)
:param ctx: Context with which to perform this operation.
:param source: Source address. Must be an address on the symmetric heap.
:param pe: PE of the remote process.
:returns: The value of ``source``.
**Description:**
This function atomically returns the value of ``source`` on ``pe`` to the calling PE.
Valid ``TYPENAME`` and ``TYPE`` values are listed in EXTENDED_AMO_TYPES_.
SHMEM_ATOMIC_SET
----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_set(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_set(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically set.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This function atomically sets the value ``value`` to ``dest`` on ``pe``.
Valid ``TYPENAME`` and ``TYPE`` values are listed in EXTENDED_AMO_TYPES_.
SHMEM_ATOMIC_COMPARE_SWAP
-------------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_compare_swap(TYPE *dest, TYPE cond, TYPE value, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_compare_swap(rocshmem_ctx_t ctx, TYPE *dest, TYPE cond, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param cond: The value to compare ``dest`` against.
:param value: The value to be atomically swapped.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically compares the value in ``dest`` with ``cond``. If they are equal, it stores ``value`` in ``dest``.
The operation returns the older value of ``dest`` to the calling PE.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in STANDARD_AMO_TYPES_.
SHMEM_ATOMIC_SWAP
-----------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_swap(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_swap(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically swapped.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically swaps the value ``value`` with ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in EXTENDED_AMO_TYPES_.
SHMEM_ATOMIC_FETCH_INC
----------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch_inc(TYPE *dest, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch_inc(rocshmem_ctx_t ctx, TYPE *dest, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically adds ``1`` to ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in STANDARD_AMO_TYPES_.
SHMEM_ATOMIC_INC
----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_inc(TYPE *dest, int pe);
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_inc(rocshmem_ctx_t ctx, TYPE *dest, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param pe: PE of the remote process.
:return: None.
**Description:**
This function atomically adds ``1`` to ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in STANDARD_AMO_TYPES_.
SHMEM_ATOMIC_FETCH_ADD
----------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch_add(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch_add(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically added.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically adds ``value`` to ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in STANDARD_AMO_TYPES_.
SHMEM_ATOMIC_ADD
----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_add(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_add(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically added.
:param pe: PE of the remote process.
:return: None.
**Description:**
This function atomically adds ``value`` to ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values can be seen in STANDARD_AMO_TYPES_.
SHMEM_ATOMIC_FETCH_AND
----------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch_and(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch_and(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``AND``.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically bitwise-and ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
SHMEM_ATOMIC_AND
----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_and(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_and(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``AND``.
:param pe: PE of the remote process.
:return: None
**Description:**
This function atomically bitwise-and ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
SHMEM_ATOMIC_FETCH_OR
----------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch_or(TYPE *dest, TYPE value, int pe)
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch_or(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``OR``.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically bitwise-or ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
SHMEM_ATOMIC_OR
---------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_or(TYPE *dest, TYPE value, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_or(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``OR``.
:param pe: PE of the remote process.
:return: None.
**Description:**
This function atomically bitwise-or ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
SHMEM_ATOMIC_FETCH_XOR
----------------------
.. cpp:function:: __device__ TYPE rocshmem_TYPENAME_atomic_fetch_xor(TYPE *dest, TYPE value, int pe);
.. cpp:function:: __device__ TYPE rocshmem_ctx_TYPENAME_atomic_fetch_xor(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe);
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``XOR``.
:param pe: PE of the remote process.
:return: The old value of ``dest``.
**Description:**
This function atomically bitwise-xor ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
SHMEM_ATOMIC_XOR
----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_atomic_xor(TYPE *dest, TYPE value, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_atomic_xor(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: The value to be atomically ``XOR``.
:param pe: PE of the remote process.
:return: None.
**Description:**
This function atomically bitwise-xor ``value`` to the value at ``dest`` on ``pe``.
The operation is blocking.
Valid ``TYPENAME`` and ``TYPE`` values are listed in BITWISE_AMO_TYPES_.
Supported AMO data types
------------------------
.. _STANDARD_AMO_TYPES:
.. list-table:: Standard AMO Data Types
:widths: 10 20 20
:header-rows: 1
* - TYPE
- TYPENAME
- Supported
* - int
- int
- Yes
* - long
- long
- Yes
* - long long
- longlong
- Yes
* - unsigned int
- uint
- Yes
* - unsigned long
- ulong
- Yes
* - unsigned long long
- ulonglong
- Yes
* - int32_t
- int32
- Yes
* - int64_t
- int64
- Yes
* - uint32_t
- uint32
- Yes
* - uint64_t
- uint64
- Yes
* - size_t
- size
- Yes
* - ptrdiff_t
- ptrdiff
- Yes
.. _EXTENDED_AMO_TYPES:
.. list-table:: Extended AMO Data Types
:widths: 10 20 20
:header-rows: 1
* - TYPE
- TYPENAME
- Supported
* - float
- float
- Yes
* - double
- double
- Yes
* - int
- int
- Yes
* - long
- long
- Yes
* - long long
- longlong
- Yes
* - unsigned int
- uint
- Yes
* - unsigned long
- ulong
- Yes
* - unsigned long long
- ulonglong
- Yes
* - int32_t
- int32
- Yes
* - int64_t
- int64
- Yes
* - uint32_t
- uint32
- Yes
* - uint64_t
- uint64
- Yes
* - size_t
- size
- Yes
* - ptrdiff_t
- ptrdiff
- Yes
.. _BITWISE_AMO_TYPES:
.. list-table:: Bitwise AMO Data Types
:widths: 10 20 20
:header-rows: 1
* - TYPE
- TYPENAME
- Supported
* - unsigned int
- uint
- Yes
* - unsigned long
- ulong
- Yes
* - unsigned long long
- ulonglong
- Yes
* - int32_t
- int32
- Yes
* - int64_t
- int64
- Yes
* - uint32_t
- uint32
- Yes
* - uint64_t
- uint64
- Yes
+323
Visa fil
@@ -0,0 +1,323 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-coll:
---------------------------
Collective routines
---------------------------
ROCSHMEM_BARRIER_ALL
--------------------
.. cpp:function:: __device__ void rocshmem_barrier_all()
.. cpp:function:: __device__ void rocshmem_barrier_all_wave()
.. cpp:function:: __device__ void rocshmem_barrier_all_wg()
:returns: None.
**Description:**
This routine performs a collective barrier across all PEs in the system.
The caller is blocked until the barrier is resolved and all updates local and remote are completed.
These APIs should be called from only one thread/wavefront/workgroup within the grid to avoid undefined behavior.
ROCSHMEM_BARRIER_ALL_ON_STREAM
-------------------------------
.. cpp:function:: __host__ void rocshmem_barrier_all_on_stream(hipStream_t stream)
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a collective barrier operation on a HIP stream. The barrier is performed
across all PEs in the system. The operation is enqueued on the specified stream and will execute
asynchronously. The caller must synchronize the stream (e.g., using ``hipStreamSynchronize``)
to ensure completion.
ROCSHMEM_BARRIER
----------------
.. cpp:function:: __device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team)
.. cpp:function:: __device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team)
.. cpp:function:: __device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team)
:param ctx: Context with which to perform this operation.
:param team: Team with which to perform this operation.
:returns: None.
**Description:**
This routine performs a collective barrier between all PEs in the team.
The caller is blocked until the barrier is resolved.
ROCSHMEM_TEAM_SYNC
------------------
.. cpp:function:: __device__ void rocshmem_ctx_sync(rocshmem_ctx_t ctx, rocshmem_team_t team)
.. cpp:function:: __device__ void rocshmem_ctx_sync_wave(rocshmem_ctx_t ctx, rocshmem_team_t team)
.. cpp:function:: __device__ void rocshmem_ctx_sync_wg(rocshmem_ctx_t ctx, rocshmem_team_t team)
:param ctx: Context with which to perform this operation.
:param team: Team with which to perform this operation.
:returns: None.
**Description:**
This routine registers the arrival of a PE at a barrier.
The caller is blocked until the synchronization is resolved.
Unlike the ``shmem_barrier_all`` routine, ``shmem_team_sync`` only ensures the
completion and visibility of previously issued memory stores, but does not
ensure the completion of remote memory updates issued via OpenSHMEM routines.
ROCSHMEM_SYNC_ALL
-----------------
.. cpp:function:: __device__ void rocshmem_sync_all()
.. cpp:function:: __device__ void rocshmem_sync_all_wave()
.. cpp:function:: __device__ void rocshmem_sync_all_wg()
:returns: None.
**Description:**
These routines behave the same way as ``rocshmem_team_sync_*`` when called on the world team.
These APIs should be called from only one thread/wavefront/workgroup within the grid to avoid undefined behavior.
ROCSHMEM_ALLTOALL
-----------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_alltoall_wg(rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_alltoall_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the
symmetric heap.
:param source: Source address. Must be an address on the symmetric
heap.
:param nelems: Number of data blocks transferred per pair of PEs.
:returns: None.
**Description:**
This routine exchanges a fixed amount of contiguous data blocks between all pairs
of PEs participating in the collective routine.
This function must be called as a work-group collective.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`RMA_TYPES`.
ROCSHMEM_ALLTOALLMEM_ON_STREAM
-------------------------------
.. cpp:function:: __host__ void rocshmem_alltoallmem_on_stream(rocshmem_team_t team, void *dest, const void *source, size_t size, hipStream_t stream)
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param size: Number of bytes to transfer per pair of PEs.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues an alltoall collective operation on a HIP stream. The function
exchanges a fixed amount of contiguous data blocks between all pairs of PEs participating
in the collective routine. The operation is enqueued on the specified stream and will
execute asynchronously. The caller must synchronize the stream (e.g., using
``hipStreamSynchronize``) to ensure completion.
This function creates a separate context for each workgroup to avoid contention on the
default context, allowing parallel execution across multiple streams.
ROCSHMEM_BROADCAST
------------------
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_broadcast_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems, int pe_root)
:param ctx: Context with which to perform this collective.
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the
symmetric heap.
:param source: Source address. Must be an address on the symmetric
heap.
:param nelems: Number of elements to broadcast.
:param pe_root: Root PE (relative to team) from which to broadcast.
:returns: None.
**Description:**
This routine performs a broadcast across PEs in the team.
The caller is blocked until the broadcast completes.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`RMA_TYPES`.
ROCSHMEM_BROADCASTMEM_ON_STREAM
--------------------------------
.. cpp:function:: __host__ void rocshmem_broadcastmem_on_stream(rocshmem_team_t team, void *dest, const void *source, size_t nelems, int pe_root, hipStream_t stream)
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: Number of bytes to broadcast.
:param pe_root: Root PE (relative to team) from which to broadcast.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a broadcast collective operation on a HIP stream. The function broadcasts
data from the root PE to all other PEs participating in the collective routine. The operation
is enqueued on the specified stream and will execute asynchronously. The caller must synchronize
the stream (e.g., using ``hipStreamSynchronize``) to ensure completion.
This function creates a separate context for each workgroup to avoid contention on the
default context, allowing parallel execution across multiple streams.
ROCSHMEM_FCOLLECT
-----------------
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_fcollect_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nelems)
:param ctx: Context with which to perform this collective.
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the
symmetric heap.
:param source: Source address. Must be an address on the symmetric
heap.
:param nelems: Number of data blocks transferred per pair of PEs.
:returns: None.
**Description:**
This routine concatenates blocks of data from multiple PEs to an array in every
PE participating in the collective routine.
ROCSHMEM_REDUCTION
------------------
.. cpp:function:: __device__ int rocshmem_ctx_TYPENAME_OPNAME_reduce_wg(rocshmem_ctx_t ctx, rocshmem_team_t team, TYPE *dest, const TYPE *source, int nreduce)
:param ctx: Context with which to perform this collective.
:param team: The team participating in the collective.
:param dest: Destination address. Must be an address on the
symmetric heap.
:param source: Source address. Must be an address on the symmetric
heap.
:param nreduce: Number of elements to reduce.
:returns: Zero on successful local completion. Nonzero otherwise.
**Description:**
This routine performs an allreduce operation across PEs in the team.
Valid ``TYPENAME``, ``TYPE``, and ``OPNAME`` values are listed in :ref:`REDUCE_TYPES`.
Supported reduction types and operations
----------------------------------------
.. _REDUCE_TYPES:
.. list-table:: Reduction Types, Names and Operations
:widths: 20 20 20 20
:header-rows: 1
* - TYPE
- TYPENAME
- OPNAME
- Supported
* - char
- char
- max, min, sum, prod
- No
* - signed char
- schar
- max, min, sum, prod
- No
* - short
- short
- max, min, sum, prod
- Yes
* - int
- int
- max, min, sum, prod
- Yes
* - long
- long
- max, min, sum, prod
- Yes
* - long long
- longlong
- max, min, sum, prod
- Yes
* - ptrdiff_t
- ptrdiff
- max, min, sum, prod
- No
* - unsigned char
- uchar
- and, or, xor, max, min, sum, prod
- No
* - unsigned short
- ushort
- and, or, xor, max, min, sum, prod
- No
* - unsigned int
- uint
- and, or, xor, max, min, sum, prod
- No
* - unsigned long
- ulong
- and, or, xor, max, min, sum, prod
- No
* - unsigned long long
- ulonglong
- and, or, xor, max, min, sum, prod
- No
* - int8_t
- int8
- and, or, xor, max, min, sum, prod
- No
* - int16_t
- int16
- and, or, xor, max, min, sum, prod
- No
* - int32_t
- int32
- and, or, xor, max, min, sum, prod
- No
* - int64_t
- int64
- and, or, xor, max, min, sum, prod
- No
* - uint8_t
- uint8
- and, or, xor, max, min, sum, prod
- No
* - uint16_t
- uint16
- and, or, xor, max, min, sum, prod
- No
* - uint32_t
- uint32
- and, or, xor, max, min, sum, prod
- No
* - uint64_t
- uint64
- and, or, xor, max, min, sum, prod
- No
* - size_t
- size
- and, or, xor, max, min, sum, prod
- No
* - float
- float
- max, min, sum, prod
- Yes
* - double
- double
- max, min, sum, prod
- Yes
* - long double
- longdouble
- max, min, sum, prod
- No
* - double _Complex
- complexd
- sum, prod
- No
* - float _Complex
- complexf
- sum, prod
- No
+59
Visa fil
@@ -0,0 +1,59 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-ctx:
-----------------------------------
Context management routines
-----------------------------------
ROCSHMEM_CTX_CREATE
-------------------
.. cpp:function:: __device__ int rocshmem_wg_ctx_create(int64_t options, rocshmem_ctx_t *ctx)
.. cpp:function:: __device__ int rocshmem_wg_team_create_ctx(rocshmem_team_t team, long options, rocshmem_ctx_t *ctx)
:param team: Team handle to derive the context from.
:param options: Options for context creation. Ignored in current design; use the value ``0``.
:param ctx: A handle to the newly created context.
:returns: All threads return ``0`` if the context was created successfully.
If any thread returns a nonzero value, the operation failed, ``ctx`` is set to ``ROCSHMEM_CTX_INVALID``, and a
higher value of ``ROCSHMEM_MAX_NUM_CONTEXTS`` is required.
**Description:**
This routine creates a rocSHMEM context. By design, the context is private to the calling work-group.
It must be called collectively by all threads in the work-group. If the context was created successfully, a value
of zero is returned and the context handle pointed to by ctx specifies a valid context; otherwise, a nonzero value
is returned and ctx is set to ``ROCSHMEM_CTX_INVALID``. An unsuccessful context creation call is not treated as an
error and the rocSHMEM library remains in a correct state. The creation call can be reattempted after additional
resources become available.
ROCSHMEM_CTX_DESTROY
--------------------
.. cpp:function:: __device__ void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx)
:param ctx: Context handle.
:returns: None.
**Description:**
This routine destroys a rocSHMEM context. It must be called collectively by all threads in the work-group.
If ctx has the value ``ROCSHMEM_CTX_INVALID``, no operation is performed.
ROCSHMEM_GET_DEVICE_CTX
-----------------------
.. cpp:function:: __host__ void * rocshmem_get_device_ctx()
:param: None.
:returns: The ``ROCSHMEM_CTX_DEFAULT`` device pointer, which users
can query from one instance of the rocSHMEM host library and
use later for dynamic module initialization in the
kernel bitcode device library in the same application.
**Description:**
This routine queries the rocSHMEM default device context from the host API.
@@ -0,0 +1,96 @@
.. meta::
:description: rocSHMEM environment variables reference
:keywords: rocSHMEM, ROCm, API, environment variables, environment, reference
.. _rocshmem-api-env-variables:
********************************************************************
rocSHMEM environment variables
********************************************************************
This section describes the important environment variables used to
control the behavior of rocSHMEM.
.. list-table::
:header-rows: 1
:widths: 35,14,51
* - **Environment variable**
- **Default value**
- **Value**
* - | ``ROCSHMEM_HEAP_SIZE``
| Defines the size of the rocSHMEM symmetric heap in bytes (per PE).
- ``1073741824`` (1 GB)
- | Size in bytes (per PE).
| Note: the heap is on GPU memory.
* - | ``ROCSHMEM_MAX_NUM_CONTEXTS``
| Defines the number of contexts an application can use.
- ``32``
- Maximum number of contexts.
* - | ``ROCSHMEM_MAX_NUM_TEAMS``
| Defines the number of teams an application can use.
- ``40``
- Maximum number of teams.
* - | ``ROCSHMEM_BACKEND``
| When rocSHMEM is compiled for all backends, this environment variable
| selects which backend to execute. The default value is an empty string and rocSHMEM auto-selects the most appropriate backend.
- `` ``
- | ``ipc``: IPC Backend
| ``ro``: Reverse Offload Backend
| ``gda``: GPU Direct Async Backend
* - | ``ROCSHMEM_UNIQUEID_WITH_MPI``
| Defines whether rocSHMEM is expected to use MPI when using the uniqueId based initialization.
- ``0``
- | ``0``: Do not use MPI.
| ``1``: Use MPI.
* - | ``ROCSHMEM_DISABLE_MIXED_IPC``
| Defines whether to force using the network conduit even when IPC is available.
- ``0``
- | ``0``: Use IPC when available.
| ``1``: Force network conduit.
* - | ``ROCSHMEM_USE_IB_HCA``
| Defines which NIC this PE should be bound to. The default value is an empty string and rocSHMEM auto-detects the most appropriate NIC.
- `` ``
- | Example value: ``bnxt_re0``
* - | ``ROCSHMEM_BOOTSTRAP_SOCKET_IFNAME``
| Chooses the interface to bootstrap rocSHMEM with.
| Only valid when not using MPI.
| The default value is an empty string and rocSHMEM auto-detects the most appropriate interface.
- `` ``
- | Example value: ``eno8303``
* - | ``ROCSHMEM_GDA_PROVIDER``
| When rocSHMEM is compiled with support for multiple NIC vendors,
| the environment variable selects the desired provider.
| The default value is an empty string and rocSHMEM auto-detects the most appropriate NIC.
- `` ``
- | ``bnxt``: Broadcom Thor 2
| ``pensando``: AMD Pensando Pollara
| ``ionic``: AMD Pensando Pollara (alias)
| ``mlx5``: Mellanox ConnectX-7
* - | ``ROCSHMEM_GDA_ALTERNATE_QP_PORTS``
| Enables or disables alternating QP mappings across rocSHMEM contexts.
- ``1``
- | ``0``: Disabled.
| ``1``: Enabled. This helps saturate bandwidth on multiport bonded interfaces.
* - | ``ROCSHMEM_GDA_TRAFFIC_CLASS``
| When using an NIC with an Ethernet link layer, this sets the traffic class for the QPs.
- ``0``
- The traffic class number.
* - | ``ROCSHMEM_GDA_PCIE_RELAXED_ORDERING``
| Enables PCIe Relaxed Ordering when registering the symmetric heap with the RDMA NICs.
- ``0``
- | ``0``: Disabled.
| ``1``: Enabled.
+161
Visa fil
@@ -0,0 +1,161 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-init:
---------------------------------------
Library setup, exit, and query routines
---------------------------------------
ROCSHMEM_INIT
-------------
.. cpp:function:: __host__ void rocshmem_init(void)
:Parameters: None.
:returns: None.
**Description:**
This routine initializes the rocSHMEM library and underlying transport layer.
Before ``rocshmem_init`` is called,
you must select the device that this PE is associated with by calling
`hipSetDevice
<https://rocm.docs.amd.com/projects/HIP/en/docs-6.0.0/doxygen/html/group___device.html#ga43c1e7f15925eeb762195ccb5e063eae>`_.
.. WARNING::
Routine ``rocshmem_wg_init`` has been deprecated.
.. cpp:function:: [[deprecated]] __device__ void rocshmem_wg_init(void)
:Parameters: None.
:returns: None.
**Description:**
This routine has been deprecated, please do not use.
This routine initializes device-side rocSHMEM resources.
It must be called before any threads in this work-group invoke other rocSHMEM functions.
It must be called collectively by all threads in the work-group.
ROCSHMEM_FINALIZE
-----------------
.. cpp:function:: __host__ void rocshmem_finalize(void)
:Parameters: None.
:returns: None.
**Description:**
This routine finalizes the rocSHMEM library.
.. WARNING::
Routine ``rocshmem_wg_finalize`` has been deprecated.
.. cpp:function:: [[deprecated]] __device__ void rocshmem_wg_finalize(void)
:Parameters: None.
:returns: None.
**Description:**
This routine has been deprecated, please do not use.
This routine finalizes device-side rocSHMEM resources.
It must be called before work-group completion if the work-group also called ``rocshmem_wg_init``.
It must be called collectively by all threads in the work-group.
ROCSHMEM_INIT_ATTR
------------------
.. cpp:function:: __host__ int rocshmem_init_attr(unsigned int flags, rocshmem_init_attr_t *attr)
:param flags: The initialization method to be used.
:param attr: Attribute structure specifying input characteristics.
:returns: Returns ``0`` on success; otherwise, returns a nonzero value.
**Description:**
This routine initializes the rocSHMEM runtime and underlying transport layer using
the provided mode and attributes.
The parameter ``flags`` can be either
``ROCSHMEM_INIT_WITH_UNIQUEID`` or ``ROCSHMEM_INIT_WITH_MPI_COMM``.
ROCSHMEM_GET_UNIQUEID
---------------------
.. cpp:function:: __host__ int rocshmem_get_uniqueid(rocshmem_uniqueid_t *uid)
:param uid: Pointer to a unique ID handle.
:returns: Returns ``0`` on success; otherwise, returns a nonzero value.
**Description:**
This routine returns a unique ID.
ROCSHMEM_SET_ATTR_UNIQUEID_ARGS
-------------------------------
.. cpp:function:: __host__ int rocshmem_set_attr_uniqueid_args(int rank, int nranks, rocshmem_uniqueid_t *uid, rocshmem_init_attr_t *attr)
:param rank: Rank of the calling process.
:param nranks: Number of PEs.
:param uid: Unique ID used to identify the group of processes.
:param attr: Attribute structure to be passed to ``rocshmem_init_attr``.
:returns: Returns ``0`` on success; otherwise, returns a nonzero value.
**Description:**
This routine initializes the ``rocshmem_init_attr_t`` struct.
ROCSHMEM_N_PES
--------------
.. cpp:function:: __host__ int rocshmem_n_pes(void)
:Parameters: None.
:returns: Total number of PEs.
**Description:**
This routine queries the total number of PEs.
It can be called before ``rocshmem_init``.
.. cpp:function:: __device__ int rocshmem_n_pes(void)
.. cpp:function:: __device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx)
:param ctx: GPU side context handle.
:returns: Total number of PEs.
**Description:**
This routine queries the total number of PEs for a given context.
It can be called per thread with no performance penalty.
ROCSHMEM_MY_PE
--------------
.. cpp:function:: __host__ int rocshmem_my_pe(void)
:Parameters: None.
:returns: PE ID of the caller.
**Description:**
This routine queries the PE ID of the caller.
It can be called before ``rocshmem_init``.
.. cpp:function:: __device__ int rocshmem_my_pe(void)
.. cpp:function:: __device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx)
:param ctx: GPU side context handle.
:returns: PE ID of the caller.
**Description:**
This routine queries the PE ID of the caller.
It can be called per thread with no performance penalty.
ROCSHMEM_PTR
--------------
.. cpp:function:: __host__ void* rocshmem_ptr(const void *dest, int pe);
.. cpp:function:: __device__ void* rocshmem_ptr(const void *dest, int pe);
:param dest: Local symmetric heap allocation pointer for current PE.
:param pe: Remote PE.
:returns: Returns remote symmetric heap device pointer from host-side API.
``NULL`` is returned if a valid device pointer cannot be provided.
This pointer can be used to issue load/store from custom kernels
instead of using rocshmem device side get/put APIs for RMA operations.
**Description:**
This routine queries rocSHMEM remote symmetric heap pointer.
@@ -0,0 +1,35 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-memory-management:
---------------------------
Memory management routines
---------------------------
ROCSHMEM_MALLOC
---------------
.. cpp:function:: __host__ void *rocshmem_malloc(size_t size)
:param size: Memory allocation size in bytes.
:returns: A pointer to the allocated memory on the symmetric heap.
If a valid allocation cannot be made, it returns ``NULL``.
**Description:**
This routine allocates memory of ``size`` bytes from the symmetric heap.
This is a collective operation and must be called by all PEs.
ROCSHMEM_FREE
-------------
.. cpp:function:: __host__ void rocshmem_free(void *ptr)
:param ptr: A pointer to previously allocated memory on the symmetric heap.
:returns: None.
**Description:**
This routine frees a memory allocation from the symmetric heap.
It is a collective operation and must be called by all PEs.
@@ -0,0 +1,51 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-memory-ordering:
---------------------------
Memory ordering routines
---------------------------
ROCSHMEM_FENCE
--------------
.. cpp:function:: __device__ void rocshmem_fence()
.. cpp:function:: __device__ void rocshmem_fence(int pe)
.. cpp:function:: __device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx)
.. cpp:function:: __device__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe)
:param ctx: Context with which to perform this operation.
:param pe: Destination ``pe``.
:returns: None.
**Description:**
This routine ensures order between messages in this context to follow OpenSHMEM semantics.
ROCSHMEM_QUIET
--------------
.. cpp:function:: __device__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx)
.. cpp:function:: __device__ void rocshmem_quiet()
:param ctx: Context with which to perform this operation.
:returns: None.
**Description:**
This routine completes all previous operations posted to this context.
ROCSHMEM_PE_QUIET
-----------------
.. cpp:function:: __device__ void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes)
.. cpp:function:: __device__ void rocshmem_pe_quiet(const int *target_pes, size_t npes)
:param ctx: Context with which to perform this operation.
:param target_pes: Address of the target PE array for which the operations need to be completed.
:param npes: The number of PEs in the target PE array.
:returns: None.
**Description:**
This routine completes all previous operations posted to this context
for the PEs in the ``target_pes`` array.
+142
Visa fil
@@ -0,0 +1,142 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-pt2pt-sync:
-----------------------------------------
Point-to-point synchronization routines
-----------------------------------------
ROCSHMEM_WAIT_UNTIL
-------------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_wait_until(TYPE *ivars, int cmp, TYPE val)
:param ivars: Pointer to memory on the symmetric heap to wait for.
:param cmp: Operation for the comparison.
:param val: Value to compare the memory at ``ivars`` to.
:returns: None.
**Description:**
This routine blocks the caller until the condition ``(*ivars cmp val)`` is true.
Valid ``cmp`` values are listed in :ref:`CMP_VALUES`.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`STANDARD_AMO_TYPES`.
ROCSHMEM_WAIT_UNTIL_ALL
-----------------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_wait_until_all(TYPE *ivars, size_t nelems, const int* status, int cmp, TYPE val)
:param ivars: Pointer to memory on the symmetric heap to wait for.
:param nelems: Number of elements in the ``ivars`` array.
:param status: Array of length ``nelems`` to exclude elements from the wait.
:param cmp: Operation for the comparison.
:param val: Value to compare.
:returns: None.
**Description:**
This routine blocks the caller until the condition ``(ivars[i] cmp val)`` is true for all ``ivars``.
Valid ``cmp`` values are listed in :ref:`CMP_VALUES`.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`STANDARD_AMO_TYPES`.
ROCSHMEM_WAIT_UNTIL_ANY
-----------------------
.. cpp:function:: __device__ size_t rocshmem_TYPENAME_wait_until_any(TYPE *ivars, size_t nelems, const int* status, int cmp, TYPE val)
:param ivars: Pointer to memory on the symmetric heap to wait for.
:param nelems: Number of elements in the ``ivars`` array.
:param status: Array of length ``nelems`` to exclude elements from the wait.
:param cmp: Operation for the comparison.
:param val: Value to compare.
:returns: The index of an element in the ``ivars`` array that satisfies the wait condition. If the wait set is empty, this routine returns ``SIZE_MAX``.
**Description:**
This routine blocks the caller until any of the conditions ``(ivars[i] cmp val)`` is true.
Valid ``cmp`` values are listed in :ref:`CMP_VALUES`.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`STANDARD_AMO_TYPES`.
ROCSHMEM_WAIT_UNTIL_SOME
------------------------
.. cpp:function:: __device__ size_t rocshmem_TYPENAME_wait_until_some(TYPE *ivars, size_t nelems, size_t* indices, const int* status, int cmp, TYPE val)
:param ivars: Pointer to memory on the symmetric heap to wait for.
:param nelems: Number of elements in the ``ivars`` array.
:param indices: List of indices with a length of at least ``nelems``.
:param status: Array of length ``nelems`` to exclude elements from the wait.
:param cmp: Operation for the comparison.
:param val: Value to compare.
:returns: The number of indices returned in the indices array. If the wait set is empty, this routine returns ``0``.
**Description:**
This routine blocks the caller until any of the conditions ``(ivars[i] cmp val)`` is true.
Valid ``cmp`` values are listed in :ref:`CMP_VALUES`.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`STANDARD_AMO_TYPES`.
ROCSHMEM_TEST
-------------
.. cpp:function:: __device__ int rocshmem_TYPENAME_test(TYPE *ivars, int cmp, TYPE val)
:param ivars: Pointer to memory on the symmetric heap to test.
:param cmp: Operation for the comparison.
:param val: Value to compare the memory at ``ivars`` to.
:returns: ``1`` if the evaluation is true. ``0`` otherwise.
**Description:**
This routine tests if the condition ``(*ivars cmp val)`` is true.
ROCSHMEM_SIGNAL_WAIT_UNTIL_ON_STREAM
-------------------------------------
.. cpp:function:: __host__ void rocshmem_signal_wait_until_on_stream(uint64_t *sig_addr, int cmp, uint64_t cmp_value, hipStream_t stream)
:param sig_addr: Address of the signal variable on the symmetric heap.
:param cmp: Comparison operator (e.g., ROCSHMEM_CMP_EQ, ROCSHMEM_CMP_GE, etc.).
:param cmp_value: Value to compare against.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a wait operation on a HIP stream. The enqueued operation waits
until the signal variable at ``sig_addr`` satisfies the comparison condition ``(*sig_addr cmp cmp_value)``.
The wait operation is executed asynchronously on the specified stream. The caller must synchronize
the stream (e.g., using ``hipStreamSynchronize``) to ensure the wait condition has been satisfied.
Valid ``cmp`` values are listed in :ref:`CMP_VALUES`.
.. _CMP_VALUES:
Supported comparisons
---------------------
The following table lists the point-to-point comparison constants:
.. list-table:: Point-to-Point Comparison Constants
:widths: 20 20
:header-rows: 1
* - Constant
- Description
* - ROCSHMEM_CMP_EQ
- Equal
* - ROCSHMEM_CMP_NE
- Not equal
* - ROCSHMEM_CMP_GT
- Greater than
* - ROCSHMEM_CMP_GE
- Greater than or equal to
* - ROCSHMEM_CMP_LT
- Less than
* - ROCSHMEM_CMP_LE
- Less than or equal to
+278
Visa fil
@@ -0,0 +1,278 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-rma:
-----------------------------------------
Remote memory access routines
-----------------------------------------
- Routines with the ``_wave`` and ``_wg`` suffixes require all threads in a wavefront and workgroup, respectively,
to call the routine with the same parameters.
- Routines with the ``_nbi`` substring will return as soon as the request is posted.
- Routines without the ``_nbi`` substring will block until the operation completes locally.
- Valid ``TYPENAME`` and ``TYPE`` values can be found in RMA_TYPES_.
ROCSHMEM_PUT
------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_put(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_wave(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_wg(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_nbi(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_nbi_wave(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_nbi_wg(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_nbi(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_nbi_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_nbi_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: The number of elements to transfer.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This routine writes contiguous data of ``nelems`` elements from source on the calling PE to ``dest`` at ``pe``.
ROCSHMEM_PUTMEM
---------------
.. cpp:function:: __device__ void rocshmem_putmem(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_wave(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_wg(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_nbi(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: Size of the transfer in bytes.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This routine writes contiguous data of ``nelems`` bytes from source on the calling PE to ``dest`` at ``pe``.
ROCSHMEM_PUTMEM_ON_STREAM
--------------------------
.. cpp:function:: __host__ void rocshmem_putmem_on_stream(void *dest, const void *source, size_t nelems, int pe, hipStream_t stream)
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: Size of the transfer in bytes.
:param pe: PE of the remote process.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a putmem RMA operation on a HIP stream. The function writes contiguous
data of ``nelems`` bytes from source on the calling PE to ``dest`` at ``pe``. The operation
is enqueued on the specified stream and will execute asynchronously. The caller must
synchronize the stream (e.g., using ``hipStreamSynchronize``) to ensure completion.
ROCSHMEM_P
----------
.. cpp:function:: __device__ void rocshmem_TYPENAME_p(TYPE *dest, TYPE value, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_p(rocshmem_ctx_t ctx, TYPE *dest, TYPE value, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param value: Value to write to ``dest`` at ``pe``.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This routine writes a single value to ``dest`` at ``pe``.
ROCSHMEM_GET
------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_get(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_get_wave(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_get_wg(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_get_nbi(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_get_nbi_wave(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_get_nbi_wg(TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get_nbi(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get_nbi_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_get_nbi_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: The number of elements to transfer.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This routine reads contiguous data of ``nelems`` elements from source on ``pe`` to ``dest`` on the calling PE.
ROCSHMEM_GETMEM
---------------
.. cpp:function:: __device__ void rocshmem_getmem(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_getmem_wave(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_getmem_wg(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_getmem_nbi(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_getmem_nbi_wave(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_getmem_nbi_wg(void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem_nbi_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_getmem_nbi_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: Size of the transfer in bytes.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This routine reads contiguous data of ``nelems`` bytes from source on ``pe`` to ``dest`` on the calling PE.
ROCSHMEM_GETMEM_ON_STREAM
--------------------------
.. cpp:function:: __host__ void rocshmem_getmem_on_stream(void *dest, const void *source, size_t nelems, int pe, hipStream_t stream)
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: Size of the transfer in bytes.
:param pe: PE of the remote process.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a getmem RMA operation on a HIP stream. The function reads contiguous
data of ``nelems`` bytes from source on ``pe`` to ``dest`` on the calling PE. The operation
is enqueued on the specified stream and will execute asynchronously. The caller must
synchronize the stream (e.g., using ``hipStreamSynchronize``) to ensure completion.
ROCSHMEM_G
----------
.. cpp:function:: __device__ float rocshmem_ctx_float_g(rocshmem_ctx_t ctx, const float *source, int pe)
.. cpp:function:: __device__ float rocshmem_float_g(const float *source, int pe)
:param ctx: Context with which to perform this operation.
:param source: Source address. Must be an address on the symmetric heap.
:param pe: PE of the remote process.
:returns: The value read from source at ``pe``.
**Description:**
This routine reads and returns a single value from source at ``pe``.
Supported RMA data types
------------------------
The following table lists the supported RMA data types:
.. _RMA_TYPES:
.. list-table:: RMA Data Types
:widths: 10 20 20
:header-rows: 1
* - TYPE
- TYPENAME
- Supported
* - float
- float
- Yes
* - double
- double
- Yes
* - long double
- longdouble
- No
* - char
- char
- Yes
* - signed char
- schar
- Yes
* - short
- short
- Yes
* - int
- int
- Yes
* - long
- long
- Yes
* - long long
- longlong
- Yes
* - unsigned char
- uchar
- Yes
* - unsigned short
- ushort
- Yes
* - unsigned int
- uint
- Yes
* - unsigned long
- ulong
- Yes
* - unsigned long long
- ulonglong
- Yes
* - int8_t
- int8
- No
* - int16_t
- int16
- No
* - int32_t
- int32
- No
* - int64_t
- int64
- Yes
* - uint8_t
- uint8
- No
* - uint16_t
- uint16
- No
* - uint32_t
- uint32
- No
* - uint64_t
- uint64
- No
* - size_t
- size
- No
* - ptrdiff_t
- ptrdiff
- No
+125
Visa fil
@@ -0,0 +1,125 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-sigops:
---------------------
Signaling operations
---------------------
ROCSHMEM_PUTMEM_SIGNAL
----------------------
.. cpp:function:: __device__ void rocshmem_putmem_signal(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_signal_wave(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_signal_wg(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_signal_nbi(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_signal_nbi_wave(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_putmem_signal_nbi_wg(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal_nbi(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal_nbi_wave(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_putmem_signal_nbi_wg(rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: The number of bytes to transfer.
:param sig_addr: Signal address. Must be an address on the symmetric heap.
:param signal: Signal value.
:param sig_op: Atomic operation to apply the signal value.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This function writes contiguous data of ``nelems`` bytes from source on the calling PE to ``dest`` at ``pe``,
then applies ``sig_op`` at ``sig_addr`` with the signal value.
Valid ``sig_op`` values are listed in SIGNAL_OPERATORS_.
ROCSHMEM_PUT_SIGNAL
-------------------
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal_wave(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal_wg(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal_nbi(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal_nbi_wave(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_TYPENAME_put_signal_nbi_wg(TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal_nbi(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal_nbi_wave(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
.. cpp:function:: __device__ void rocshmem_ctx_TYPENAME_put_signal_nbi_wg(rocshmem_ctx_t ctx, TYPE *dest, const TYPE *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe)
:param ctx: Context with which to perform this operation.
:param dest: Destination address. Must be an address on the symmetric heap.
:param source: Source address. Must be an address on the symmetric heap.
:param nelems: The number of elements of size ``TYPE`` to transfer.
:param sig_addr: Signal address. Must be an address on the symmetric heap.
:param signal: Signal value.
:param sig_op: Atomic operation to apply the signal value.
:param pe: PE of the remote process.
:returns: None.
**Description:**
This function writes contiguous data of ``nelems`` elements of ``TYPE`` from source on the calling PE to ``dest`` at ``pe``,
then applies ``sig_op`` at ``sig_addr`` with the signal value.
Valid ``sig_op`` values are listed in SIGNAL_OPERATORS_.
Valid ``TYPENAME`` and ``TYPE`` values are listed in :ref:`RMA_TYPES`.
ROCSHMEM_PUTMEM_SIGNAL_ON_STREAM
---------------------------------
.. cpp:function:: __host__ void rocshmem_putmem_signal_on_stream(void *dest, const void *source, size_t nelems, uint64_t *sig_addr, uint64_t signal, int sig_op, int pe, hipStream_t stream)
:param dest: Destination address on the remote PE. Must be an address on the symmetric heap.
:param source: Source address on the local PE. Must be an address on the symmetric heap.
:param nelems: Size of the transfer in bytes.
:param sig_addr: Address of signal variable on the remote PE. Must be an address on the symmetric heap.
:param signal: Signal value to be written.
:param sig_op: Signal operation (ROCSHMEM_SIGNAL_SET or ROCSHMEM_SIGNAL_ADD).
:param pe: PE number of the remote PE.
:param stream: HIP stream on which to enqueue the operation.
:returns: None.
**Description:**
This routine enqueues a put-with-signal operation on a HIP stream. The function writes contiguous
data of ``nelems`` bytes from source on the calling PE to ``dest`` at ``pe``, then applies ``sig_op``
at ``sig_addr`` with the signal value. The operation is enqueued on the specified stream and will
execute asynchronously. The caller must synchronize the stream (e.g., using ``hipStreamSynchronize``)
to ensure completion.
Valid ``sig_op`` values are listed in SIGNAL_OPERATORS_.
ROCSHMEM_SIGNAL_FETCH
---------------------
.. cpp:function:: __device__ uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr)
.. cpp:function:: __device__ uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr)
.. cpp:function:: __device__ uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr)
:param sig_addr: Signal address. Must be an address on the symmetric heap.
:returns: Value at ``sig_addr``.
**Description:**
This function atomically fetches the value stored at ``sig_addr``.
Signal operators
----------------
.. _SIGNAL_OPERATORS:
.. list-table:: Signal Operators
:widths: 20 40
:header-rows: 1
* - Value
- Description
* - ROCSHMEM_SIGNAL_SET
- The signaling operation routines will atomically set the signal value at ``sig_addr``.
* - ROCSHMEM_SIGNAL_ADD
- The signaling operation routines will atomically add the signal value at ``sig_addr``.
+90
Visa fil
@@ -0,0 +1,90 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD dGPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-api-teams:
-------------------------
Team management routines
-------------------------
ROCSHMEM_TEAM_MY_PE
-------------------
.. cpp:function:: __host__ int rocshmem_team_my_pe(rocshmem_team_t team)
:param team: The team to query.
:returns: PE ID of the caller in the provided team.
**Description:**
This routine queries the PE ID of the caller in a team.
ROCSHMEM_TEAM_N_PES
-------------------
.. cpp:function:: __host__ int rocshmem_team_n_pes(rocshmem_team_t team)
:param team: The team to query.
:returns: Number of PEs in the provided team.
**Description:**
This routine queries the number of PEs in a team.
ROCSHMEM_TEAM_TRANSLATE_PE
--------------------------
.. cpp:function:: __host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe, rocshmem_team_t dest_team)
:param src_team: Handle of the team from which to translate.
:param src_pe: PE-of-interest's index in ``src_team``.
:param dest_team: Handle of the team to which to translate.
:returns: PE of ``src_pe`` in ``dest_team``.
If any input is invalid or if ``src_pe`` is
not in both source and destination teams, a value of ``-1`` is returned.
**Description:**
This routine translates the PE in ``src_team`` to that in ``dest_team``.
ROCSHMEM_TEAM_SPLIT_STRIDED
---------------------------
.. cpp:function:: __host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team, int start, int stride, int size, const rocshmem_team_config_t *config, long config_mask, rocshmem_team_t *new_team)
:param parent_team: The team to split from.
:param start: The lowest PE number of the subset of the PEs
from the parent team that will form the new
team.
:param stride: The stride between team PE members in the
parent team that comprise the subset of PEs
that will form the new team.
:param size: The number of PEs in the new team.
:param config: Pointer to the config parameters for the new team.
:param config_mask: Bitwise mask representing parameters to use from config.
:param new_team: Pointer to the newly created team.
If an error occurs during team creation, or if the PE in
the parent team is not in the new team, the value will be
``ROCSHMEM_TEAM_INVALID``.
:returns: Zero upon successful team creation; non-zero if erroneous.
**Description:**
This routine creates a new team of PEs. It must be called by all PEs in the parent team.
ROCSHMEM_TEAM_DESTROY
---------------------
.. cpp:function:: __host__ void rocshmem_team_destroy(rocshmem_team_t team)
:param team: The team to destroy. The behavior is undefined if
the input team is ``ROCSHMEM_TEAM_WORLD`` or any other
invalid team. If the input is ``ROCSHMEM_TEAM_INVALID``,
this function will not perform any operation.
:returns: None.
**Description:**
This routine destroys a team. It must be called by all PEs in the team.
You must destroy all private contexts created in the
team before destroying this team. Otherwise, the behavior
is undefined. This call will destroy only the shareable contexts
created from the referenced team.
+60
Visa fil
@@ -0,0 +1,60 @@
.. meta::
:description: Information on how to compile and run rocSHMEM applications.
:keywords: rocSHMEM, ROCm, library, API, compile, link, hipcc
.. _running-applications:
--------------------------------------------------
Compiling and running rocSHMEM applications
--------------------------------------------------
This topic explains how to compile and run rocSHMEM applications.
Compiling and linking with rocSHMEM
-----------------------------------
rocSHMEM is a library that can be statically linked to your application during compilation with ``hipcc``. For more information, see :doc:`HIPCC <hipcc:index>`.
When compiling your application with ``hipcc``, you must include the rocSHMEM header files and the rocSHMEM library.
Because rocSHMEM depends on MPI (Message Passing Interface), you must manually add the arguments for MPI linkage instead of using ``mpicc``.
When using ``hipcc`` directly without a build system, it's recommended to perform the compilation and linking steps separately.
Example compile and link commands are provided at the top of the example files in the ``examples`` directory:
.. code-block:: bash
# Compile
hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
# Link
hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
If your project uses CMake, see
`Using CMake with AMD ROCm <https://rocmdocs.amd.com/en/latest/conceptual/cmake-packages.html>`_.
Running a rocSHMEM application
------------------------------
Applications using rocSHMEM typically deploy multiple processes, usually one per GPU.
The MPI launcher, for example, ``mpiexec`` with Open MPI, is used to start the required number
of processes. For example, to launch two ``getmem`` example processes (available when compiled from source):
.. code-block:: bash
mpiexec --map-by numa --mca pml ucx --mca osc ucx -np 2 ./build/examples/rocshmem_getmem_test
See the `Open MPI documentation <https://docs.open-mpi.org/en/main/>`_ for more information about ``mpiexec`` command line parameters.
.. note::
Some systems may have multiple MPI installations, some of which do not
have GPU support enabled. You must use the ``mpiexec`` from the expected
MPI library, especially when using the MPI built by yourself
as part of :ref:`install-dependencies`.
+36
Visa fil
@@ -0,0 +1,36 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import re

from rocm_docs import ROCmDocs

# Read the public header once; the documentation title is derived from the
# VERSION string it declares.
with open('../include/rocshmem/rocshmem.hpp', encoding='utf-8') as f:
    header_text = f.read()

# The capture group holds the dotted numeric part of VERSION; the pattern also
# requires at least one following character that is neither a digit nor a dot.
match = re.search(r'constexpr char VERSION\[\] = "([0-9.]+)[^0-9.]+', header_text)
if match is None:
    raise ValueError("VERSION not found!")
version_number = match.group(1)
left_nav_title = f"rocSHMEM {version_number} documentation"

# for PDF output on Read the Docs
project = "rocSHMEM"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number

external_toc_path = "./sphinx/_toc.yml"

# Build the shared ROCm documentation configuration. Doxygen is run before
# setup(), in the same order as the original script.
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.setup()

external_projects_current_project = "rocshmem"
# Lets the C++ domain accept these HIP qualifiers in cpp:function signatures.
cpp_id_attributes = ["__host__", "__global__", "__device__"]
exclude_patterns = ["README.md"]

# Re-export every variable listed in ROCmDocs.SPHINX_VARS into this module's
# namespace so Sphinx picks it up as a configuration value.
for var_name in ROCmDocs.SPHINX_VARS:
    globals()[var_name] = getattr(docs_core, var_name)
Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff
+42
Visa fil
@@ -0,0 +1,42 @@
.. meta::
:description: rocSHMEM is a runtime that provides GPU-centric networking through an OpenSHMEM-like interface.
:keywords: rocSHMEM, ROCm, OpenSHMEM, library, API, IPC, RO
****************************
rocSHMEM documentation
****************************
The ROCm OpenSHMEM (rocSHMEM) is an intra-kernel networking library that provides GPU-centric networking through an OpenSHMEM-like interface. It simplifies application code complexity and enables finer communication and computation overlap than traditional host-driven networking. rocSHMEM uses a single symmetric heap allocated to GPU memories. For more information, see :doc:`introduction`.
The rocSHMEM public repository is located at `<https://github.com/ROCm/rocSHMEM>`_.
.. grid:: 2
:gutter: 3
.. grid-item-card:: Install
* :doc:`Install rocSHMEM <./install>`
.. grid-item-card:: How to
* :doc:`Compile and run applications <./compile_and_run>`
.. grid-item-card:: API reference
* :doc:`Library setup, exit, and query routines <./api/init>`
* :doc:`Memory management routines <./api/memory_management>`
* :doc:`Team management routines <./api/teams>`
* :doc:`Context management routines <./api/ctx>`
* :doc:`Environment variables <./api/env_variables>`
* :doc:`Remote memory access routines <./api/rma>`
* :doc:`Atomic memory operations <./api/amo>`
* :doc:`Signaling operations <./api/sigops>`
* :doc:`Collective routines <./api/coll>`
* :doc:`Point-to-point synchronization routines <./api/pt2pt_sync>`
* :doc:`Memory ordering routines <./api/memory_ordering>`
To contribute to the documentation, refer to
`Contributing to ROCm <https://rocm.docs.amd.com/en/latest/contribute/contributing.html>`_.
You can find licensing information on the
`Licensing <https://rocm.docs.amd.com/en/latest/about/license.html>`_ page.
+222
Visa fil
@@ -0,0 +1,222 @@
.. meta::
:description: Instructions on how to install rocSHMEM.
:keywords: rocSHMEM, ROCm, install, build, dependencies, MPI, UCX, Open MPI
.. _install-rocshmem:
---------------------------
Installing rocSHMEM
---------------------------
This topic describes how to install rocSHMEM.
Requirements
------------
* ROCm 6.4.0 or later, including the :doc:`HIP runtime <hip:index>`. For more information, see `ROCm installation for Linux <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/>`_.
* The following AMD GPUs have been fully tested for compatibility with rocSHMEM:
* MI250X
* MI300X
* MI350X (Requires ROCm 7.0 or later)
.. note::
Other AMD GPUs might function with unknown limitations. For the complete list of supported hardware, see `ROCm System Requirements <https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html>`_.
* The RO backend requires ROCm-aware Open MPI and UCX. When using the IPC or GDA backends, MPI is optional.
For more information about installing ROCm-aware Open MPI and UCX, see :ref:`install-dependencies`.
* Inter-node communication requires AMD Pollara IONIC, Broadcom Thor 2, or CX7 InfiniBand NICs.
Available network backends
--------------------------
rocSHMEM supports the following network backends:
* The **IPC (Inter-Process Communication)** backend enables fast communication between GPUs on the same host using ROCm inter-process mechanisms. It does not support inter-node communication.
* The **RO (Reverse Offload)** backend enables communication between GPUs on different nodes through a NIC, using a host-based proxy to forward communication orders to and from the GPU. RO is built on an MPI-RMA compatibility layer.
* The **GDA (GPU Direct Async)** backend enables communication between GPUs on different nodes through a NIC. In this backend, the GPU directly interacts with the NIC with no host (CPU) involvement in the critical path of communication.
You can activate IPC, RO, and GDA backends in the same rocSHMEM build.
.. note::
When RO + IPC is active, all atomic operations use the RO backend, even for intra-node communication.
When GDA + IPC is active, all atomic operations use the GDA backend, even for intra-node communication.
Installing from a package manager
---------------------------------
On Ubuntu, you can install rocSHMEM by running:
.. code-block:: bash
apt install rocshmem-dev
.. note::
This installation method requires ROCm 6.4 or later. You must manually build dependencies such as Open MPI and UCX, because the distribution packaged versions don't include full accelerator support. For more information, see :ref:`install-dependencies`.
.. _install-dependencies:
Building dependencies
---------------------
GDA NIC dependencies
^^^^^^^^^^^^^^^^^^^^
- GDA on Mellanox NICs should work on any recent version of rdma-core.
- GDA on Broadcom Thor requires driver version 233.2.108.0 and firmware version 233.2.104.0 or later.
Building rocSHMEM with MPI (Optional)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
rocSHMEM requires ROCm-Aware Open MPI and UCX for the RO backend.
MPI is optional with the IPC and GDA backends.
Other MPI implementations, such as MPICH, have not been fully tested.
To build and configure ROCm-Aware UCX 1.17.0 or later, run:
.. code-block:: bash
git clone https://github.com/ROCm/ucx.git -b v1.17.x
cd ucx
./autogen.sh
./configure --prefix=<prefix_dir> --with-rocm=<rocm_path> --enable-mt
make -j 8
make -j 8 install
To build Open MPI 5.0.7 or later with UCX support, run:
.. code-block:: bash
git clone --recursive https://github.com/open-mpi/ompi.git -b v5.0.x
cd ompi
./autogen.pl
./configure --prefix=<prefix_dir> --with-rocm=<rocm_path> --with-ucx=<ucx_path>
make -j 8
make -j 8 install
Alternatively, you can use a script to install dependencies:
.. code-block:: bash
export BUILD_DIR=/path/to/not_rocshmem_src_or_build/dependencies
/path/to/rocshmem_src/scripts/install_dependencies.sh
.. note::
Configuration options vary by platform. Review the script to ensure it is compatible with your system.
For more information about OpenMPI-UCX support, see
`GPU-enabled Message Passing Interface <https://rocm.docs.amd.com/en/latest/how-to/gpu-enabled-mpi.html>`_.
Installing from source
--------------------------------
You can choose from three communication backends at build time for rocSHMEM: IPC, RO, and GDA.
Backends can be combined at build time.
MPI is not required to build rocSHMEM. To disable MPI, pass
the following flag to the build configuration scripts ``-DUSE_EXTERNAL_MPI=OFF``.
However, this will disable the functional and unit
tests, as they require MPI to run.
All backends build
^^^^^^^^^^^^^^^^^^
To build and install rocSHMEM with all three backends, run:
.. code-block:: bash
git clone git@github.com:ROCm/rocSHMEM.git
cd rocSHMEM
mkdir build
cd build
../scripts/build_configs/all_backends
The build script passes configuration options to CMake to set up a canonical build.
.. note::
This builds rocSHMEM with all backends. You can select IPC, RO, GDA, or any combination at runtime. However, this portability can reduce performance, so the other build scripts are recommended if you need maximum performance.
GDA backend build
^^^^^^^^^^^^^^^^^
To build and install rocSHMEM with the GDA backend, run:
.. code-block:: bash
git clone git@github.com:ROCm/rocSHMEM.git
cd rocSHMEM
mkdir build
cd build
# Choose one of the following scripts for your NIC vendor:
../scripts/build_configs/gda_bnxt # Broadcom
../scripts/build_configs/gda_ionic # AMD Pollara
../scripts/build_configs/gda_mlx5 # Mellanox
The build script passes configuration options to CMake to set up a canonical build.
RO and IPC backend build
^^^^^^^^^^^^^^^^^^^^^^^^
To build and install rocSHMEM with the hybrid RO (off-node) and IPC (on-node) backends, run:
.. code-block:: bash
git clone git@github.com:ROCm/rocSHMEM.git
cd rocSHMEM
mkdir build
cd build
../scripts/build_configs/ro_ipc
The build script passes configuration options to CMake to set up a canonical build.
.. note::
The only officially supported configuration for the RO backend uses Open MPI and UCX with a CX7 InfiniBand adapter. For more information, see :ref:`install-dependencies`. Other configurations, such as MPI implementations that are thread-safe and support GPU buffers, might work but are considered experimental.
IPC only backend build
^^^^^^^^^^^^^^^^^^^^^^
To build and install rocSHMEM with the IPC on-node, GPU-to-GPU backend, run:
.. code-block:: bash
git clone git@github.com:ROCm/rocSHMEM.git
cd rocSHMEM
mkdir build
cd build
../scripts/build_configs/ipc_single
The build script passes configuration options to CMake to set up a single-node build.
This is similar to the default build in ROCm 6.4.
.. note::
The default configuration changed from IPC only in ROCm 6.4 (built with the ``ipc_single`` script) to RO and IPC in ROCm 7.0 (built with the ``ro_ipc`` script).
Other experimental configuration scripts are available in ``./scripts/build_configs``, but only ``ipc_single`` and ``ro_ipc``
are officially supported.
Installation prefix
^^^^^^^^^^^^^^^^^^^
By default, the build scripts install the library to ``~/rocshmem``. You can customize the installation path by adding
the desired path as the script parameter. For example, to relocate the default configuration:
.. code-block:: bash
../scripts/build_configs/ro_ipc /path/to/install
+45
Visa fil
@@ -0,0 +1,45 @@
.. meta::
:description: rocSHMEM intra-kernel networking runtime for AMD GPUs on the ROCm platform.
:keywords: rocSHMEM, API, ROCm, documentation, HIP, Networking, Communication
.. _rocshmem-introduction:
---------------------------
What is rocSHMEM?
---------------------------
The ROCm OpenSHMEM (rocSHMEM) is an intra-kernel networking library that provides GPU-centric networking through an OpenSHMEM-like interface. It simplifies application code complexity and enables finer communication and computation overlap than traditional host-driven networking. rocSHMEM uses a single symmetric heap allocated on GPU memories.
The rocSHMEM programming model
-------------------------------
Defining how OpenSHMEM applications interact with GPUs remains an
ongoing active discussion within the OpenSHMEM community, and the OpenSHMEM
specification has yet to coalesce on this topic.
rocSHMEM extends beyond the OpenSHMEM specification to add semantics that
support GPU kernel communication while maintaining close resemblance to
the original OpenSHMEM specification semantics.
Applications using :doc:`HIP <hip:index>` can interface with rocSHMEM.
Using the HIP programming model,
rocSHMEM provides ``__host__`` APIs for host code,
and ``__device__`` APIs for GPU kernels.
Device APIs without special suffixes or infixes, for example, ``_wg`` or ``_wave``,
must be called by a single thread.
GPU specific ``_wg`` and ``_wave`` APIs are designed to be called by multiple GPU threads
and will block until the calling scope completes.
These APIs can be called in divergent code paths, but this is not recommended.
Wavefront APIs
==============
Wavefront APIs are those with the ``_wave`` suffix.
The parameters with which these routines are called must be
the same for every thread in the wavefront.
The behavior is undefined if any thread calls these routines with different parameters. These APIs will block until the calling wavefront is complete.
Workgroup APIs
==============
The workgroup APIs have the ``_wg`` suffix or ``_wg_`` infix.
The parameters with which these routines are called must be
the same for every thread in the workgroup.
The behavior is undefined if any thread calls these routines with different parameters. These APIs will block until the calling workgroup is complete.
+4
Visa fil
@@ -0,0 +1,4 @@
# License
```{include} ../LICENSE.md
```
+48
Visa fil
@@ -0,0 +1,48 @@
defaults:
numbered: False
root: index
subtrees:
- entries:
- file: introduction.rst
title: What is rocSHMEM?
- caption: Install
entries:
- file: install.rst
title: Install rocSHMEM
- caption: How to
entries:
- file: compile_and_run.rst
title: Compile and run applications
- caption: API reference
entries:
- file: api/init.rst
title: Library setup, exit, and query routines
- file: api/memory_management.rst
title: Memory management routines
- file: api/teams.rst
title: Team management routines
- file: api/ctx.rst
title: Context management routines
- file: api/env_variables.rst
title: Environment variables
- file: api/rma.rst
title: Remote memory access routines
- file: api/amo.rst
title: Atomic memory operations
- file: api/sigops.rst
title: Signaling operations
- file: api/coll.rst
title: Collective routines
- file: api/pt2pt_sync.rst
title: Point-to-point synchronization routines
- file: api/memory_ordering.rst
title: Memory ordering routines
- caption: About
entries:
- file: license.rst
@@ -0,0 +1,2 @@
rocm-docs-core==1.31.2
@@ -0,0 +1,278 @@
#
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile requirements.in
#
accessible-pygments==0.0.5
# via pydata-sphinx-theme
alabaster==1.0.0
# via sphinx
asttokens==3.0.0
# via stack-data
attrs==25.3.0
# via
# jsonschema
# jupyter-cache
# referencing
babel==2.17.0
# via
# pydata-sphinx-theme
# sphinx
beautifulsoup4==4.13.4
# via pydata-sphinx-theme
breathe==4.36.0
# via rocm-docs-core
certifi==2025.4.26
# via requests
cffi==2.0.0
# via
# cryptography
# pynacl
charset-normalizer==3.4.2
# via requests
click==8.1.8
# via
# jupyter-cache
# sphinx-external-toc
comm==0.2.2
# via ipykernel
cryptography==44.0.3
# via pyjwt
debugpy==1.8.14
# via ipykernel
decorator==5.2.1
# via ipython
deprecated==1.2.18
# via pygithub
docutils==0.21.2
# via
# myst-parser
# pydata-sphinx-theme
# sphinx
exceptiongroup==1.2.2
# via ipython
executing==2.2.0
# via stack-data
fastjsonschema==2.21.1
# via
# nbformat
# rocm-docs-core
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
# via rocm-docs-core
greenlet==3.2.1
# via sqlalchemy
idna==3.10
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==8.7.0
# via
# jupyter-cache
# myst-nb
ipykernel==6.29.5
# via myst-nb
ipython==8.36.0
# via
# ipykernel
# myst-nb
jedi==0.19.2
# via ipython
jinja2==3.1.6
# via
# myst-parser
# sphinx
jsonschema==4.23.0
# via nbformat
jsonschema-specifications==2025.4.1
# via jsonschema
jupyter-cache==1.0.1
# via myst-nb
jupyter-client==8.6.3
# via
# ipykernel
# nbclient
jupyter-core==5.7.2
# via
# ipykernel
# jupyter-client
# nbclient
# nbformat
markdown-it-py==3.0.0
# via
# mdit-py-plugins
# myst-parser
markupsafe==3.0.2
# via jinja2
matplotlib-inline==0.1.7
# via
# ipykernel
# ipython
mdit-py-plugins==0.4.2
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
myst-nb==1.2.0
# via rocm-docs-core
myst-parser==4.0.1
# via myst-nb
nbclient==0.10.2
# via
# jupyter-cache
# myst-nb
nbformat==5.10.4
# via
# jupyter-cache
# myst-nb
# nbclient
nest-asyncio==1.6.0
# via ipykernel
packaging==25.0
# via
# ipykernel
# pydata-sphinx-theme
# sphinx
parso==0.8.4
# via jedi
pexpect==4.9.0
# via ipython
platformdirs==4.3.8
# via jupyter-core
prompt-toolkit==3.0.51
# via ipython
psutil==7.0.0
# via ipykernel
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.3
# via stack-data
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.15.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.6.1
# via rocm-docs-core
pygments==2.19.1
# via
# accessible-pygments
# ipython
# pydata-sphinx-theme
# sphinx
pyjwt[crypto]==2.10.1
# via pygithub
pynacl==1.6.2
# via pygithub
python-dateutil==2.9.0.post0
# via jupyter-client
pyyaml==6.0.2
# via
# jupyter-cache
# myst-nb
# myst-parser
# rocm-docs-core
# sphinx-external-toc
pyzmq==26.4.0
# via
# ipykernel
# jupyter-client
referencing==0.36.2
# via
# jsonschema
# jsonschema-specifications
requests==2.32.4
# via
# pygithub
# sphinx
rocm-docs-core==1.31.2
# via -r requirements.in
rpds-py==0.24.0
# via
# jsonschema
# referencing
six==1.17.0
# via python-dateutil
smmap==5.0.2
# via gitdb
snowballstemmer==3.0.0.1
# via sphinx
soupsieve==2.7
# via beautifulsoup4
sphinx==8.1.3
# via
# breathe
# myst-nb
# myst-parser
# pydata-sphinx-theme
# rocm-docs-core
# sphinx-book-theme
# sphinx-copybutton
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
sphinx-book-theme==1.1.4
# via rocm-docs-core
sphinx-copybutton==0.5.2
# via rocm-docs-core
sphinx-design==0.6.1
# via rocm-docs-core
sphinx-external-toc==1.0.1
# via rocm-docs-core
sphinx-notfound-page==1.1.0
# via rocm-docs-core
sphinxcontrib-applehelp==2.0.0
# via sphinx
sphinxcontrib-devhelp==2.0.0
# via sphinx
sphinxcontrib-htmlhelp==2.1.0
# via sphinx
sphinxcontrib-jsmath==1.0.1
# via sphinx
sphinxcontrib-qthelp==2.0.0
# via sphinx
sphinxcontrib-serializinghtml==2.0.0
# via sphinx
sqlalchemy==2.0.40
# via jupyter-cache
stack-data==0.6.3
# via ipython
tabulate==0.9.0
# via jupyter-cache
tomli==2.2.1
# via sphinx
tornado==6.5.1
# via
# ipykernel
# jupyter-client
traitlets==5.14.3
# via
# comm
# ipykernel
# ipython
# jupyter-client
# jupyter-core
# matplotlib-inline
# nbclient
# nbformat
typing-extensions==4.13.2
# via
# beautifulsoup4
# ipython
# myst-nb
# pydata-sphinx-theme
# pygithub
# referencing
# sqlalchemy
urllib3==2.6.3
# via
# pygithub
# requests
wcwidth==0.2.13
# via prompt-toolkit
wrapt==1.17.2
# via deprecated
zipp==3.21.0
# via importlib-metadata
+65
Visa fil
@@ -0,0 +1,65 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Build script for the rocSHMEM example programs: one executable is produced
# per source file listed in EXAMPLE_SOURCES.
cmake_minimum_required(VERSION 3.16.3 FATAL_ERROR)
###############################################################################
# PROJECT
###############################################################################
# NOTE(review): setup_project.cmake is resolved relative to the top-level
# source tree (CMAKE_SOURCE_DIR) and is included before project(); its
# contents are not visible here -- confirm it must run ahead of project().
include(${CMAKE_SOURCE_DIR}/cmake/setup_project.cmake)
project(rocshmem_examples VERSION 1.0.0 LANGUAGES CXX)
# MPI is looked up without REQUIRED, so the examples still configure when it
# is absent; hip and rocshmem are mandatory.
find_package(MPI)
find_package(hip REQUIRED PATHS /opt/rocm)
# Only search for an installed rocshmem package when the roc::rocshmem target
# has not already been defined by the time this file is processed.
if (NOT TARGET roc::rocshmem)
find_package(rocshmem REQUIRED PATHS /opt/rocm)
endif()
###############################################################################
# SOURCES
###############################################################################
# Examples that are always built.
set(EXAMPLE_SOURCES
rocshmem_allreduce_test.cc
rocshmem_alltoall_test.cc
rocshmem_broadcast_test.cc
rocshmem_getmem_test.cc
rocshmem_put_signal_test.cc
)
# rocshmem_init_attr_test is added only when an MPI C++ installation was found.
if (MPI_CXX_FOUND)
list(APPEND EXAMPLE_SOURCES
rocshmem_init_attr_test.cc)
endif()
# One executable per source file, named after the file without its extension.
foreach(SOURCE_FILE IN LISTS EXAMPLE_SOURCES)
get_filename_component(EXECUTABLE_NAME ${SOURCE_FILE} NAME_WE)
add_executable(${EXECUTABLE_NAME} ${SOURCE_FILE})
# The generator expression expands to MPI::MPI_CXX when that target exists and
# to an empty string otherwise, so MPI is linked only when available.
target_link_libraries(
${EXECUTABLE_NAME}
PRIVATE
roc::rocshmem
$<TARGET_NAME_IF_EXISTS:MPI::MPI_CXX>
)
endforeach()
@@ -0,0 +1,155 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_allreduce_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_allreduce_test.o -o rocshmem_allreduce_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_allreduce_test
*/
#include <rocshmem/rocshmem.hpp>
#include "util.h"
using namespace rocshmem;
/**
 * Kernel that performs an integer sum-reduction over all PEs of `team`.
 *
 * The host driver in this file launches it with a single workgroup. The
 * workgroup (`_wg`) routines used here are called by every thread of the
 * workgroup with identical arguments.
 *
 * @param source  Input buffer holding `nelem` ints (allocated by the host
 *                driver with rocshmem_malloc).
 * @param dest    Output buffer holding `nelem` ints (allocated the same way).
 * @param nelem   Number of elements to reduce.
 * @param team    Team whose PEs take part in the reduction.
 */
__global__ void allreduce_test(int *source, int *dest, size_t nelem,
                               rocshmem_team_t team) {
  // Shared so that all threads of the workgroup use the same context handle.
  __shared__ rocshmem_ctx_t ctx;
  int64_t ctx_type = 0;

  rocshmem_wg_ctx_create(ctx_type, &ctx);

  rocshmem_ctx_int_sum_reduce_wg(ctx, team, dest, source, nelem);

  // Quiet the context before it is torn down.
  rocshmem_ctx_quiet(ctx);

  // Every thread must be past the quiet before the shared context is destroyed.
  __syncthreads();

  rocshmem_wg_ctx_destroy(&ctx);
}
/**
 * Fill the send buffer with the pattern source[i] = my_pe + (i mod 9).
 *
 * @param source  Buffer with room for at least `nelem` ints.
 * @param nelem   Number of elements to write; nothing is written if <= 0.
 * @param my_pe   PE number used as the base value of the pattern.
 */
static void init_sendbuf (int *source, int nelem, int my_pe)
{
  int idx = 0;
  while (idx < nelem) {
    const int offset = idx % 9;
    source[idx] = my_pe + offset;
    ++idx;
  }
}
/**
 * Verify the result of the sum-reduction.
 *
 * Each PE contributes my_pe + (i mod 9) at index i, so the expected sum is
 * (0 + 1 + ... + npes-1) + npes * (i mod 9).
 *
 * @param dest   Buffer holding the reduced values.
 * @param nelem  Number of elements to check.
 * @param my_pe  Unused; the expected value does not depend on the calling PE.
 * @param npes   Number of PEs that took part in the reduction.
 * @return true if every element matches, false otherwise. All elements are
 *         always inspected so that a VERBOSE build reports every mismatch.
 */
static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes)
{
  const int pe_sum = npes * (npes -1) / 2;
  bool all_match = true;

  for (int idx = 0; idx < nelem; ++idx) {
    const int want = pe_sum + npes * (idx%9);
    if (dest[idx] == want) {
      continue;
    }
    all_match = false;
#ifdef VERBOSE
    printf("recvbuf[%d] = %d expected %d \n", idx, dest[idx], want);
#endif
  }

  return all_match;
}
#define MAX_ELEM 256
int main (int argc, char **argv)
{
int nelem = MAX_ELEM;
if (argc > 1) {
nelem = atoi(argv[1]);
}
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
rocshmem_init();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem << std::endl;
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe);
for (int i=0; i<nelem; i++) {
dest[i] = -1;
}
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
int threadsPerBlock=256;
allreduce_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(source, dest,
nelem, team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
bool pass = check_recvbuf(dest, nelem, my_pe, npes);
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
rocshmem_free(source);
rocshmem_free(dest);
rocshmem_finalize();
return 0;
}
@@ -0,0 +1,161 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_alltoall_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_alltoall_test.o -o rocshmem_alltoall_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_alltoall_test
*/
#include <rocshmem/rocshmem.hpp>
#include "util.h"
using namespace rocshmem;
/**
 * Kernel that performs an integer all-to-all exchange over the PEs of `team`.
 *
 * The host driver in this file launches it with a single workgroup. The
 * workgroup (`_wg`) routines used here are called by every thread of the
 * workgroup with identical arguments.
 *
 * @param source  Send buffer (allocated by the host driver with
 *                rocshmem_malloc, `nelem` ints per PE).
 * @param dest    Receive buffer (allocated the same way).
 * @param nelem   Number of elements exchanged with each PE.
 * @param team    Team whose PEs take part in the exchange.
 */
__global__ void alltoall_test(int *source, int *dest, size_t nelem,
                              rocshmem_team_t team) {
  // Shared so that all threads of the workgroup use the same context handle.
  __shared__ rocshmem_ctx_t ctx;
  int64_t ctx_type = 0;

  rocshmem_wg_ctx_create(ctx_type, &ctx);

  rocshmem_ctx_int_alltoall_wg(ctx, team, dest, source, nelem);

  // Quiet the context before it is torn down.
  rocshmem_ctx_quiet(ctx);

  // Every thread must be past the quiet before the shared context is destroyed.
  __syncthreads();

  rocshmem_wg_ctx_destroy(&ctx);
}
/**
 * Fill the all-to-all send buffer.
 *
 * The buffer is treated as `npes` consecutive chunks of `nelem` ints; every
 * element of chunk `pe` is set to my_pe + pe.
 *
 * @param source  Buffer with room for at least nelem * npes ints.
 * @param nelem   Number of elements per chunk.
 * @param my_pe   PE number of the caller.
 * @param npes    Number of chunks (one per PE).
 */
static void init_sendbuf (int *source, int nelem, int my_pe, int npes)
{
  for (int chunk = 0; chunk < npes; ++chunk) {
    const int base = chunk * nelem;
    const int fill = my_pe + chunk;
    for (int k = 0; k < nelem; ++k) {
      source[base + k] = fill;
    }
  }
}
/**
 * Verify the result of the all-to-all exchange.
 *
 * The receive buffer is treated as `npes` consecutive chunks of `nelem` ints;
 * every element of chunk `pe` is expected to equal my_pe + pe.
 *
 * @param dest   Receive buffer holding nelem * npes ints.
 * @param nelem  Number of elements per chunk.
 * @param my_pe  PE number of the caller.
 * @param npes   Number of chunks (one per PE).
 * @return true if every element matches, false otherwise. All elements are
 *         always inspected so that a VERBOSE build reports every mismatch.
 */
static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes)
{
  bool res=true;

  for(int pe = 0; pe < npes; pe++) {
    const int result = my_pe + pe;
    for(int i = 0; i < nelem; i++) {
      const int idx = (pe * nelem) + i;
      if (dest[idx] != result) {
        res = false;
#ifdef VERBOSE
        // Report the element that was actually compared (flat index idx),
        // not the position inside the current chunk.
        printf("recvbuf[%d] = %d expected %d \n", idx, dest[idx], result);
#endif
      }
    }
  }

  return res;
}
#define MAX_ELEM 256
int main (int argc, char **argv)
{
int nelem = MAX_ELEM;
if (argc > 1) {
nelem = atoi(argv[1]);
}
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
rocshmem_init();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int *source = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * npes * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem * npes << std::endl;
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe, npes);
for (int i = 0; i < nelem * npes; i++) {
dest[i] = -1;
}
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
int threadsPerBlock=256;
alltoall_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(source, dest,
nelem, team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
bool pass = check_recvbuf(dest, nelem, my_pe, npes);
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
rocshmem_free(source);
rocshmem_free(dest);
rocshmem_finalize();
return 0;
}
@@ -0,0 +1,156 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_broadcast_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_broadcast_test.o -o rocshmem_broadcast_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_broadcast_test
*/
#include <rocshmem/rocshmem.hpp>
#include "util.h"
using namespace rocshmem;
/**
 * Work-group broadcast example kernel.
 *
 * Creates a work-group context, runs the int broadcast collective over
 * `team` with `root` as the root PE (reading from `source`, writing to
 * `dest`), completes it with a quiet on that context and finally destroys
 * the context.
 *
 * @param[in]  source Source buffer passed to the collective.
 * @param[out] dest   Destination buffer passed to the collective.
 * @param[in]  nelem  Element count handed to the collective.
 * @param[in]  root   Root PE handed to the collective.
 * @param[in]  team   Team participating in the collective.
 */
__global__ void broadcast_test(int *source, int *dest, size_t nelem,
                               int root, rocshmem_team_t team) {
    // The handle is kept in shared memory so the whole work-group uses the
    // same context object.
    __shared__ rocshmem_ctx_t ctx;
    int64_t ctx_type = 0;
    rocshmem_wg_ctx_create(ctx_type, &ctx);

    rocshmem_ctx_int_broadcast_wg(ctx, team, dest, source, nelem, root);
    rocshmem_ctx_quiet(ctx);

    // All threads must be finished with the context before it is destroyed.
    __syncthreads();
    rocshmem_wg_ctx_destroy(&ctx);
}
/**
 * Fill the broadcast source buffer with the sequence 0, 1, ..., nelem - 1.
 * `my_pe` is accepted for symmetry with the other examples but is not used.
 */
static void init_sendbuf(int *source, int nelem, int my_pe)
{
    int next = 0;
    while (next < nelem) {
        source[next] = next;
        ++next;
    }
}
/**
 * Validate the broadcast destination buffer.
 *
 * Every one of the `nelem` elements is expected to hold its own index, which
 * is the pattern written by init_sendbuf(). `npes` is kept in the signature
 * for compatibility but does not bound the check: the buffer holds `nelem`
 * elements regardless of the number of PEs.
 *
 * @return true when all `nelem` elements match, false otherwise. With
 *         VERBOSE defined, each mismatching element is also printed.
 */
static bool check_recvbuf(int *dest, int nelem, int my_pe, int npes)
{
    bool res = true;
    // Iterate over the buffer length, not the PE count: bounding by npes
    // skipped the tail of the buffer and overran it when npes > nelem.
    for (int i = 0; i < nelem; i++) {
        if (dest[i] != i) {
            res = false;
#ifdef VERBOSE
            printf("PE: %d, dest[%d] = %d, expected %d \n", my_pe, i, dest[i], i);
#endif
        }
    }
    return res;
}
#define MAX_ELEM 256
int main(int argc, char **argv)
{
int nelem = MAX_ELEM;
if (argc > 1) {
nelem = atoi(argv[1]);
}
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
rocshmem_init();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int *source = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dest = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == source || NULL == dest) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << source << ", dest: " << dest << ", size: "
<< sizeof(int) * nelem << std::endl;
rocshmem_global_exit(1);
}
init_sendbuf(source, nelem, my_pe);
for (int i=0; i<nelem; i++) {
dest[i] = -1;
}
int root = 0;
rocshmem_team_t team_reduce_world_dup;
team_reduce_world_dup = ROCSHMEM_TEAM_INVALID;
rocshmem_team_split_strided(ROCSHMEM_TEAM_WORLD, 0, 1, npes, nullptr, 0,
&team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
int threadsPerBlock=256;
broadcast_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(source, dest,
nelem, root, team_reduce_world_dup);
CHECK_HIP(hipDeviceSynchronize());
if(my_pe != root) {
bool pass = check_recvbuf(dest, nelem, my_pe, npes);
printf("Test %s \t nelem %d %s\n", argv[0], nelem, pass ? "[PASS]" : "[FAIL]");
}
rocshmem_free(source);
rocshmem_free(dest);
rocshmem_finalize();
return 0;
}
@@ -0,0 +1,129 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_getmem_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_getmem_test.o -o rocshmem_getmem_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_getmem_test
*/
#include <rocshmem/rocshmem.hpp>
#include "util.h"
using namespace rocshmem;
/**
 * Single-thread getmem example kernel.
 *
 * Global thread 0 fetches `nelem` ints from `src` on the peer PE into the
 * local `dst` and completes the transfer with a quiet. PE 0 uses PE 1 as its
 * peer; every other PE uses PE 0. All threads meet at the final barrier.
 */
__global__ void simple_getmem_test(int *src, int *dst, size_t nelem)
{
    const bool is_first_thread =
        (blockIdx.x * blockDim.x + threadIdx.x) == 0;

    if (is_first_thread) {
        // PE 0 reads from PE 1, everybody else reads from PE 0.
        const int peer = (rocshmem_my_pe() == 0) ? 1 : 0;
        rocshmem_getmem(dst, src, nelem * sizeof(int), peer);
        rocshmem_quiet();
    }

    __syncthreads();
}
#define MAX_ELEM 256
int main (int argc, char **argv)
{
int nelem = MAX_ELEM;
if (argc > 1) {
nelem = atoi(argv[1]);
}
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
rocshmem_init();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int *src = (int *)rocshmem_malloc(nelem * sizeof(int));
int *dst = (int *)rocshmem_malloc(nelem * sizeof(int));
if (NULL == src || NULL == dst) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "source: " << src << ", dest: " << dst << ", size: "
<< sizeof(int) * nelem << std::endl;
rocshmem_global_exit(1);
}
for (int i=0; i<nelem; i++) {
src[i] = 0;
dst[i] = 1;
}
CHECK_HIP(hipDeviceSynchronize());
int threadsPerBlock=256;
simple_getmem_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(src, dst, nelem);
rocshmem_barrier_all();
CHECK_HIP(hipDeviceSynchronize());
bool pass = true;
for (int i=0; i<nelem; i++) {
if (dst[i] != 0) {
pass = false;
#if VERBOSE
printf("[%d] Error in element %d expected 0 got %d\n", my_pe, i, dst[i]);
#endif
}
}
printf("Test %s \t %s\n", argv[0], pass ? "[PASS]" : "[FAIL]");
rocshmem_free(src);
rocshmem_free(dst);
rocshmem_finalize();
return 0;
}
@@ -0,0 +1,121 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_init_attr_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_init_attr_test.o -o rocshmem_init_attr_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_init_attr_test
* Note:
running this test with the Reverse Offload (RO) conduit requires setting
ROCSHMEM_UNIQUEID_WITH_MPI=1
*/
#include <rocshmem/rocshmem.hpp>
#include <mpi.h>
#include "util.h"
using namespace rocshmem;
int main (int argc, char **argv)
{
int world_rank, world_nranks;
int ret;
rocshmem_uniqueid_t uid;
rocshmem_init_attr_t attr;
int provided;
MPI_Init_thread (&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
if (provided != MPI_THREAD_MULTIPLE) {
std::cerr << "MPI_THREAD_MULTIPLE support disabled.\n";
}
MPI_Comm_rank (MPI_COMM_WORLD, &world_rank);
MPI_Comm_size (MPI_COMM_WORLD, &world_nranks);
// Create two disjoint groups of processes, each
// one creating a unique rocshmem environment independent
// of the other group
MPI_Comm newcomm;
int color = world_rank %2;
int rank, nranks;
MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &newcomm);
MPI_Comm_rank (newcomm, &rank);
MPI_Comm_size (newcomm, &nranks);
if (rank == 0) {
ret = rocshmem_get_uniqueid (&uid);
if (ret != ROCSHMEM_SUCCESS) {
std::cout << rank << ": Error in rocshmem_get_uniqueid. Aborting.\n";
MPI_Abort (MPI_COMM_WORLD, ret);
}
}
MPI_Bcast (&uid, sizeof(rocshmem_uniqueid_t), MPI_BYTE, 0, newcomm);
ret = rocshmem_set_attr_uniqueid_args(rank, nranks, &uid, &attr);
if (ret != ROCSHMEM_SUCCESS) {
std::cout << rank << ": Error in rocshmem_set_attr_uniqueid_args. Aborting.\n";
MPI_Abort (MPI_COMM_WORLD, ret);
}
ret = rocshmem_init_attr(ROCSHMEM_INIT_WITH_UNIQUEID, &attr);
if (ret != ROCSHMEM_SUCCESS) {
std::cout << rank << ": Error in rocshmem_init_attr. Aborting.\n";
MPI_Abort (MPI_COMM_WORLD, ret);
}
std::cout << rank << ": rocshmem_init_attr SUCCESS\n";
rocshmem_finalize();
MPI_Comm_free (&newcomm);
MPI_Finalize();
return 0;
}
@@ -0,0 +1,140 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
/*
* First find your offload target, and if xnack is enabled/disabled using
rocminfo | grep amdgcn
* It should output a string like so:
"Name: amdgcn-amd-amdhsa--gfx942:sramecc+:xnack-"
* This lists the offload target (gfx942) and that xnack is disabled (xnack-).
* Therefore, we need to specify --offload-arch=gfx942:xnack- to our link and compile commands.
* Please modify the compile and link commands to suit your system
* To compile:
hipcc -c -fgpu-rdc -x hip rocshmem_put_signal_test.cc \
--offload-arch=<target>:<xnack> \
-I/opt/rocm/include \
-I$ROCSHMEM_INSTALL_DIR/include \
-I$OPENMPI_UCX_INSTALL_DIR/include/
* To link:
hipcc -fgpu-rdc --hip-link rocshmem_put_signal_test.o -o rocshmem_put_signal_test \
--offload-arch=<target>:<xnack> \
$ROCSHMEM_INSTALL_DIR/lib/librocshmem.a \
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
* To run:
mpirun -np 8 -x ROCSHMEM_MAX_NUM_CONTEXTS=2 ./rocshmem_put_signal_test
*/
#include <rocshmem/rocshmem.hpp>
#include "util.h"
using namespace rocshmem;
/**
 * Single-thread put-with-signal example kernel.
 *
 * Global thread 0 performs one put-with-signal of `nelem` elements into
 * `data` on `dst_pe`, setting the signal at `sig_addr` to 1. PE 0 sends
 * `message` immediately; every other PE first waits until its own signal
 * equals 1 and then forwards its `data` buffer. All threads meet at the final
 * barrier.
 */
__global__ void simple_put_signal_test(uint64_t *data, uint64_t *message, size_t nelem,
                                       uint64_t *sig_addr, int my_pe, int dst_pe)
{
    const bool is_first_thread =
        (blockIdx.x * blockDim.x + threadIdx.x) == 0;

    if (is_first_thread) {
        // PE 0 starts the chain with its message buffer.
        uint64_t *payload = message;
        if (my_pe != 0) {
            // Other PEs wait for the incoming signal, then pass on what they received.
            rocshmem_ulong_wait_until(sig_addr, ROCSHMEM_CMP_EQ, 1);
            payload = data;
        }
        rocshmem_ulong_put_signal(data, payload, nelem, sig_addr, 1, ROCSHMEM_SIGNAL_SET, dst_pe);
    }

    __syncthreads();
}
#define MAX_ELEM 256
int main (int argc, char **argv)
{
int nelem = MAX_ELEM;
if (argc > 1) {
nelem = atoi(argv[1]);
}
CHECK_HIP(hipSetDevice(get_launcher_local_rank()));
rocshmem_init();
int my_pe = rocshmem_my_pe();
int npes = rocshmem_n_pes();
int dst_pe = (my_pe + 1) % npes;
uint64_t *message = (uint64_t*)rocshmem_malloc(nelem * sizeof(uint64_t));
uint64_t *data = (uint64_t*)rocshmem_malloc(nelem * sizeof(uint64_t));
uint64_t *sig_addr = (uint64_t*)rocshmem_malloc(sizeof(uint64_t));
if (NULL == data || NULL == message || NULL == sig_addr) {
std::cout << "Error allocating memory from symmetric heap" << std::endl;
std::cout << "data: " << data
<< ", message: " << message
<< ", size: " << sizeof(uint64_t) * nelem
<< ", sig_addr: " << sig_addr
<< std::endl;
rocshmem_global_exit(1);
}
for (int i=0; i<nelem; i++) {
message[i] = my_pe;
}
CHECK_HIP(hipMemset(data, 0, (nelem * sizeof(uint64_t))));
CHECK_HIP(hipDeviceSynchronize());
int threadsPerBlock=256;
simple_put_signal_test<<<dim3(1), dim3(threadsPerBlock), 0, 0>>>(data, message, nelem, sig_addr, my_pe, dst_pe);
rocshmem_barrier_all();
CHECK_HIP(hipDeviceSynchronize());
bool pass = true;
for (int i=0; i<nelem; i++) {
if (data[i] != 0) {
pass = false;
#if VERBOSE
printf("[%d] Error in element %d expected 0 got %d\n", my_pe, i, dst[i]);
#endif
}
}
printf("[%d] Test %s \t %s\n", my_pe, argv[0], pass ? "[PASS]" : "[FAIL]");
rocshmem_free(data);
rocshmem_free(message);
rocshmem_finalize();
return 0;
}
+52
Visa fil
@@ -0,0 +1,52 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef __ROCSHMEM_EXAMPLES_UTIL_H__
#define __ROCSHMEM_EXAMPLES_UTIL_H__
#include <iostream>
#include <hip/hip_runtime_api.h>
#include <hip/hip_runtime.h>
/*
 * Evaluate a HIP runtime call once and terminate the process if it did not
 * return hipSuccess, reporting the numeric code, its description and the
 * call site on stderr. The process exit status is the HIP error code.
 *
 * The body is wrapped in do { } while (0) so that `CHECK_HIP(x);` behaves as
 * a single statement (e.g. inside an unbraced if/else), and the temporary
 * uses a macro-private name so it cannot capture a caller variable that
 * appears in `condition`.
 */
#define CHECK_HIP(condition)                                                   \
    do {                                                                       \
        hipError_t check_hip_status_ = (condition);                            \
        if (check_hip_status_ != hipSuccess) {                                 \
            fprintf(stderr, "HIP error: %d (%s) file: %s line: %d\n",          \
                    static_cast<int>(check_hip_status_),                       \
                    hipGetErrorString(check_hip_status_), __FILE__, __LINE__); \
            exit(check_hip_status_);                                           \
        }                                                                      \
    } while (0)
/*
 * Return the value of the OMPI_COMM_WORLD_LOCAL_RANK environment variable
 * converted with atoi(), or -1 when the variable is not set.
 */
static int get_launcher_local_rank() {
    const char *env_value = getenv("OMPI_COMM_WORLD_LOCAL_RANK");
    return (env_value == nullptr) ? -1 : atoi(env_value);
}
#endif /* __ROCSHMEM_EXAMPLES_UTIL_H__ */
@@ -0,0 +1,678 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_HPP
#include <hip/hip_runtime.h>
#include "rocshmem_config.h"
#include "rocshmem_common.hpp"
#include "rocshmem_RMA.hpp"
#include "rocshmem_AMO.hpp"
#include "rocshmem_SIG_OP.hpp"
#include "rocshmem_COLL.hpp"
#include "rocshmem_P2P_SYNC.hpp"
#include "rocshmem_RMA_X.hpp"
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
/**
* @file rocshmem.hpp
* @brief Public header for rocSHMEM device and host libraries.
*
* This file contains all the callable functions and data structures for both
* the device-side runtime and host-side runtime.
*
* The comments on these functions are sparse, but the semantics are the same
* as those implemented in OpenSHMEM unless otherwise documented. Please see
* the OpenSHMEM 1.4 standards documentation for more details:
*
* http://openshmem.org/site/sites/default/site_files/OpenSHMEM-1.4.pdf
*/
namespace rocshmem {
constexpr char VERSION[] = "3.2.1";
/******************************************************************************
**************************** HOST INTERFACE **********************************
*****************************************************************************/
#if defined(HAVE_EXTERNAL_MPI)
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer.
*
* @param[in] comm MPI Communicator that rocSHMEM will be using
* If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD
*/
[[deprecated]] __host__ void rocshmem_init(MPI_Comm comm);
#endif
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer.
* This is equivalent to the previous function, implicitly using
* MPI_COMM_WORLD for initialization.
*/
__host__ void rocshmem_init(void);
/**
* @brief Query rocSHMEM context from host API
*
* @return ROCSHMEM_CTX_DEFAULT device pointer that users can query from one
* instance of the rocshmem host library and use later for dynamic
* module initialization in the kernel bitcode device library in the
* same application
*/
__host__ void * rocshmem_get_device_ctx();
/**
* @brief Query rocSHMEM remote symmetric heap pointer
*
* @param[in] dest local symmetric heap allocation pointer for current pe/device
*
* @param[in] pe remote PE
*
* @return Remote symmetric heap device pointer, obtained from the host-side
* API. This can be used to issue load/store from custom kernels
* instead of using rocshmem device side get/put APIs for RMA operations.
*/
__host__ void* rocshmem_ptr(const void *dest, int pe);
__device__ ATTR_NO_INLINE void* rocshmem_ptr(const void *dest, int pe);
#if defined(HAVE_EXTERNAL_MPI)
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer
* with an attempt to enable the requested thread support.
*
* @param[in] requested Requested thread mode (from rocshmem_thread_ops)
* for host-facing functions.
* @param[out] provided Thread mode selected by the runtime. May not be equal
* to requested thread mode.
* @param[in] comm (Optional) MPI Communicator that rocSHMEM will be using
* If MPI_COMM_NULL, rocSHMEM will be using MPI_COMM_WORLD
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
[[deprecated]] __host__ int rocshmem_init_thread(int requested, int *provided,
MPI_Comm comm);
#endif
/**
* @brief Initialize the rocSHMEM runtime and underlying transport layer
* using the provided mode and attributes
*
* @param[in] flags initialization method to be used.
* Valid values are ROCSHMEM_INIT_WITH_UNIQUEID and
* ROCSHMEM_INIT_WITH_MPI_COMM
* @param[in] attr attribute structure specifying input characteristics
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_init_attr(unsigned int flags, rocshmem_init_attr_t *attr);
/**
* @brief Return a uniqueID
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_get_uniqueid(rocshmem_uniqueid_t *uid);
/**
* @brief Initializes the rocshmem_init_attr_t struct
*
* @param[in] rank rank of the calling process
* @param[in] nranks number of pes
* @param[in] uid unique ID used to identify the group of processes.
* All processes that join the same group must pass
* the same unique ID.
* @param[out] attr attribute structure to be passed to rocshmem_init_attr
*
* @return int returns 0 upon success; otherwise, it returns a nonzero
* value
*/
__host__ int rocshmem_set_attr_uniqueid_args(int rank, int nranks,
rocshmem_uniqueid_t *uid,
rocshmem_init_attr_t *attr);
/**
* @brief Query the thread mode used by the runtime.
*
* @param[out] provided Thread mode the runtime is operating in.
*
* @return void.
*/
__host__ void rocshmem_query_thread(int *provided);
/**
* @brief Function that dumps internal stats to stdout.
*/
__host__ void rocshmem_dump_stats();
/**
* @brief Reset all internal stats.
*/
__host__ void rocshmem_reset_stats();
/**
* @brief Finalize the rocSHMEM runtime.
*/
__host__ void rocshmem_finalize();
/**
* @brief Allocate memory of \p size bytes from the symmetric heap.
* This is a collective operation and must be called by all PEs.
*
* @param[in] size Memory allocation size in bytes.
*
* @return A pointer to the allocated memory on the symmetric heap.
*
* @todo Return error code instead of ptr.
*/
__host__ void *rocshmem_malloc(size_t size);
/**
* @brief Free a memory allocation from the symmetric heap.
* This is a collective operation and must be called by all PEs.
*
* @param[in] ptr Pointer to previously allocated memory on the symmetric heap.
*/
__host__ void rocshmem_free(void *ptr);
/**
* @brief Query for the number of PEs.
*
* @return Number of PEs.
*/
__host__ int rocshmem_n_pes();
/**
* @brief Query the PE ID of the caller.
*
* @return PE ID of the caller.
*/
__host__ int rocshmem_my_pe();
/**
* @brief Creates an OpenSHMEM context.
*
* @param[in] options Options for context creation. Ignored in current design.
* @param[out] ctx Context handle.
*
* @return Zero on success and nonzero otherwise.
*/
__host__ int rocshmem_ctx_create(int64_t options, rocshmem_ctx_t *ctx);
/**
* @brief Destroys an OpenSHMEM context.
*
* @param[in] ctx Context handle.
*
* @return void.
*/
__host__ void rocshmem_ctx_destroy(rocshmem_ctx_t ctx);
/**
* @brief Translate the PE in src_team to that in dest_team.
*
* @param[in] src_team Handle of the team from which to translate
* @param[in] src_pe PE-of-interest's index in src_team
* @param[in] dest_team Handle of the team to which to translate
*
* @return PE of src_pe in dest_team. If any input is invalid
* or if src_pe is not in both source and destination
* teams, a value of -1 is returned.
*/
__host__ int rocshmem_team_translate_pe(rocshmem_team_t src_team, int src_pe,
rocshmem_team_t dest_team);
/**
* @brief Query the number of PEs in a team.
*
* @param[in] team The team whose number of PEs is queried.
*
* @return Number of PEs in the provided team.
*/
__host__ int rocshmem_team_n_pes(rocshmem_team_t team);
/**
* @brief Query the PE ID of the caller in a team.
*
* @param[in] team The team to query PE ID in.
*
* @return PE ID of the caller in the provided team.
*/
__host__ int rocshmem_team_my_pe(rocshmem_team_t team);
/**
* @brief Create a new team of PEs. Must be called by all PEs
* in the parent team.
*
* @param[in] parent_team The team to split from.
* @param[in] start The lowest PE number of the subset of the PEs
* from the parent team that will form the new
* team.
* @param[in] stride The stride between team PE members in the
* parent team that comprise the subset of PEs
* that will form the new team.
* @param[in] size The number of PEs in the new team.
* @param[in] config Pointer to the config parameters for the new
* team.
* @param[in] config_mask Bitwise mask representing parameters to use
* from config
* @param[out] new_team Pointer to the newly created team. If an error
* occurs during team creation, or if the PE in
* the parent team is not in the new team, the
* value will be ROCSHMEM_TEAM_INVALID.
*
* @return Zero upon successful team creation; non-zero if erroneous.
*/
__host__ int rocshmem_team_split_strided(rocshmem_team_t parent_team,
int start, int stride, int size,
const rocshmem_team_config_t *config,
long config_mask,
rocshmem_team_t *new_team);
/**
* @brief Destroy a team. Must be called by all PEs in the team.
* The user must destroy all private contexts created in the
* team before destroying this team. Otherwise, the behavior
* is undefined. This call will destroy only the shareable contexts
* created from the referenced team.
*
* @param[in] team The team to destroy. The behavior is undefined if
* the input team is ROCSHMEM_TEAM_WORLD or any other
* invalid team. If the input is ROCSHMEM_TEAM_INVALID,
* this function will not perform any operation.
*
* @return None.
*/
__host__ void rocshmem_team_destroy(rocshmem_team_t team);
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__host__ void rocshmem_ctx_fence(rocshmem_ctx_t ctx);
__host__ void rocshmem_fence();
/**
* @brief Completes all previous operations posted on the host.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__host__ void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);
__host__ void rocshmem_quiet();
/**
* @brief perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* @return void
*/
__host__ void rocshmem_barrier_all();
/**
* @brief enqueues a collective barrier on given stream.
*
* @return void
*/
__host__ void rocshmem_barrier_all_on_stream(hipStream_t stream);
/**
* @brief enqueues an alltoall collective operation on given stream.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] size Number of bytes to transfer per pair of PEs.
* @param[in] stream HIP stream on which to enqueue the operation.
*
* @return void
*/
__host__ void rocshmem_alltoallmem_on_stream(rocshmem_team_t team, void *dest,
const void *source, size_t size,
hipStream_t stream);
/**
* @brief enqueues a broadcast collective operation on given stream.
*
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of bytes to broadcast.
* @param[in] pe_root Root PE (relative to team) from which to broadcast.
* @param[in] stream HIP stream on which to enqueue the operation.
*
* @return void
*/
__host__ void rocshmem_broadcastmem_on_stream(rocshmem_team_t team, void *dest,
const void *source, size_t nelems,
int pe_root, hipStream_t stream);
/**
* @brief enqueues a getmem RMA operation on given stream.
*
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
* @param[in] stream HIP stream on which to enqueue the operation.
*
* @return void
*/
__host__ void rocshmem_getmem_on_stream(void *dest, const void *source,
size_t nelems, int pe,
hipStream_t stream);
/**
* @brief enqueues a putmem RMA operation on given stream.
*
* @param[in] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Size of the transfer in bytes.
* @param[in] pe PE of the remote process.
* @param[in] stream HIP stream on which to enqueue the operation.
*
* @return void
*/
__host__ void rocshmem_putmem_on_stream(void *dest, const void *source,
size_t nelems, int pe,
hipStream_t stream);
/**
* @brief Perform a put operation with signal on a HIP stream.
*
* This routine initiates a remote memory transfer on a specified HIP stream.
* The source data is copied from the local PE to the remote PE's destination
* address. After the put operation completes, a signal operation is performed
* on a remote symmetric signal variable.
*
* @param[in] dest Destination address on the remote PE
* @param[in] source Source address on the local PE
* @param[in] nelems Size of the transfer in bytes
* @param[in] sig_addr Address of signal variable on the remote PE
* @param[in] signal Signal value to be written
* @param[in] sig_op Signal operation (ROCSHMEM_SIGNAL_SET or
* ROCSHMEM_SIGNAL_ADD)
* @param[in] pe PE number of the remote PE
* @param[in] stream HIP stream on which to enqueue the operation
*
* @return void
*/
__host__ void rocshmem_putmem_signal_on_stream(void *dest, const void *source,
size_t nelems,
uint64_t *sig_addr,
uint64_t signal, int sig_op,
int pe, hipStream_t stream);
/**
* @brief Wait on a signal variable until it satisfies the specified condition,
* with the operation enqueued on a HIP stream.
*
* The wait is satisfied once the signal variable at \p sig_addr fulfils the
* comparison `*sig_addr <cmp> cmp_value`, where \p cmp selects the comparison
* operator. The wait operation is enqueued on the specified HIP stream.
*
* NOTE(review): confirm against the implementation whether the host caller
* itself blocks until the condition is met, or whether only work enqueued on
* \p stream after this call is held back; the declaration alone does not say.
*
* @param[in] sig_addr Address of the signal variable on the symmetric heap
* @param[in] cmp Comparison operator (e.g., ROCSHMEM_CMP_EQ,
* ROCSHMEM_CMP_GE, ROCSHMEM_CMP_NE, etc.)
* @param[in] cmp_value Value to compare against
* @param[in] stream HIP stream on which to enqueue the operation
*
* @return void
*/
__host__ void rocshmem_signal_wait_until_on_stream(uint64_t *sig_addr, int cmp,
uint64_t cmp_value,
hipStream_t stream);
/**
* @brief registers the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the shmem_barrier_all routine, shmem_sync_all only ensures
* completion and visibility of previously issued memory stores and does not
* ensure completion of remote memory updates issued via OpenSHMEM routines.
*
* @return void
*/
__host__ void rocshmem_sync_all();
/**
* @brief allows any PE to force the termination of an entire program.
*
* @param[in] status The exit status from the main program.
*
* @return void
*/
__host__ void rocshmem_global_exit(int status);
/******************************************************************************
**************************** DEVICE INTERFACE ********************************
*****************************************************************************/
/**
* @brief Initializes device-side rocSHMEM resources. Must be called before
* any threads in this work-group invoke other rocSHMEM functions.
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_init();
/**
* @brief Finalizes device-side rocSHMEM resources. Must be called before
* work-group completion if the work-group also called rocshmem_wg_init().
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_finalize();
/**
* @brief Initializes device-side rocSHMEM resources. Must be called before
* any threads in this work-group invoke other rocSHMEM functions. This is
* a variant of rocshmem_wg_init that allows the caller to request a
* threading mode.
*
* @param[in] requested Requested thread mode from rocshmem_thread_ops.
* @param[out] provided Thread mode selected by the runtime. May not be equal
* to requested thread mode.
*
* Must be called collectively by all threads in the work-group.
*
* @return void.
*/
[[deprecated]] __device__ void rocshmem_wg_init_thread(int requested, int *provided);
/**
* @brief Query the thread mode used by the runtime.
*
* @param[out] provided Thread mode the runtime is operating in.
*
* @return void.
*/
__device__ void rocshmem_query_thread(int *provided);
/**
* @brief Creates an OpenSHMEM context. By design, the context is private
* to the calling work-group.
*
* Must be called collectively by all threads in the work-group.
*
* @param[in] options Options for context creation. Ignored in current design.
* @param[out] ctx Context handle.
*
* @return All threads return 0 if the context was created successfully. If any
* thread returns a non-zero value, the operation failed and a higher value of
* `ROCSHMEM_MAX_NUM_CONTEXTS` is required.
*/
__device__ ATTR_NO_INLINE int rocshmem_wg_ctx_create(int64_t options,
rocshmem_ctx_t *ctx);
/**
* @brief Context creation that additionally takes a team handle.
*
* NOTE(review): presumably creates a work-group context associated with
* \p team and, like rocshmem_wg_ctx_create(), must be called collectively by
* all threads in the work-group -- confirm against the implementation.
*
* @param[in] team Team handle supplied for context creation.
* @param[in] options Options for context creation. Declared as `long` here,
* whereas rocshmem_wg_ctx_create() takes `int64_t`.
* @param[out] ctx Context handle.
*
* @return Integer status. NOTE(review): presumably 0 on success, as for
* rocshmem_wg_ctx_create() -- confirm.
*/
__device__ ATTR_NO_INLINE int rocshmem_wg_team_create_ctx(
rocshmem_team_t team, long options, rocshmem_ctx_t *ctx);
/**
* @brief Destroys an OpenSHMEM context.
*
* Must be called collectively by all threads in the work-group.
*
* @param[in] ctx The context to destroy.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_wg_ctx_destroy(rocshmem_ctx_t *ctx);
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* rocSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_fence();
/**
* @brief Guarantees order between messages in this context in accordance with
* OpenSHMEM semantics.
*
* This overload is an extension: it has the same semantics as the default
* fence API, but is applied per destination PE.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] pe Destination PE.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_fence(rocshmem_ctx_t ctx, int pe);
__device__ ATTR_NO_INLINE void rocshmem_fence(int pe);
/**
* @brief Completes all previous operations posted to this context.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* rocSHMEM function.
*
* @param[in] ctx Context with which to perform this operation.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_quiet(rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_quiet();
/**
* @brief Completes all previous operations posted to this context for PEs in the
* `target_pes` array.
*
* @param[in] ctx Context with which to perform this operation.
*
* @param[in] target_pes Address of target PE array where the operations need to be completed.
*
* @param[in] npes The number of PEs in the target PE array.
*
* @return void.
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_pe_quiet(rocshmem_ctx_t ctx, const int *target_pes, size_t npes);
__device__ ATTR_NO_INLINE void rocshmem_pe_quiet(const int *target_pes, size_t npes);
/**
* @brief Query the total number of PEs.
*
* Can be called per thread with no performance penalty.
*
* @param[in] ctx GPU side handle.
*
* @return Total number of PEs.
*/
__device__ int rocshmem_ctx_n_pes(rocshmem_ctx_t ctx);
__device__ int rocshmem_n_pes();
/**
* @brief Query the PE ID of the caller.
*
* Can be called per thread with no performance penalty.
*
* @param[in] ctx GPU side handle
*
* @return PE ID of the caller.
*/
__device__ int rocshmem_ctx_my_pe(rocshmem_ctx_t ctx);
__device__ int rocshmem_my_pe();
/**
* @brief Translate the PE in src_team to that in dest_team.
*
* @param[in] src_team Handle of the team from which to translate
* @param[in] src_pe PE-of-interest's index in src_team
* @param[in] dest_team Handle of the team to which to translate
*
* @return PE of src_pe in dest_team. If any input is invalid
* or if src_pe is not in both source and destination
* teams, a value of -1 is returned.
*/
__device__ int rocshmem_team_translate_pe(rocshmem_team_t src_team,
int src_pe,
rocshmem_team_t dest_team);
__device__ ATTR_NO_INLINE void rocshmem_ctx_threadfence_system(
rocshmem_ctx_t ctx);
__device__ ATTR_NO_INLINE void rocshmem_threadfence_system();
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_HPP
Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff
@@ -0,0 +1,852 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
namespace rocshmem {
/**
* @name SHMEM_ALLTOALL
* @brief Exchanges a fixed amount of contiguous data blocks between all pairs
* of PEs participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this operation
* (`rocshmem_ctx_*` variants only).
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks transferred per pair of PEs.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_alltoall_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_float_alltoall_wg(
rocshmem_team_t team, float *dest, const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_double_alltoall_wg(
rocshmem_team_t team, double *dest, const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_char_alltoall_wg(
rocshmem_team_t team, char *dest, const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_schar_alltoall_wg(
rocshmem_team_t team, signed char *dest, const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_short_alltoall_wg(
rocshmem_team_t team, short *dest, const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_int_alltoall_wg(
rocshmem_team_t team, int *dest, const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_long_alltoall_wg(
rocshmem_team_t team, long *dest, const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_longlong_alltoall_wg(
rocshmem_team_t team, long long *dest, const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_uchar_alltoall_wg(
rocshmem_team_t team, unsigned char *dest, const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ushort_alltoall_wg(
rocshmem_team_t team, unsigned short *dest, const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_uint_alltoall_wg(
rocshmem_team_t team, unsigned int *dest, const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ulong_alltoall_wg(
rocshmem_team_t team, unsigned long *dest, const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_alltoall_wg(
rocshmem_team_t team, unsigned long long *dest, const unsigned long long *source, int nelems);
/**
* @name SHMEM_BROADCAST
* @brief Perform a broadcast between PEs in the active set or team. The caller
* is blocked until the broadcast completes.
*
* The device `_wg` variant must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] team The team participating in the collective
* (team-based overloads only).
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the
* symmetric heap.
* @param[in] nelems Size of the buffer to participate in the broadcast.
* @param[in] pe_root Zero-based ordinal of the PE, with respect to the
* active set, from which the data is copied.
* @param[in] pe_start PE to start the broadcast (active-set overload
* only).
* @param[in] log_pe_stride Stride of PEs participating in the broadcast
* (active-set overload only).
* @param[in] pe_size Number of PEs participating in the broadcast
* (active-set overload only).
* @param[in] p_sync Temporary sync buffer provided to ROCSHMEM
* (active-set overload only). Must be of size at
* least ROCSHMEM_REDUCE_SYNC_SIZE.
* NOTE(review): this names the reduce sync size
* for a broadcast; confirm the intended constant.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, float *dest, const float *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_float_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, double *dest, const double *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_double_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, char *dest, const char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_char_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_schar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, short *dest, const short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_short_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, int *dest, const int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_int_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, long *dest, const long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_long_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, long long *dest, const long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_longlong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uchar_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ushort_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_uint_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems, int pe_root);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_broadcast_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source,
int nelems, int pe_root, int pe_start, int log_pe_stride,
int pe_size, long *p_sync);
__host__ void rocshmem_ctx_ulonglong_broadcast(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems, int pe_root);
/**
* @name SHMEM_FCOLLECT
* @brief Concatenates blocks of data from multiple PEs to an array in every
* PE participating in the collective routine.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nelems Number of data blocks in source array.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest,
const float *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest,
const double *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, char *dest,
const char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, signed char *dest,
const signed char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest,
const short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest,
const int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest,
const long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest,
const long long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned char *dest,
const unsigned char *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned short *dest,
const unsigned short *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned int *dest,
const unsigned int *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long *dest,
const unsigned long *source, int nelems);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_fcollect_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, unsigned long long *dest,
const unsigned long long *source, int nelems);
/**
* @name SHMEM_REDUCTIONS
* @brief Perform an allreduce between the PEs of the participating team. The
* caller is blocked until the reduction completes.
*
* The device `_wg` variants must be called as a work-group collective; each
* has a `__host__` counterpart without the `_wg` suffix and with the same
* parameter list.
*
* @param[in] ctx Context with which to perform this operation.
* @param[in] team The team participating in the collective.
* @param[in] dest Destination address. Must be an address on the
* symmetric heap.
* @param[in] source Source address. Must be an address on the symmetric
* heap.
* @param[in] nreduce Size of the buffer to participate in the reduction.
*
* @return int (Zero on successful local completion. Nonzero otherwise.)
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_short_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__host__ int rocshmem_ctx_short_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, short *dest, const short *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_int_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__host__ int rocshmem_ctx_int_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, int *dest, const int *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_long_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__host__ int rocshmem_ctx_long_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long *dest, const long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
/*
* Team reductions over `long long` elements, issued on an explicit context.
* The declarations that follow cover the sum, min, max, prod, or, and, xor
* operations. Names ending in `_wg` are __device__ functions; the unsuffixed
* names are __host__ functions. Every routine takes
* (ctx, team, dest, source, nreduce) and returns int.
*
* NOTE(review): `_wg` presumably means "call as a work-group collective", as
* documented for the other `_wg` routines in this header; `nreduce` is
* presumably the number of elements and the int result a status code --
* confirm against the implementation.
*/
__host__ int rocshmem_ctx_longlong_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_or_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_or_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_and_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_and_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_longlong_xor_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
__host__ int rocshmem_ctx_longlong_xor_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, long long *dest, const long long *source,
int nreduce);
/*
* Team reductions over `float` elements, issued on an explicit context.
* Only the arithmetic operations sum, min, max and prod are declared for this
* type (no bitwise or/and/xor). Names ending in `_wg` are __device__
* functions; the unsuffixed names are __host__ functions. Every routine takes
* (ctx, team, dest, source, nreduce) and returns int.
*
* NOTE(review): `nreduce` is presumably the number of elements and the int
* result a status code -- confirm against the implementation.
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_float_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
__host__ int rocshmem_ctx_float_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, float *dest, const float *source,
int nreduce);
/*
* Team reductions over `double` elements, issued on an explicit context.
* Only the arithmetic operations sum, min, max and prod are declared for this
* type (no bitwise or/and/xor). Names ending in `_wg` are __device__
* functions; the unsuffixed names are __host__ functions. Every routine takes
* (ctx, team, dest, source, nreduce) and returns int.
*
* NOTE(review): `nreduce` is presumably the number of elements and the int
* result a status code -- confirm against the implementation.
*/
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_sum_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_sum_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_min_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_min_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_max_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_max_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__device__ ATTR_NO_INLINE int rocshmem_ctx_double_prod_reduce_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
__host__ int rocshmem_ctx_double_prod_reduce(
rocshmem_ctx_t ctx, rocshmem_team_t team, double *dest, const double *source,
int nreduce);
/**
* @brief Kernel that performs a barrier synchronization.
* The caller enqueues the kernel on a stream of its choosing.
*
* NOTE(review): the expected launch configuration (grid/block dimensions) is
* not stated here -- confirm against the implementation before launching.
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_barrier_all_kernel();
/**
* @brief Kernel that performs an alltoall collective operation.
* The caller enqueues the kernel on a stream of its choosing.
*
* @param[in] team The team participating in the collective.
* @param[out] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] size Number of bytes to transfer per pair of PEs.
*
* NOTE(review): the expected launch configuration (grid/block dimensions) is
* not stated here -- confirm against the implementation before launching.
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_alltoallmem_kernel(rocshmem_team_t team,
void *dest,
const void *source,
size_t size);
/**
* @brief Kernel that performs a broadcast collective operation.
* The caller enqueues the kernel on a stream of its choosing.
*
* @param[in] team The team participating in the collective.
* @param[out] dest Destination address. Must be an address on the symmetric
* heap.
* @param[in] source Source address. Must be an address on the symmetric heap.
* @param[in] nelems Number of bytes to broadcast.
* @param[in] pe_root Root PE (relative to team) from which to broadcast.
*
* NOTE(review): the expected launch configuration (grid/block dimensions) is
* not stated here -- confirm against the implementation before launching.
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_broadcastmem_kernel(
rocshmem_team_t team, void *dest, const void *source, size_t nelems,
int pe_root);
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all();
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective, in contrast with
* rocshmem_barrier_all, which is invoked by a single thread.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wave();
/**
* @brief Perform a collective barrier between all PEs in the system.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective, in contrast with
* rocshmem_barrier_all, which is invoked by a single thread.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_barrier_all_wg();
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a wave-front collective.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wave(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Perform a collective barrier between all PEs in the team.
* The caller is blocked until the barrier is resolved.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx GPU side context handle.
*
* @param[in] team The team on which to perform barrier synchronization
*
* @return void
*/
__device__ void rocshmem_ctx_barrier_wg(rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the rocshmem_barrier_all routine, rocshmem_sync_all only
* ensures completion and visibility of previously issued memory stores and
* does not ensure completion of remote memory updates issued via rocSHMEM
* routines.
*
* This function must be invoked by a single thread within the PE.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all();
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the rocshmem_barrier_all routines, this routine only
* ensures completion and visibility of previously issued memory stores and
* does not ensure completion of remote memory updates issued via rocSHMEM
* routines.
*
* This function must be called as a wave-front collective.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wave();
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the rocshmem_barrier_all routines, this routine only
* ensures completion and visibility of previously issued memory stores and
* does not ensure completion of remote memory updates issued via rocSHMEM
* routines.
*
* This function must be called as a work-group collective.
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_sync_all_wg();
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the barrier routines (e.g. rocshmem_barrier_all), this
* routine only ensures completion and visibility of previously issued memory
* stores and does not ensure completion of remote memory updates issued via
* rocSHMEM routines.
*
* This function must be invoked by a single thread within the PE.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the barrier routines (e.g. rocshmem_barrier_all), this
* routine only ensures completion and visibility of previously issued memory
* stores and does not ensure completion of remote memory updates issued via
* rocSHMEM routines.
*
* This function must be called as a wave-front collective.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wave(
rocshmem_ctx_t ctx, rocshmem_team_t team);
/**
* @brief Register the arrival of a PE at a barrier.
* The caller is blocked until the synchronization is resolved.
*
* In contrast with the barrier routines (e.g. rocshmem_barrier_all), this
* routine only ensures completion and visibility of previously issued memory
* stores and does not ensure completion of remote memory updates issued via
* rocSHMEM routines.
*
* This function must be called as a work-group collective.
*
* @param[in] ctx GPU side context handle.
* @param[in] team Handle of the team being synchronized
*
* @return void
*/
__device__ ATTR_NO_INLINE void rocshmem_ctx_sync_wg(
rocshmem_ctx_t ctx, rocshmem_team_t team);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COLL_HPP
@@ -0,0 +1,710 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
namespace rocshmem {
/**
* @name SHMEM_WAIT_UNTIL
* @brief Block the caller until the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to wait for.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* NOTE(review): the _any/_all/_some variants additionally take \p nelems,
* \p status and (for _some) \p indices; their semantics are presumably those
* of the matching OpenSHMEM shmem_wait_until_* routines -- confirm against
* the implementation. The _vector variants take a scalar \p val here, whereas
* the OpenSHMEM _vector routines take an array of comparison values --
* confirm this is intended.
*
* @return void
*/
__device__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__device__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
// Host-callable declarations of the same `float` routines.
__host__ void rocshmem_float_wait_until(
float *ivars, int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_any_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ void rocshmem_float_wait_until_all_vector(
float *ivars, size_t nelems, const int* status,
int cmp, float val);
__host__ size_t rocshmem_float_wait_until_some_vector(
float *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, float val);
/*
* Wait-until routines for `double`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__device__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
// Host-callable declarations of the same `double` routines.
__host__ void rocshmem_double_wait_until(
double *ivars, int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_any_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ void rocshmem_double_wait_until_all_vector(
double *ivars, size_t nelems, const int* status,
int cmp, double val);
__host__ size_t rocshmem_double_wait_until_some_vector(
double *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, double val);
/*
* Wait-until routines for `char`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__device__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
// Host-callable declarations of the same `char` routines.
__host__ void rocshmem_char_wait_until(
char *ivars, int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_any_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ void rocshmem_char_wait_until_all_vector(
char *ivars, size_t nelems, const int* status,
int cmp, char val);
__host__ size_t rocshmem_char_wait_until_some_vector(
char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, char val);
/*
* Wait-until routines for `signed char`: wait_until plus its _any, _all,
* _some variants and their _vector counterparts. The __device__ declarations
* come first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__device__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
// Host-callable declarations of the same `signed char` routines.
__host__ void rocshmem_schar_wait_until(
signed char *ivars, int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_any_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ void rocshmem_schar_wait_until_all_vector(
signed char *ivars, size_t nelems, const int* status,
int cmp, signed char val);
__host__ size_t rocshmem_schar_wait_until_some_vector(
signed char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, signed char val);
/*
* Wait-until routines for `short`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__device__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
// Host-callable declarations of the same `short` routines.
__host__ void rocshmem_short_wait_until(
short *ivars, int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_any_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ void rocshmem_short_wait_until_all_vector(
short *ivars, size_t nelems, const int* status,
int cmp, short val);
__host__ size_t rocshmem_short_wait_until_some_vector(
short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, short val);
/*
* Wait-until routines for `int`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__device__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
// Host-callable declarations of the same `int` routines.
__host__ void rocshmem_int_wait_until(
int *ivars, int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_any_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ void rocshmem_int_wait_until_all_vector(
int *ivars, size_t nelems, const int* status,
int cmp, int val);
__host__ size_t rocshmem_int_wait_until_some_vector(
int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, int val);
/*
* Wait-until routines for `long`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__device__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
// Host-callable declarations of the same `long` routines.
__host__ void rocshmem_long_wait_until(
long *ivars, int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_any_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ void rocshmem_long_wait_until_all_vector(
long *ivars, size_t nelems, const int* status,
int cmp, long val);
__host__ size_t rocshmem_long_wait_until_some_vector(
long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long val);
/*
* Wait-until routines for `long long`: wait_until plus its _any, _all, _some
* variants and their _vector counterparts. The __device__ declarations come
* first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__device__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
// Host-callable declarations of the same `long long` routines.
__host__ void rocshmem_longlong_wait_until(
long long *ivars, int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_any_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ void rocshmem_longlong_wait_until_all_vector(
long long *ivars, size_t nelems, const int* status,
int cmp, long long val);
__host__ size_t rocshmem_longlong_wait_until_some_vector(
long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, long long val);
/*
* Wait-until routines for `unsigned char`: wait_until plus its _any, _all,
* _some variants and their _vector counterparts. The __device__ declarations
* come first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__device__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
// Host-callable declarations of the same `unsigned char` routines.
__host__ void rocshmem_uchar_wait_until(
unsigned char *ivars, int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_any_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ void rocshmem_uchar_wait_until_all_vector(
unsigned char *ivars, size_t nelems, const int* status,
int cmp, unsigned char val);
__host__ size_t rocshmem_uchar_wait_until_some_vector(
unsigned char *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned char val);
/*
* Wait-until routines for `unsigned short`: wait_until plus its _any, _all,
* _some variants and their _vector counterparts. The __device__ declarations
* come first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__device__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
// Host-callable declarations of the same `unsigned short` routines.
__host__ void rocshmem_ushort_wait_until(
unsigned short *ivars, int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_any_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ void rocshmem_ushort_wait_until_all_vector(
unsigned short *ivars, size_t nelems, const int* status,
int cmp, unsigned short val);
__host__ size_t rocshmem_ushort_wait_until_some_vector(
unsigned short *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned short val);
/*
* Wait-until routines for `unsigned int`: wait_until plus its _any, _all,
* _some variants and their _vector counterparts. The __device__ declarations
* come first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__device__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
// Host-callable declarations of the same `unsigned int` routines.
__host__ void rocshmem_uint_wait_until(
unsigned int *ivars, int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_any_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ void rocshmem_uint_wait_until_all_vector(
unsigned int *ivars, size_t nelems, const int* status,
int cmp, unsigned int val);
__host__ size_t rocshmem_uint_wait_until_some_vector(
unsigned int *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned int val);
/*
* Wait-until routines for `unsigned long`: wait_until plus its _any, _all,
* _some variants and their _vector counterparts. The __device__ declarations
* come first, followed by __host__ declarations with the same signatures.
*
* NOTE(review): semantics of nelems/status/indices are presumably those of
* the matching OpenSHMEM shmem_wait_until_* routines -- confirm against the
* implementation.
*/
__device__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__device__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
// Host-callable declarations of the same `unsigned long` routines.
__host__ void rocshmem_ulong_wait_until(
unsigned long *ivars, int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_any_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ void rocshmem_ulong_wait_until_all_vector(
unsigned long *ivars, size_t nelems, const int* status,
int cmp, unsigned long val);
__host__ size_t rocshmem_ulong_wait_until_some_vector(
unsigned long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long val);
__device__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__device__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_any_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ void rocshmem_ulonglong_wait_until_all_vector(
unsigned long long *ivars, size_t nelems, const int* status,
int cmp, unsigned long long val);
__host__ size_t rocshmem_ulonglong_wait_until_some_vector(
unsigned long long *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, unsigned long long val);
__device__ void rocshmem_uint64_wait_until(
uint64_t *ivars, int cmp, uint64_t val);
__device__ size_t rocshmem_uint64_wait_until_any(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__device__ void rocshmem_uint64_wait_until_all(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__device__ size_t rocshmem_uint64_wait_until_some(
uint64_t *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, uint64_t val);
__device__ size_t rocshmem_uint64_wait_until_any_vector(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__device__ void rocshmem_uint64_wait_until_all_vector(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__device__ size_t rocshmem_uint64_wait_until_some_vector(
uint64_t *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, uint64_t val);
__host__ void rocshmem_uint64_wait_until(
uint64_t *ivars, int cmp, uint64_t val);
__host__ size_t rocshmem_uint64_wait_until_any(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__host__ void rocshmem_uint64_wait_until_all(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__host__ size_t rocshmem_uint64_wait_until_some(
uint64_t *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, uint64_t val);
__host__ size_t rocshmem_uint64_wait_until_any_vector(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__host__ void rocshmem_uint64_wait_until_all_vector(
uint64_t *ivars, size_t nelems, const int* status,
int cmp, uint64_t val);
__host__ size_t rocshmem_uint64_wait_until_some_vector(
uint64_t *ivars, size_t nelems, size_t* indices, const int* status,
int cmp, uint64_t val);
/**
* @name SHMEM_TEST
* @brief Test whether the condition (* \p ivars \p cmp \p val) is
* true.
*
* This function can be called from divergent control paths at per-thread
* granularity. However, performance may be improved if the caller can
* coalesce contiguous messages and elect a leader thread to call into the
* ROCSHMEM function.
*
* @param[in] ivars Pointer to memory on the symmetric heap to test.
* @param[in] cmp Operation for the comparison.
* @param[in] val Value to compare the memory at \p ivars to.
*
* @return 1 if the evaluation is true else 0
*/
__device__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__host__ int rocshmem_float_test(
float *ivars, int cmp, float val);
__device__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__host__ int rocshmem_double_test(
double *ivars, int cmp, double val);
__device__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__host__ int rocshmem_char_test(
char *ivars, int cmp, char val);
__device__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__host__ int rocshmem_schar_test(
signed char *ivars, int cmp, signed char val);
__device__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__host__ int rocshmem_short_test(
short *ivars, int cmp, short val);
__device__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__host__ int rocshmem_int_test(
int *ivars, int cmp, int val);
__device__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__host__ int rocshmem_long_test(
long *ivars, int cmp, long val);
__device__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__host__ int rocshmem_longlong_test(
long long *ivars, int cmp, long long val);
__device__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__host__ int rocshmem_uchar_test(
unsigned char *ivars, int cmp, unsigned char val);
__device__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__host__ int rocshmem_ushort_test(
unsigned short *ivars, int cmp, unsigned short val);
__device__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__host__ int rocshmem_uint_test(
unsigned int *ivars, int cmp, unsigned int val);
__device__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__host__ int rocshmem_ulong_test(
unsigned long *ivars, int cmp, unsigned long val);
__device__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
__host__ int rocshmem_ulonglong_test(
unsigned long long *ivars, int cmp, unsigned long long val);
__device__ int rocshmem_uint64_test(
uint64_t *ivars, int cmp, uint64_t val);
__host__ int rocshmem_uint64_test(
uint64_t *ivars, int cmp, uint64_t val);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_P2P_SYNC_HPP
Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff
Filskillnaden har hållits tillbaka eftersom den är för stor Load Diff
@@ -0,0 +1,654 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
namespace rocshmem {
/**
 * @name SHMEM_PUT_SIGNAL
 * @brief Device-side put-with-signal entry points.
 *
 * The two untyped (void *) forms are declared first: one without a context
 * argument and one (`ctx`) that takes an explicit rocshmem_ctx_t as its first
 * parameter. The typed rocshmem_[ctx_]TYPENAME_put_signal declarations that
 * follow use the same parameter list with typed \p dest / \p source pointers.
 *
 * NOTE(review): only the signatures are visible here. The parameter meanings
 * below are inferred from the names and from the OpenSHMEM shmem_put_signal
 * routine they mirror - confirm against the implementation.
 *
 * @param[in] ctx      Context handle (ctx variants only).
 * @param[in] dest     Destination buffer; presumably a symmetric address on
 *                     \p pe.
 * @param[in] source   Source buffer (const, not modified by the call).
 * @param[in] nelems   Amount of data to transfer; presumably bytes for the
 *                     untyped putmem forms and elements for the typed forms -
 *                     TODO confirm.
 * @param[in] sig_addr Address of the 64-bit signal word.
 * @param[in] signal   64-bit value used to update the signal word.
 * @param[in] sig_op   Selects how \p signal is applied at \p sig_addr -
 *                     TODO confirm the accepted values.
 * @param[in] pe       Presumably the target PE - TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @name SHMEM_PUT_SIGNAL_WG
 * @brief Device-side put-with-signal entry points with the `_wg` suffix.
 *
 * The untyped (void *) forms are declared first, with and without an explicit
 * rocshmem_ctx_t; the typed rocshmem_[ctx_]TYPENAME_put_signal_wg declarations
 * that follow use the same parameter list with typed pointers.
 *
 * NOTE(review): `_wg` presumably marks a work-group-granularity variant that
 * every thread of the work-group must call with identical arguments. This is
 * inferred from the naming convention only - confirm against the
 * implementation.
 *
 * @param[in] ctx      Context handle (ctx variants only).
 * @param[in] dest     Destination buffer; presumably a symmetric address on
 *                     \p pe.
 * @param[in] source   Source buffer (const, not modified by the call).
 * @param[in] nelems   Amount of data to transfer; presumably bytes for the
 *                     untyped putmem forms - TODO confirm.
 * @param[in] sig_addr Address of the 64-bit signal word.
 * @param[in] signal   64-bit value used to update the signal word.
 * @param[in] sig_op   Selects how \p signal is applied at \p sig_addr -
 *                     TODO confirm the accepted values.
 * @param[in] pe       Presumably the target PE - TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @name SHMEM_PUT_SIGNAL_WAVE
 * @brief Device-side put-with-signal entry points with the `_wave` suffix.
 *
 * The untyped (void *) forms are declared first, with and without an explicit
 * rocshmem_ctx_t; the typed rocshmem_[ctx_]TYPENAME_put_signal_wave
 * declarations that follow use the same parameter list with typed pointers.
 *
 * NOTE(review): `_wave` presumably marks a wavefront-granularity variant that
 * every thread of the wavefront must call with identical arguments. This is
 * inferred from the naming convention only - confirm against the
 * implementation.
 *
 * @param[in] ctx      Context handle (ctx variants only).
 * @param[in] dest     Destination buffer; presumably a symmetric address on
 *                     \p pe.
 * @param[in] source   Source buffer (const, not modified by the call).
 * @param[in] nelems   Amount of data to transfer; presumably bytes for the
 *                     untyped putmem forms - TODO confirm.
 * @param[in] sig_addr Address of the 64-bit signal word.
 * @param[in] signal   64-bit value used to update the signal word.
 * @param[in] sig_op   Selects how \p signal is applied at \p sig_addr -
 *                     TODO confirm the accepted values.
 * @param[in] pe       Presumably the target PE - TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @name SHMEM_PUT_SIGNAL_NBI
 * @brief Device-side put-with-signal entry points with the `_nbi` suffix.
 *
 * The untyped (void *) forms are declared first, with and without an explicit
 * rocshmem_ctx_t; the typed rocshmem_[ctx_]TYPENAME_put_signal_nbi
 * declarations that follow use the same parameter list with typed pointers.
 *
 * NOTE(review): `_nbi` presumably marks the non-blocking variant, which may
 * return before the transfer and signal update have completed. This is
 * inferred from the naming convention only - confirm against the
 * implementation, including which call completes the operation.
 *
 * @param[in] ctx      Context handle (ctx variants only).
 * @param[in] dest     Destination buffer; presumably a symmetric address on
 *                     \p pe.
 * @param[in] source   Source buffer (const, not modified by the call).
 * @param[in] nelems   Amount of data to transfer; presumably bytes for the
 *                     untyped putmem forms - TODO confirm.
 * @param[in] sig_addr Address of the 64-bit signal word.
 * @param[in] signal   64-bit value used to update the signal word.
 * @param[in] sig_op   Selects how \p signal is applied at \p sig_addr -
 *                     TODO confirm the accepted values.
 * @param[in] pe       Presumably the target PE - TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @name SHMEM_PUT_SIGNAL_NBI_WG
 * @brief Device-side put-with-signal entry points with the `_nbi_wg` suffix.
 *
 * The untyped (void *) forms are declared first, with and without an explicit
 * rocshmem_ctx_t; the typed rocshmem_[ctx_]TYPENAME_put_signal_nbi_wg
 * declarations that follow use the same parameter list with typed pointers.
 *
 * NOTE(review): `_nbi_wg` presumably combines the non-blocking behaviour with
 * work-group granularity (all threads of the work-group call with identical
 * arguments). This is inferred from the naming convention only - confirm
 * against the implementation.
 *
 * @param[in] ctx      Context handle (ctx variants only).
 * @param[in] dest     Destination buffer; presumably a symmetric address on
 *                     \p pe.
 * @param[in] source   Source buffer (const, not modified by the call).
 * @param[in] nelems   Amount of data to transfer; presumably bytes for the
 *                     untyped putmem forms - TODO confirm.
 * @param[in] sig_addr Address of the 64-bit signal word.
 * @param[in] signal   64-bit value used to update the signal word.
 * @param[in] sig_op   Selects how \p signal is applied at \p sig_addr -
 *                     TODO confirm the accepted values.
 * @param[in] pe       Presumably the target PE - TODO confirm.
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wg(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wg(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wg(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wg(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wg(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wg(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wg(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wg(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wg(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wg(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wg(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wg(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wg(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wg(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wg(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wg(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wg(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wg(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wg(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wg(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wg(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief Untyped (`void *`) put-with-signal entry points carrying the
 * `_nbi_wave` suffix, without and with an explicit context argument.
 *
 * NOTE(review): the parameter descriptions below are borrowed from
 * rocshmem_putmem_signal_kernel, documented later in this header, whose
 * parameter list is identical -- confirm that they apply here unchanged.
 * `_nbi_wave` presumably marks the non-blocking variant that is called
 * collectively by a wavefront -- confirm against the implementation.
 * The typed `rocshmem_[ctx_]<TYPE>_put_signal_nbi_wave` declarations that
 * follow use the same parameter list with typed dest/source pointers.
 *
 * @param[in] ctx      Context handle (ctx variant only).
 * @param[in] dest     Destination address on remote PE
 * @param[in] source   Source address on local PE
 * @param[in] nelems   Size of the transfer in bytes
 * @param[in] sig_addr Address of signal variable on remote PE
 * @param[in] signal   Signal value to write
 * @param[in] sig_op   Signal operation (ROCSHMEM_SIGNAL_SET or
 *                     ROCSHMEM_SIGNAL_ADD)
 * @param[in] pe       PE of the remote process
 *
 * @return void
 */
__device__ ATTR_NO_INLINE void rocshmem_putmem_signal_nbi_wave(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_putmem_signal_nbi_wave(
rocshmem_ctx_t ctx, void *dest, const void *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_float_put_signal_nbi_wave(
rocshmem_ctx_t ctx, float *dest, const float *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_float_put_signal_nbi_wave(
float *dest, const float *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_double_put_signal_nbi_wave(
rocshmem_ctx_t ctx, double *dest, const double *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_double_put_signal_nbi_wave(
double *dest, const double *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_char_put_signal_nbi_wave(
rocshmem_ctx_t ctx, char *dest, const char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_char_put_signal_nbi_wave(
char *dest, const char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_schar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, signed char *dest, const signed char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_schar_put_signal_nbi_wave(
signed char *dest, const signed char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_short_put_signal_nbi_wave(
rocshmem_ctx_t ctx, short *dest, const short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_short_put_signal_nbi_wave(
short *dest, const short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_int_put_signal_nbi_wave(
rocshmem_ctx_t ctx, int *dest, const int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_int_put_signal_nbi_wave(
int *dest, const int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_long_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long *dest, const long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_long_put_signal_nbi_wave(
long *dest, const long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_longlong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, long long *dest, const long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_longlong_put_signal_nbi_wave(
long long *dest, const long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uchar_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned char *dest, const unsigned char *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uchar_put_signal_nbi_wave(
unsigned char *dest, const unsigned char *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ushort_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned short *dest, const unsigned short *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ushort_put_signal_nbi_wave(
unsigned short *dest, const unsigned short *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_uint_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned int *dest, const unsigned int *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_uint_put_signal_nbi_wave(
unsigned int *dest, const unsigned int *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long *dest, const unsigned long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulong_put_signal_nbi_wave(
unsigned long *dest, const unsigned long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ctx_ulonglong_put_signal_nbi_wave(
rocshmem_ctx_t ctx, unsigned long long *dest, const unsigned long long *source, size_t nelems,
uint64_t *sig_addr, uint64_t signal, int sig_op, int pe);
__device__ ATTR_NO_INLINE void rocshmem_ulonglong_put_signal_nbi_wave(
unsigned long long *dest, const unsigned long long *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
 * @brief Device-side signal fetch functions.
 *
 * Each takes the address of a signal variable and returns a uint64_t.
 * NOTE(review): presumably the returned value is the current content of the
 * signal variable, and the `_wg` / `_wave` forms are the work-group and
 * wavefront variants -- confirm against the implementation.
 *
 * @param[in] sig_addr Address of the signal variable.
 *
 * @return uint64_t value produced by the fetch.
 */
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wg(const uint64_t *sig_addr);
__device__ ATTR_NO_INLINE uint64_t rocshmem_signal_fetch_wave(const uint64_t *sig_addr);
/**
* @brief Kernel wrapper for putmem_signal operation on stream
*
* @param[in] dest Destination address on remote PE
* @param[in] source Source address on local PE
* @param[in] nelems Size of the transfer in bytes
* @param[in] sig_addr Address of signal variable on remote PE
* @param[in] signal Signal value to write
* @param[in] sig_op Signal operation (ROCSHMEM_SIGNAL_SET or
* ROCSHMEM_SIGNAL_ADD)
* @param[in] pe PE of the remote process
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_putmem_signal_kernel(
void *dest, const void *source, size_t nelems, uint64_t *sig_addr,
uint64_t signal, int sig_op, int pe);
/**
* @brief Kernel wrapper for signal_wait_until operation on stream
*
* @param[in] sig_addr Address of signal variable on the symmetric heap
* @param[in] cmp Comparison operator
* @param[in] cmp_value Value to compare against
*
* @return void
*/
__global__ ATTR_NO_INLINE void rocshmem_signal_wait_until_kernel(
uint64_t *sig_addr, int cmp, uint64_t cmp_value);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_SIG_OP_HPP
@@ -0,0 +1,179 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
namespace rocshmem {
/**
 * @brief Inlining control applied to the rocSHMEM API declarations.
 *
 * When USE_FUNC_CALL is defined, ATTR_NO_INLINE expands to
 * __attribute__((noinline)); otherwise it expands to nothing.
 */
#ifdef USE_FUNC_CALL
#define ATTR_NO_INLINE __attribute__((noinline))
#else
#define ATTR_NO_INLINE
#endif
/**
 * @brief Status codes: ROCSHMEM_SUCCESS is 0 and ROCSHMEM_ERROR is 1.
 */
enum ROCSHMEM_STATUS {
ROCSHMEM_SUCCESS = 0,
ROCSHMEM_ERROR = 1,
};
/**
 * @brief Operation selectors.
 *
 * Enumerator values are implicit and start at 0 (ROCSHMEM_SUM).
 * NOTE(review): presumably these select the operator applied by reduction
 * and atomic operations -- confirm at the call sites.
 */
enum ROCSHMEM_OP {
ROCSHMEM_SUM,
ROCSHMEM_MAX,
ROCSHMEM_MIN,
ROCSHMEM_PROD,
ROCSHMEM_AND,
ROCSHMEM_OR,
ROCSHMEM_XOR,
ROCSHMEM_REPLACE
};
/**
 * @brief Signal operations (ROCSHMEM_SIGNAL_SET is 0, ROCSHMEM_SIGNAL_ADD
 * is 1).
 *
 * NOTE(review): presumably these are the values accepted by the `sig_op`
 * parameter of the put-with-signal functions -- confirm.
 */
enum ROCSHMEM_SIGNAL_OPS {
ROCSHMEM_SIGNAL_SET,
ROCSHMEM_SIGNAL_ADD,
};
/**
 * @brief Comparison selectors for rocshmem_wait() operations.
 *
 * Enumerator values are implicit and start at 0 (ROCSHMEM_CMP_EQ).
 * NOTE(review): the suffixes presumably stand for ==, !=, >, >=, < and <=
 * -- confirm against the wait implementation.
 */
enum rocshmem_cmps {
ROCSHMEM_CMP_EQ,
ROCSHMEM_CMP_NE,
ROCSHMEM_CMP_GT,
ROCSHMEM_CMP_GE,
ROCSHMEM_CMP_LT,
ROCSHMEM_CMP_LE,
};
/**
 * @brief Thread-level selectors.
 *
 * Enumerator values are implicit and run from 0 (ROCSHMEM_THREAD_SINGLE) to
 * 4 (ROCSHMEM_THREAD_MULTIPLE).
 * NOTE(review): presumably the threading-support levels requested or
 * reported at initialization -- confirm where these are consumed.
 */
enum rocshmem_thread_ops {
ROCSHMEM_THREAD_SINGLE,
ROCSHMEM_THREAD_FUNNELED,
ROCSHMEM_THREAD_WG_FUNNELED,
ROCSHMEM_THREAD_SERIALIZED,
ROCSHMEM_THREAD_MULTIPLE
};
/**
 * @brief Bitwise flags to mask configuration parameters.
 *
 * Enumerator values are implicit: ROCSHMEM_TEAM_DEFAULT_CONFIGS is 0 and
 * ROCSHMEM_TEAM_NUM_CONTEXTS is 1.
 */
enum rocshmem_team_configs {
ROCSHMEM_TEAM_DEFAULT_CONFIGS,
ROCSHMEM_TEAM_NUM_CONTEXTS
};
/**
 * @brief Team configuration parameters.
 *
 * NOTE(review): num_contexts is presumably the field selected by the
 * ROCSHMEM_TEAM_NUM_CONTEXTS mask -- confirm.
 */
typedef struct {
int num_contexts;
} rocshmem_team_config_t;
// Size constants for the work and synchronization arrays of the collectives.
// NOTE(review): the unit (presumably number of array elements) is not stated
// in this header -- confirm.
constexpr size_t ROCSHMEM_REDUCE_MIN_WRKDATA_SIZE = 1024;
constexpr size_t ROCSHMEM_BARRIER_SYNC_SIZE = 256;
constexpr size_t ROCSHMEM_REDUCE_SYNC_SIZE = 256;
// Internally calls sync function, which matches barrier implementation
constexpr size_t ROCSHMEM_BCAST_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE;
// One more than the barrier sync size.
constexpr size_t ROCSHMEM_ALLTOALL_SYNC_SIZE = ROCSHMEM_BARRIER_SYNC_SIZE + 1;
// Same size as the alltoall sync array.
constexpr size_t ROCSHMEM_FCOLLECT_SYNC_SIZE = ROCSHMEM_ALLTOALL_SYNC_SIZE;
// NOTE(review): presumably the value sync arrays are initialized to -- confirm.
constexpr size_t ROCSHMEM_SYNC_VALUE = 0;
// Context option values. ROCSHMEM_CTX_ZERO is 0; every other option is a
// distinct single bit (1, 2, 4, 8).
constexpr int ROCSHMEM_CTX_ZERO = 0;
constexpr int ROCSHMEM_CTX_NOSTORE = 1 << 0;
constexpr int ROCSHMEM_CTX_SERIALIZED = 1 << 1;
constexpr int ROCSHMEM_CTX_WG_PRIVATE = 1 << 2;
constexpr int ROCSHMEM_CTX_SHARED = 1 << 3;
/**
 * @brief GPU side OpenSHMEM context created from each work-group's
 * rocshmem_wg_handle_t
 *
 * Holds two opaque pointers. Two contexts compare equal exactly when both
 * pointers are equal.
 */
typedef struct rocshmem_ctx {
  void *ctx_opaque;
  void *team_opaque;

  /** @return true when both opaque pointers match those of @p other. */
  __host__ __device__ bool operator==(const struct rocshmem_ctx& other) const {
    if (ctx_opaque != other.ctx_opaque) {
      return false;
    }
    return team_opaque == other.team_opaque;
  }

  /** @return true when at least one opaque pointer differs from @p other. */
  __host__ __device__ bool operator!=(const struct rocshmem_ctx& other) const {
    return ctx_opaque != other.ctx_opaque ||
           team_opaque != other.team_opaque;
  }
} rocshmem_ctx_t;
/**
* Shmem default context.
*/
extern "C" __device__ rocshmem_ctx_t __attribute__((visibility("default"))) ROCSHMEM_CTX_DEFAULT;
/**
* A value corresponding to an invalid communication context. This value can be
* used to initialize or update context handles to indicate that they do not
* reference a valid context. When managed in this way, applications can use an
* equality comparison to test whether a given context handle references a
* valid context.
*/
extern __constant__ rocshmem_ctx_t ROCSHMEM_CTX_INVALID;
/**
* Used internally to set default context.
*/
void set_internal_ctx(rocshmem_ctx_t *ctx);
/**
 * @brief Identifies a rocSHMEM backend (GDA, RO or IPC).
 *
 * Used internally to query the loaded backend through get_backend_type().
 */
// TODO: decide whether this should remain internal instead of being declared
// in this header.
enum class BackendType { GDA_BACKEND, RO_BACKEND, IPC_BACKEND };
BackendType get_backend_type();
/**
 * @brief Team handle, represented as a pointer to uint64_t.
 *
 * ROCSHMEM_TEAM_WORLD is only declared here (defined elsewhere);
 * ROCSHMEM_TEAM_INVALID is the null handle.
 */
typedef uint64_t *rocshmem_team_t;
extern rocshmem_team_t ROCSHMEM_TEAM_WORLD;
const rocshmem_team_t ROCSHMEM_TEAM_INVALID = nullptr;
/**
 * @brief Data structure defining the unique ID
 *
 * Unique ID for a process: a byte array of ROCSHMEM_UNIQUE_ID_BYTES (128)
 * bytes that uniquely identifies a process.
 *
 * NOTE(review): this header contains no #include for <array> or <cstdint>;
 * std::array and the fixed-width integer types are presumably provided by
 * the including header -- confirm, or include them before the namespace.
 */
#define ROCSHMEM_UNIQUE_ID_BYTES 128
using rocshmem_uniqueid_t = std::array<uint8_t, ROCSHMEM_UNIQUE_ID_BYTES>;
/**
 * @brief Data structure used for attribute based
 * initialization
 *
 * NOTE(review): the per-field notes are inferred from the field names and
 * types only -- confirm against the initialization code.
 */
struct rocshmem_init_attr_t {
int32_t rank;             // presumably the rank of the calling process
int32_t nranks;           // presumably the total number of ranks
rocshmem_uniqueid_t uid;  // unique ID byte array
void* mpi_comm;           // presumably points to an MPI communicator handle
};
// Redundant in C++, but keeps the `struct`-less spelling explicit.
typedef struct rocshmem_init_attr_t rocshmem_init_attr_t;
// Initialization-mode selectors (0 and 1).
// NOTE(review): presumably choose whether attribute based initialization
// uses rocshmem_init_attr_t::mpi_comm or rocshmem_init_attr_t::uid -- confirm.
constexpr unsigned int ROCSHMEM_INIT_WITH_MPI_COMM = 0;
constexpr unsigned int ROCSHMEM_INIT_WITH_UNIQUEID = 1;
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_ROCSHMEM_COMMON_HPP
@@ -0,0 +1,36 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_DEBUG_HPP
#define LIBRARY_INCLUDE_DEBUG_HPP
namespace rocshmem {
void debug_print_cq(int dest_pe, int src_wg, int cqe_index);
void debug_print_sq(int dest_pe, int src_wg, int index_wqe);
} // namespace rocshmem
#endif // LIBRARY_INCLUDE_DEBUG_HPP
@@ -0,0 +1,145 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#define LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
#if defined(HAVE_EXTERNAL_MPI)
#include <mpi.h>
#endif
#if defined(c_plusplus) || defined(__cplusplus)
extern "C" {
#endif
#if !defined(MPI_VERSION)
// Fallback declarations, compiled only when MPI_VERSION is not defined: even
// if this header did not include an external MPI header file, the includer
// may have (e.g., a unit test), in which case everything below is skipped.
// The constants and handles use Open MPI based values.
// Every handle type is an opaque pointer.
typedef void* MPI_Comm;
typedef void* MPI_Win;
typedef void* MPI_Group;
typedef void* MPI_Op;
typedef void* MPI_Datatype;
typedef void* MPI_Request;
typedef void* MPI_Info;
// Fallback definition of MPI_Status (Open MPI based, like the other fallback
// declarations in this header).
// NOTE(review): size_t is used here although this header includes nothing but
// <mpi.h> (and only when HAVE_EXTERNAL_MPI is defined); it is presumably
// provided by the includer -- confirm.
struct ompi_status_public_t {
int MPI_SOURCE;
int MPI_TAG;
int MPI_ERROR;
int _cancelled;
size_t _ucount;
};
typedef struct ompi_status_public_t MPI_Status;
// Fallback MPI constants (Open MPI based values). Replacement lists that are
// not a single token are fully parenthesized so that each macro behaves as
// one operand wherever it is expanded (e.g. `sizeof MPI_IN_PLACE`).
// NOTE(review): MPI_Aint is an unsigned 64-bit integer here, so
// MPI_Aint_diff() wraps around when addr1 < addr2 -- confirm callers never
// rely on a negative difference.
#define MPI_Aint uint64_t
#define MPI_UNDEFINED (-32766)
#define MPI_THREAD_MULTIPLE 3
#define MPI_SUCCESS 0
#define MPI_IN_PLACE ((void*)1)
#define MPI_MODE_NOCHECK 1
#define MPI_COMM_TYPE_SHARED 0
#define MPI_ANY_SOURCE (-1)
#define MPI_STATUSES_IGNORE (static_cast<MPI_Status*>(0))
// Byte distance from addr2 to addr1, converted to MPI_Aint.
#define MPI_Aint_diff(addr1, addr2) ((MPI_Aint) ((char *) (addr1) - (char *) (addr2)))
// Table of opaque pointers, one per predefined Open MPI object referenced by
// the MPI_* handle macros defined after this struct (communicators, null
// handles, reduction operations and datatypes). The single instance,
// ompi_symbols_, is only declared here.
// NOTE(review): presumably filled in at runtime once the MPI library has been
// loaded -- confirm where ompi_symbols_ is defined and populated.
struct ompi_internal_symbols_t {
void *ompi_mpi_comm_world;
void *ompi_mpi_comm_null;
void *ompi_request_null;
void *ompi_mpi_info_null;
void *ompi_mpi_datatype_null;
void *ompi_mpi_op_max;
void *ompi_mpi_op_min;
void *ompi_mpi_op_sum;
void *ompi_mpi_op_prod;
void *ompi_mpi_op_band;
void *ompi_mpi_op_bor;
void *ompi_mpi_op_bxor;
void *ompi_mpi_op_replace;
void *ompi_mpi_op_no_op;
void *ompi_mpi_char;
void *ompi_mpi_unsigned_char;
void *ompi_mpi_signed_char;
void *ompi_mpi_short;
void *ompi_mpi_unsigned_short;
void *ompi_mpi_int;
void *ompi_mpi_unsigned;
void *ompi_mpi_long;
void *ompi_mpi_unsigned_long;
void *ompi_mpi_long_long_int;
void *ompi_mpi_unsigned_long_long;
void *ompi_mpi_float;
void *ompi_mpi_double;
void *ompi_mpi_long_double;
};
extern struct ompi_internal_symbols_t ompi_symbols_;
// Casts an entry of ompi_symbols_ to the requested handle type.
#define OMPI_PREDEFINED_GLOBAL(type, global) (static_cast<type> (global))
// Predefined communicator, request, window and info handles.
#define MPI_COMM_WORLD OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_world)
#define MPI_COMM_NULL OMPI_PREDEFINED_GLOBAL(MPI_Comm, ompi_symbols_.ompi_mpi_comm_null)
#define MPI_REQUEST_NULL OMPI_PREDEFINED_GLOBAL(MPI_Request, ompi_symbols_.ompi_request_null)
// NOTE(review): struct ompi_internal_symbols_t, as declared in this header,
// has no ompi_mpi_win_null member, so any use of MPI_WIN_NULL fails to
// compile. Either add that member (and populate it wherever ompi_symbols_ is
// filled in) or drop this macro.
#define MPI_WIN_NULL OMPI_PREDEFINED_GLOBAL(MPI_Win, ompi_symbols_.ompi_mpi_win_null)
#define MPI_INFO_NULL OMPI_PREDEFINED_GLOBAL(MPI_Info, ompi_symbols_.ompi_mpi_info_null)
#define MPI_MAX OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_max)
#define MPI_MIN OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_min)
#define MPI_SUM OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_sum)
#define MPI_PROD OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_prod)
#define MPI_BAND OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_band)
#define MPI_BOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bor)
#define MPI_BXOR OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_bxor)
#define MPI_REPLACE OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_replace)
#define MPI_NO_OP OMPI_PREDEFINED_GLOBAL(MPI_Op, ompi_symbols_.ompi_mpi_op_no_op)
#define MPI_DATATYPE_NULL OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_datatype_null)
#define MPI_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_char)
#define MPI_UNSIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_char)
#define MPI_SIGNED_CHAR OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_signed_char)
#define MPI_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_short)
#define MPI_UNSIGNED_SHORT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_short)
#define MPI_INT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_int)
#define MPI_UNSIGNED OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned)
#define MPI_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long)
#define MPI_UNSIGNED_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long)
#define MPI_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_long_int)
#define MPI_UNSIGNED_LONG_LONG OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_unsigned_long_long)
#define MPI_FLOAT OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_float)
#define MPI_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_double)
#define MPI_LONG_DOUBLE OMPI_PREDEFINED_GLOBAL(MPI_Datatype, ompi_symbols_.ompi_mpi_long_double)
#endif //!defined(MPI_VERSION)
#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
#endif //LIBRARY_INCLUDE_ROCSHMEM_MPI_HPP
+52
Visa fil
@@ -0,0 +1,52 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
set -e

# Root of the source tree: two directories above the one holding this script.
src_path="$(dirname "$(realpath "$0")")/../../"

# Configure with the GDA (MLX5, BNXT, IONIC), RO and IPC backends enabled.
# CODE_COV, BUILD_TYPE and INSTALL_PREFIX can be overridden from the
# environment. Extra command-line arguments are forwarded one-to-one ("$@")
# after the defaults, so an argument that contains spaces or glob characters
# reaches cmake unchanged instead of being split or expanded.
# $HOME replaces '~' in the default prefix because tilde expansion does not
# happen inside double quotes.
cmake \
    -DBUILD_CODE_COVERAGE="${CODE_COV:-OFF}" \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX:-$HOME/rocshmem}" \
    -DCMAKE_VERBOSE_MAKEFILE=OFF \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_FUNCTIONAL_TESTS=ON \
    -DBUILD_UNIT_TESTS=ON \
    -DDEBUG=OFF \
    -DPROFILE=OFF \
    -DUSE_GDA=ON \
    -DGDA_MLX5=ON \
    -DGDA_BNXT=ON \
    -DGDA_IONIC=ON \
    -DUSE_RO=ON \
    -DUSE_IPC=ON \
    -DUSE_THREADS=OFF \
    -DUSE_WF_COAL=OFF \
    -DUSE_HDP_FLUSH=OFF \
    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
    "$@" "$src_path"

# Build in the current directory with 8 parallel jobs, then install.
cmake --build . --parallel 8
cmake --install .
+71
Visa fil
@@ -0,0 +1,71 @@
#!/bin/bash
set -e
export CODE_COV="ON"

# Directory holding this script and the build scripts it invokes.
src_path="$(dirname "$(realpath "$0")")"

# LLVM coverage tools shipped with ROCm. ROCM_PATH overrides the default
# /opt/rocm installation root.
llvm_bin="${ROCM_PATH:-/opt/rocm}/llvm/bin"

# Builds one configuration inside its own directory.
#   $1: build directory (created if missing)
#   $2: name of the build script located in $src_path
# Every step is a separate command so that `set -e` stops the script on the
# first failure; a failing `mkdir` in a `mkdir ... && pushd ...` list would
# not, and the build would then run in the wrong directory.
build_config() {
    mkdir -p "$1"
    pushd "$1"
    "$src_path/$2"
    popd
}

build_config ipc ipc_single
build_config ro_net ro_net
build_config ro_ipc ro_ipc

mkdir -p ./test_output
export PROFRAW_DIR=./coverage-report/profraw
mkdir -p "$PROFRAW_DIR"

# Unit Tests
LLVM_PROFILE_FILE="$PROFRAW_DIR/ipc_unit-%p.profraw" ../scripts/unit_tests/driver.sh ./ipc/tests/unit_tests/rocshmem_unit_tests all
LLVM_PROFILE_FILE="$PROFRAW_DIR/ro_net-unit-%p.profraw" ../scripts/unit_tests/driver.sh ./ro_net/tests/unit_tests/rocshmem_unit_tests all
LLVM_PROFILE_FILE="$PROFRAW_DIR/ro_ipc-unit-%p.profraw" ../scripts/unit_tests/driver.sh ./ro_ipc/tests/unit_tests/rocshmem_unit_tests all

# Examples coverage
LLVM_PROFILE_FILE="$PROFRAW_DIR/init_test-%m.profraw" mpirun -np 4 ./ipc/examples/rocshmem_init_attr_test
LLVM_PROFILE_FILE="$PROFRAW_DIR/allreduce_test-%m.profraw" mpirun -np 4 ./ipc/examples/rocshmem_allreduce_test

# Functional Tests
LLVM_PROFILE_FILE="$PROFRAW_DIR/ipc-functional-%p.profraw" ../scripts/functional_tests/driver.sh ./ipc/tests/functional_tests/rocshmem_functional_tests all ./test_output/
LLVM_PROFILE_FILE="$PROFRAW_DIR/ro_net-functional-%p.profraw" ../scripts/functional_tests/driver.sh ./ro_net/tests/functional_tests/rocshmem_functional_tests all ./test_output/
LLVM_PROFILE_FILE="$PROFRAW_DIR/ro_ipc-functional-%p.profraw" ../scripts/functional_tests/driver.sh ./ro_ipc/tests/functional_tests/rocshmem_functional_tests all ./test_output/

# Coverage Report
profdata=./coverage-report/rocshmem.profdata
"$llvm_bin/llvm-profdata" merge -sparse "$PROFRAW_DIR"/*.profraw -o "$profdata"

# Arguments shared by `llvm-cov report` and `llvm-cov show`: every
# instrumented binary, the merged profile, and the filter that drops files
# whose path contains "test".
coverage_args=(
    -object ./ipc/tests/unit_tests/rocshmem_unit_tests
    -object ./ipc/tests/functional_tests/rocshmem_functional_tests
    -object ./ipc/examples/rocshmem_init_attr_test
    -object ./ipc/examples/rocshmem_allreduce_test
    -object ./ro_net/tests/unit_tests/rocshmem_unit_tests
    -object ./ro_net/tests/functional_tests/rocshmem_functional_tests
    -object ./ro_ipc/tests/unit_tests/rocshmem_unit_tests
    -object ./ro_ipc/tests/functional_tests/rocshmem_functional_tests
    -instr-profile="$profdata"
    --ignore-filename-regex=".*test.*"
)

# Text summary on stdout, then the HTML report in ./coverage-report.
"$llvm_bin/llvm-cov" report "${coverage_args[@]}"
"$llvm_bin/llvm-cov" show "${coverage_args[@]}" \
    -format=html \
    -output-dir=coverage-report

# Serve the HTML report with Python's built-in web server using its default
# bind address and port; this call blocks until it is interrupted.
cd coverage-report && python3 -m http.server
+49
Visa fil
@@ -0,0 +1,49 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
set -e

# Root of the source tree: two directories above the one holding the script
# named by $0. $0 is unchanged when this file is sourced by another script.
src_path="$(dirname "$(realpath "$0")")/../../"

# Configure with only the GDA backend enabled (RO and IPC are off).
# CODE_COV, BUILD_TYPE and INSTALL_PREFIX can be overridden from the
# environment. Extra arguments are forwarded one-to-one ("$@") after the
# defaults, so an argument that contains spaces or glob characters reaches
# cmake unchanged instead of being split or expanded.
# $HOME replaces '~' in the default prefix because tilde expansion does not
# happen inside double quotes.
cmake \
    -DBUILD_CODE_COVERAGE="${CODE_COV:-OFF}" \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX:-$HOME/rocshmem}" \
    -DCMAKE_VERBOSE_MAKEFILE=OFF \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_FUNCTIONAL_TESTS=ON \
    -DBUILD_UNIT_TESTS=ON \
    -DDEBUG=OFF \
    -DPROFILE=OFF \
    -DUSE_GDA=ON \
    -DUSE_RO=OFF \
    -DUSE_IPC=OFF \
    -DUSE_THREADS=OFF \
    -DUSE_WF_COAL=OFF \
    -DUSE_HDP_FLUSH=OFF \
    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
    "$@" "$src_path"

# Build in the current directory with 8 parallel jobs, then install.
cmake --build . --parallel 8
cmake --install .
+30
Visa fil
@@ -0,0 +1,30 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
set -e

# Run the generic `gda` build script from this script's directory with
# -DGDA_BNXT=ON prepended. Caller arguments are forwarded one-to-one ("$@"),
# and the path is quoted so a directory name with spaces still works.
script_path="$(dirname "$(realpath "$0")")"
source "$script_path/gda" -DGDA_BNXT=ON "$@"
+30
Visa fil
@@ -0,0 +1,30 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
set -e

# Run the generic `gda` build script from this script's directory with
# -DGDA_IONIC=ON prepended. Caller arguments are forwarded one-to-one ("$@"),
# and the path is quoted so a directory name with spaces still works.
script_path="$(dirname "$(realpath "$0")")"
source "$script_path/gda" -DGDA_IONIC=ON "$@"
+30
Visa fil
@@ -0,0 +1,30 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Abort on the first failing command.
set -e
# Directory containing this script; the generic `gda` build script is
# expected to live next to it.
script_path=$(dirname "$(realpath "$0")")
# Run the generic GDA build with the MLX5 provider switched on.  "$@" (rather
# than $*) forwards every caller argument intact, even when one contains
# whitespace, e.g. -DCMAKE_CXX_FLAGS="-O2 -g".
source "$script_path/gda" -DGDA_MLX5=ON "$@"
+50
Visa fil
@@ -0,0 +1,50 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Configure, build and install rocSHMEM in the current directory with only the
# IPC conduit enabled (USE_IPC=ON, USE_SINGLE_NODE=ON; GDA and RO off).
#
# Environment overrides:
#   CODE_COV        value for BUILD_CODE_COVERAGE   (default: OFF)
#   BUILD_TYPE      value for CMAKE_BUILD_TYPE      (default: Release)
#   INSTALL_PREFIX  value for CMAKE_INSTALL_PREFIX  (default: $HOME/rocshmem)
#
# Any command-line arguments are appended after the defaults, so a caller's
# -D options take precedence over the ones listed here.
set -e

# Source tree: two directories above this script's location.
src_path="$(dirname "$(realpath "$0")")/../../"

# Everything is quoted and "$@" is used instead of $* so that arguments or
# paths containing whitespace reach cmake as single words.
cmake \
    -DBUILD_CODE_COVERAGE="${CODE_COV:-OFF}" \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX:-$HOME/rocshmem}" \
    -DCMAKE_VERBOSE_MAKEFILE=OFF \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_FUNCTIONAL_TESTS=ON \
    -DBUILD_UNIT_TESTS=ON \
    -DDEBUG=OFF \
    -DPROFILE=OFF \
    -DUSE_GDA=OFF \
    -DUSE_RO=OFF \
    -DUSE_IPC=ON \
    -DUSE_THREADS=OFF \
    -DUSE_WF_COAL=OFF \
    -DUSE_HDP_FLUSH=OFF \
    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
    -DUSE_SINGLE_NODE=ON \
    "$@" "$src_path"
cmake --build . --parallel 8
cmake --install .
+51
Visa fil
@@ -0,0 +1,51 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Configure and build (no install step) the rocSHMEM functional tests and
# examples in the current directory with BUILD_TESTS_ONLY=ON and only the IPC
# conduit enabled.
#
# Environment overrides:
#   CODE_COV        value for BUILD_CODE_COVERAGE   (default: OFF)
#   BUILD_TYPE      value for CMAKE_BUILD_TYPE      (default: Release)
#   INSTALL_PREFIX  value for CMAKE_INSTALL_PREFIX  (default: $HOME/rocshmem)
#
# Any command-line arguments are appended after the defaults, so a caller's
# -D options take precedence over the ones listed here.
set -e

# Source tree: two directories above this script's location.
src_path="$(dirname "$(realpath "$0")")/../../"

# Everything is quoted and "$@" is used instead of $* so that arguments or
# paths containing whitespace reach cmake as single words.
cmake \
    -DBUILD_CODE_COVERAGE="${CODE_COV:-OFF}" \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX:-$HOME/rocshmem}" \
    -DCMAKE_VERBOSE_MAKEFILE=OFF \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_FUNCTIONAL_TESTS=ON \
    -DBUILD_EXAMPLES=ON \
    -DBUILD_UNIT_TESTS=OFF \
    -DDEBUG=OFF \
    -DPROFILE=OFF \
    -DUSE_GDA=OFF \
    -DUSE_RO=OFF \
    -DUSE_IPC=ON \
    -DUSE_THREADS=OFF \
    -DUSE_WF_COAL=OFF \
    -DUSE_SINGLE_NODE=ON \
    -DUSE_HDP_FLUSH=OFF \
    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
    -DBUILD_TESTS_ONLY=ON \
    "$@" "$src_path"
cmake --build . --parallel 8
+49
Visa fil
@@ -0,0 +1,49 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
set -e
src_path=$(dirname "$(realpath $0)")/../../
cmake \
-DBUILD_CODE_COVERAGE=${CODE_COV:-OFF} \
-DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-~/rocshmem} \
-DCMAKE_VERBOSE_MAKEFILE=OFF \
-DCMAKE_POSITION_INDEPENDENT_CODE=ON \
-DBUILD_FUNCTIONAL_TESTS=ON \
-DBUILD_UNIT_TESTS=ON \
-DDEBUG=OFF \
-DPROFILE=OFF \
-DUSE_GDA=OFF \
-DUSE_RO=ON \
-DUSE_IPC=ON \
-DUSE_THREADS=OFF \
-DUSE_WF_COAL=OFF \
-DUSE_HDP_FLUSH=OFF \
-DUSE_HDP_FLUSH_HOST_SIDE=OFF \
$* $src_path
cmake --build . --parallel 8
cmake --install .
+49
Visa fil
@@ -0,0 +1,49 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Configure, build and install rocSHMEM in the current directory with only the
# RO conduit enabled (USE_RO=ON; IPC and GDA off).
#
# Environment overrides:
#   CODE_COV        value for BUILD_CODE_COVERAGE   (default: OFF)
#   BUILD_TYPE      value for CMAKE_BUILD_TYPE      (default: Release)
#   INSTALL_PREFIX  value for CMAKE_INSTALL_PREFIX  (default: $HOME/rocshmem)
#
# Any command-line arguments are appended after the defaults, so a caller's
# -D options take precedence over the ones listed here.
set -e

# Source tree: two directories above this script's location.
src_path="$(dirname "$(realpath "$0")")/../../"

# Everything is quoted and "$@" is used instead of $* so that arguments or
# paths containing whitespace reach cmake as single words.
cmake \
    -DBUILD_CODE_COVERAGE="${CODE_COV:-OFF}" \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE:-Release}" \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX:-$HOME/rocshmem}" \
    -DCMAKE_VERBOSE_MAKEFILE=OFF \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DBUILD_FUNCTIONAL_TESTS=ON \
    -DBUILD_UNIT_TESTS=ON \
    -DDEBUG=OFF \
    -DPROFILE=OFF \
    -DUSE_GDA=OFF \
    -DUSE_RO=ON \
    -DUSE_IPC=OFF \
    -DUSE_THREADS=OFF \
    -DUSE_WF_COAL=OFF \
    -DUSE_HDP_FLUSH=OFF \
    -DUSE_HDP_FLUSH_HOST_SIDE=OFF \
    "$@" "$src_path"
cmake --build . --parallel 8
cmake --install .
+30
Visa fil
@@ -0,0 +1,30 @@
#!/bin/bash
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# Abort on the first failing command.
set -e
# Directory containing this script; the `ro_net` build script is expected to
# live next to it.
script_path=$(dirname "$(realpath "$0")")
# Run the ro_net build with BUILD_TYPE forced to Debug for the duration of the
# sourced script.  "$@" (rather than $*) forwards every caller argument
# intact, even when one contains whitespace.
BUILD_TYPE=Debug source "$script_path/ro_net" "$@"
@@ -0,0 +1,5 @@
The gdb scripts launch rocSHMEM tests repeatedly under gdb
and dump a backtrace on error.
- gdbscript - commands that gdb executes on launch
- gdbrun - run script; launches a test in a loop under gdb
e.g. ./gdbrun 12 10 launches pingpong 10 times
+778
Visa fil
@@ -0,0 +1,778 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
#!/bin/bash
# NOTE(review): the shebang above is not on the first line of the file (the
# license banner precedes it), so the kernel treats it as an ordinary comment.

# Status labels used when reporting results.  The first branch wraps them in
# ANSI colour escapes (bold red / bold green) for use with `echo -e`.
# NOTE(review): `true ||` short-circuits, so `tty -s` is never evaluated and
# the coloured variant is always selected; the plain-text branch is
# unreachable as written.  Confirm whether forcing colour is intentional.
if true || tty -s; then
PRETTY_FAILED="\033[1;31mFAILED\033[0m"
PRETTY_PASSED="\033[1;32mPASSED\033[0m"
else
PRETTY_FAILED="FAILED"
PRETTY_PASSED="PASSED"
fi
# Maps each test name accepted by ExecTest to the numeric id that is passed to
# the tester binary through its -a option.
# These names/values must match the TestType enum in
# rocSHMEM/tests/functional_tests/tester.hpp
declare -A TEST_NUMBERS=(
["get"]="0"
["getnbi"]="1"
["put"]="2"
["putnbi"]="3"
["amo_fadd"]="4"
["amo_finc"]="5"
["amo_fetch"]="6"
["amo_fcswap"]="7"
["amo_add"]="8"
["amo_inc"]="9"
["amo_cswap"]="10"
["init"]="11"
["pingpong"]="12"
["randomaccess"]="13"
["barrierall"]="14"
["syncall"]="15"
["teamsync"]="16"
["collect"]="17"
["fcollect"]="18"
["alltoall"]="19"
["alltoalls"]="20"
["shmemptr"]="21"
["p"]="22"
["g"]="23"
["wgget"]="24"
["wggetnbi"]="25"
["wgput"]="26"
["wgputnbi"]="27"
["waveget"]="28"
["wavegetnbi"]="29"
["waveput"]="30"
["waveputnbi"]="31"
["teambroadcast"]="32"
["teamreduction"]="33"
["teamctxget"]="34"
["teamctxgetnbi"]="35"
["teamctxput"]="36"
["teamctxputnbi"]="37"
["teamctxinfra"]="38"
["putnbimr"]="39"
["amo_set"]="40"
["amo_swap"]="41"
["amo_fetchand"]="42"
["amo_fetchor"]="43"
["amo_fetchxor"]="44"
["amo_and"]="45"
["amo_or"]="46"
["amo_xor"]="47"
["pingall"]="48"
["putsignal"]="49"
["wgputsignal"]="50"
["waveputsignal"]="51"
["putsignalnbi"]="52"
["wgputsignalnbi"]="53"
["waveputsignalnbi"]="54"
["signalfetch"]="55"
["wgsignalfetch"]="56"
["wavesignalfetch"]="57"
["teamwgbarrier"]="58"
["defaultctxget"]="59"
["defaultctxgetnbi"]="60"
["defaultctxput"]="61"
["defaultctxputnbi"]="62"
["defaultctxp"]="63"
["defaultctxg"]="64"
["wavebarrierall"]="65"
["wgbarrierall"]="66"
["wavesyncall"]="67"
["wgsyncall"]="68"
["teambarrier"]="69"
["teamwavebarrier"]="70"
["teamwavesync"]="71"
["teamwgsync"]="72"
["teamctxsingleinfra"]="73"
["teamctxblockinfra"]="74"
["teamctxoddeveninfra"]="75"
["alltoallmem_on_stream"]="76"
["barrier_all_on_stream"]="77"
["broadcastmem_on_stream"]="78"
["getmem_on_stream"]="79"
["putmem_on_stream"]="80"
["putmem_signal_on_stream"]="81"
["signal_wait_until_on_stream"]="82"
["flood_put"]="83"
["flood_putnbi"]="84"
["flood_p"]="85"
["flood_get"]="86"
["flood_getnbi"]="87"
["flood_g"]="88"
)
# ExecTest NAME RANKS WORKGROUPS THREADS [MAX_MSG_SIZE]
#
# Runs one functional test under mpirun and records the outcome.
#
#   NAME          key into TEST_NUMBERS; its value is passed to $APP via -a
#   RANKS         number of MPI ranks (mpirun -n)
#   WORKGROUPS    passed to $APP via -w
#   THREADS       passed to $APP via -z
#   MAX_MSG_SIZE  optional; when given it is passed to $APP via -s and added
#                 to the log file name
#
# Globals read:    APP, LOG_DIR, HOSTFILE, TEST_NUMBERS, PRETTY_FAILED,
#                  ROCSHMEM_MAX_NUM_CONTEXTS, ROCSHMEM_TEST_USE_DEFAULT_STREAM
# Globals written: NUM_GPUS (detected once, then reused),
#                  DRIVER_RETURN_STATUS and FAILED_LIST (on failure)
#
# The test is skipped when it needs more ranks than there are GPUs and no
# HOSTFILE was supplied.  Output goes to $LOG_DIR/<test-log-name>.log, which is
# dumped to stdout if the test fails.
ExecTest() {
  local TEST_NAME=$1
  local NUM_RANKS=$2
  local NUM_WG=$3
  local NUM_THREADS=$4
  local MAX_MSG_SIZE=$5
  local TIMEOUT=$((5 * 60)) # Timeout in seconds

  # GPU count: honour a caller-provided NUM_GPUS, otherwise ask amd-smi or
  # rocm-smi.  `>/dev/null 2>&1` (in that order) silences both streams of the
  # probe command; the reverse order would leave stderr on the terminal.
  if command -v amd-smi >/dev/null && amd-smi version >/dev/null 2>&1
  then
    NUM_GPUS=${NUM_GPUS:-$(amd-smi list | grep GPU | wc -l)}
  elif command -v rocm-smi >/dev/null && rocm-smi --version >/dev/null 2>&1
  then
    NUM_GPUS=${NUM_GPUS:-$(rocm-smi --showserial | grep GPU | wc -l)}
  fi
  NUM_GPUS=${NUM_GPUS:-0}
  # Fall back to 8 when no GPU could be counted.
  NUM_GPUS=$(($NUM_GPUS > 0? $NUM_GPUS: 8))

  local TEST_NUM=${TEST_NUMBERS[$TEST_NAME]}
  if [[ "" == "$TEST_NUM" ]]
  then
    echo "Test $TEST_NAME does not exist" >&2
    DRIVER_RETURN_STATUS=1
    return
  fi

  # Context count for this run only: the caller's/environment's
  # ROCSHMEM_MAX_NUM_CONTEXTS if set, otherwise one context per workgroup.
  # A local is used (instead of assigning and later unsetting the global) so
  # that a value exported by the caller stays in effect for every test the
  # caller runs before it unsets the variable itself.
  local max_contexts=${ROCSHMEM_MAX_NUM_CONTEXTS:-$NUM_WG}

  # MPI Parameters
  local LAUNCHER=mpirun
  local OPTIONS=" -n $NUM_RANKS -mca pml ucx -mca osc ucx"
  OPTIONS+=" -x ROCSHMEM_MAX_NUM_CONTEXTS=$max_contexts"
  OPTIONS+=" -x UCX_ROCM_IPC_SIGPOOL_MAX_ELEMS=16384"
  OPTIONS+=" --map-by numa --timeout $TIMEOUT"
  if [[ "" != "$ROCSHMEM_TEST_USE_DEFAULT_STREAM" ]]
  then
    OPTIONS+=" -x ROCSHMEM_TEST_USE_DEFAULT_STREAM=$ROCSHMEM_TEST_USE_DEFAULT_STREAM"
  fi
  if [[ "" != "$HOSTFILE" ]]
  then
    OPTIONS+=" --hostfile $HOSTFILE"
  fi

  # Construct Test Command
  local TEST_LOG_NAME="$TEST_NAME"_n"$NUM_RANKS"_w"$NUM_WG"_z"$NUM_THREADS"
  local CMD="$LAUNCHER $OPTIONS $APP -a $TEST_NUM -w $NUM_WG -z $NUM_THREADS"
  if [[ "" != "$MAX_MSG_SIZE" ]]
  then
    CMD+=" -s $MAX_MSG_SIZE"
    TEST_LOG_NAME+=_"$MAX_MSG_SIZE"B
  fi
  CMD+=" >> $LOG_DIR/$TEST_LOG_NAME.log 2>&1"

  # Run Test.  The exit status is captured explicitly so that the validation
  # below does not depend on `$?` surviving the end of the if/else.
  local status=0
  if [ "$NUM_GPUS" -ge "$NUM_RANKS" ] || [[ "" != "$HOSTFILE" ]]; then
    echo "$TEST_LOG_NAME"
    # First line of the log records the exact command that was run.
    echo "# $CMD" >"$LOG_DIR/$TEST_LOG_NAME.log"
    eval "$CMD" || status=$?
  else
    echo "Skipping test $TEST_LOG_NAME ($NUM_RANKS greater than $NUM_GPUS)"
  fi

  # Validate Test
  if [ "$status" -ne 0 ]
  then
    echo -e "$PRETTY_FAILED: $TEST_LOG_NAME" >&2
    cat "$LOG_DIR/$TEST_LOG_NAME.log"
    DRIVER_RETURN_STATUS=1
    FAILED_LIST="$FAILED_LIST $TEST_LOG_NAME"
  fi
}
# Put-side RMA suite: blocking put variants (put, wgput, waveput,
# defaultctxput, teamctxput), p, shmemptr and putmem_on_stream, followed by
# the non-blocking (nbi) put variants.  putmem_on_stream is run twice, the
# second time with ROCSHMEM_TEST_USE_DEFAULT_STREAM=1 exported.
# All cases use 2 ranks.
TestRMAPut() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "put" 2 1 1 1048576
ExecTest "put" 2 1 1024 512
ExecTest "put" 2 8 1 1048576
ExecTest "put" 2 16 128 8
ExecTest "put" 2 32 256 512
ExecTest "put" 2 64 1024 8
ExecTest "wgput" 2 1 64 1048576
ExecTest "wgput" 2 2 64 1048576
ExecTest "wgput" 2 16 64 8
ExecTest "waveput" 2 1 64 1048576
ExecTest "waveput" 2 2 64 1048576
ExecTest "waveput" 2 2 128 1048576
ExecTest "waveput" 2 16 128 8
ExecTest "defaultctxput" 2 4 128 1024
ExecTest "teamctxput" 2 4 128 1024
ExecTest "teamctxput" 2 16 256 1024
ExecTest "p" 2 1 1 128
ExecTest "p" 2 1 1024 2
ExecTest "p" 2 8 1 32
ExecTest "p" 2 16 128 4
ExecTest "shmemptr" 2 1 1 8
ExecTest "shmemptr" 2 1 1024 8
ExecTest "shmemptr" 2 8 1 8
ExecTest "shmemptr" 2 16 128 8
ExecTest "putmem_on_stream" 2 1 1 1048576
# Same test again, this time with the default-stream switch exported.
export ROCSHMEM_TEST_USE_DEFAULT_STREAM=1
ExecTest "putmem_on_stream" 2 1 1 1048576
unset ROCSHMEM_TEST_USE_DEFAULT_STREAM
################################ Non-Blocking ################################
ExecTest "putnbi" 2 1 1 1048576
ExecTest "putnbi" 2 1 1024 512
ExecTest "putnbi" 2 8 1 1048576
ExecTest "putnbi" 2 16 128 8
ExecTest "putnbi" 2 32 256 512
ExecTest "putnbi" 2 64 1024 8
ExecTest "wgputnbi" 2 1 64 1048576
ExecTest "wgputnbi" 2 2 64 1048576
ExecTest "wgputnbi" 2 16 64 8
ExecTest "waveputnbi" 2 1 64 1048576
ExecTest "waveputnbi" 2 2 64 1048576
ExecTest "waveputnbi" 2 2 128 1048576
ExecTest "waveputnbi" 2 16 128 8
ExecTest "defaultctxputnbi" 2 4 128 1024
ExecTest "teamctxputnbi" 2 4 128 1024
ExecTest "teamctxputnbi" 2 16 256 1024
}
# Get-side RMA suite: blocking get variants (get, wgget, waveget,
# defaultctxget, teamctxget), g and getmem_on_stream, followed by the
# non-blocking (nbi) get variants.  All cases use 2 ranks.
TestRMAGet() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "get" 2 1 1 1048576
ExecTest "get" 2 1 1024 512
ExecTest "get" 2 8 1 1048576
ExecTest "get" 2 16 128 8
ExecTest "get" 2 32 256 512
ExecTest "get" 2 64 1024 8
ExecTest "wgget" 2 1 64 1048576
ExecTest "wgget" 2 2 64 1048576
ExecTest "wgget" 2 16 64 8
ExecTest "waveget" 2 1 64 1048576
ExecTest "waveget" 2 2 64 1048576
ExecTest "waveget" 2 2 128 1048576
ExecTest "waveget" 2 16 128 8
ExecTest "defaultctxget" 2 4 128 1024
ExecTest "teamctxget" 2 4 128 1024
ExecTest "teamctxget" 2 16 256 1024
ExecTest "g" 2 1 1 128
ExecTest "g" 2 1 1024 1
ExecTest "g" 2 8 1 32
ExecTest "g" 2 16 128 4
ExecTest "getmem_on_stream" 2 1 1 1048576
################################ Non-Blocking ################################
ExecTest "getnbi" 2 1 1 1048576
ExecTest "getnbi" 2 1 1024 512
ExecTest "getnbi" 2 8 1 1048576
ExecTest "getnbi" 2 16 128 8
ExecTest "getnbi" 2 32 256 512
ExecTest "getnbi" 2 64 1024 8
ExecTest "wggetnbi" 2 1 64 1048576
ExecTest "wggetnbi" 2 2 64 1048576
ExecTest "wggetnbi" 2 16 64 8
ExecTest "wavegetnbi" 2 1 64 1048576
ExecTest "wavegetnbi" 2 2 64 1048576
ExecTest "wavegetnbi" 2 2 128 1048576
ExecTest "wavegetnbi" 2 16 128 8
ExecTest "defaultctxgetnbi" 2 4 128 1024
ExecTest "teamctxgetnbi" 2 4 128 1024
ExecTest "teamctxgetnbi" 2 16 256 1024
}
# Full RMA suite: the put-side tests followed by the get-side tests.
TestRMA() {
TestRMAPut
TestRMAGet
}
# Atomic-operation subset (fetch, set, fcswap, fetchand, and, xor) that is run
# by the "all-ro" selection and also forms the first part of TestAMO.
# No max message size is passed for these tests.
# NOTE(review): "RO" presumably refers to the RO conduit — confirm.
TestAMORO() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "amo_fetch" 2 1 1
ExecTest "amo_fetch" 2 1 1024
ExecTest "amo_fetch" 2 8 1
ExecTest "amo_fetch" 2 32 128
ExecTest "amo_set" 2 1 1
ExecTest "amo_set" 2 8 1
ExecTest "amo_set" 2 32 1
ExecTest "amo_fcswap" 2 1 1
ExecTest "amo_fcswap" 2 32 1
ExecTest "amo_fcswap" 2 8 1
ExecTest "amo_fetchand" 2 1 1
ExecTest "amo_and" 2 1 1
ExecTest "amo_xor" 2 1 1
}
# Full atomic-operation suite: everything in TestAMORO plus the increment and
# add tests (finc, inc, fadd, add).  No max message size is passed.
TestAMO() {
TestAMORO
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "amo_finc" 2 1 1
ExecTest "amo_finc" 2 1 1024
ExecTest "amo_finc" 2 8 1
ExecTest "amo_finc" 2 32 128
ExecTest "amo_inc" 2 1 1
ExecTest "amo_inc" 2 1 1024
ExecTest "amo_inc" 2 8 1
ExecTest "amo_inc" 2 32 128
ExecTest "amo_fadd" 2 1 1
ExecTest "amo_fadd" 2 1 1024
ExecTest "amo_fadd" 2 8 1
ExecTest "amo_fadd" 2 32 128
ExecTest "amo_add" 2 1 1
ExecTest "amo_add" 2 1 1024
ExecTest "amo_add" 2 8 1
ExecTest "amo_add" 2 32 128
}
# Signal-operation subset run by the "all-ro" selection and as the first part
# of TestSigOps: put-with-signal (blocking and nbi; thread, wg and wave
# variants), signal fetch, and signal_wait_until_on_stream.
# NOTE(review): "RO" presumably refers to the RO conduit — confirm.
TestSigOpsRO() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "putsignal" 2 1 1 1048576
ExecTest "putsignal" 2 2 32 1048576
ExecTest "wgputsignal" 2 2 32 1048576
ExecTest "waveputsignal" 2 1 32 1048576
ExecTest "waveputsignal" 2 2 64 1048576
ExecTest "putsignalnbi" 2 1 1 1048576
ExecTest "putsignalnbi" 2 2 32 1048576
ExecTest "wgputsignalnbi" 2 2 32 1048576
ExecTest "waveputsignalnbi" 2 1 32 1048576
ExecTest "waveputsignalnbi" 2 2 64 1048576
ExecTest "signalfetch" 2 1 1
ExecTest "wgsignalfetch" 2 2 32
ExecTest "wavesignalfetch" 2 1 32
ExecTest "wavesignalfetch" 2 1 64
ExecTest "signal_wait_until_on_stream" 2 1 1
}
# Full signal-operation suite: everything in TestSigOpsRO plus
# putmem_signal_on_stream.
TestSigOps() {
TestSigOpsRO
ExecTest "putmem_signal_on_stream" 2 1 1 1048576
}
# Collective suite: barrier and sync tests (all-PE and team based, in thread,
# wave and wg variants), followed by alltoall, teambroadcast, fcollect,
# teamreduction and the *_on_stream collectives.  All cases use 2 ranks.
TestColl() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "barrierall" 2 1 1
ExecTest "wavebarrierall" 2 1 1
ExecTest "wgbarrierall" 2 1 1
ExecTest "teambarrier" 2 1 1
ExecTest "teambarrier" 2 16 64
ExecTest "teambarrier" 2 32 256
ExecTest "teambarrier" 2 39 1024
ExecTest "teamwavebarrier" 2 1 1
ExecTest "teamwavebarrier" 2 16 64
ExecTest "teamwavebarrier" 2 32 256
ExecTest "teamwavebarrier" 2 39 1024
ExecTest "teamwgbarrier" 2 1 1
ExecTest "teamwgbarrier" 2 16 64
ExecTest "teamwgbarrier" 2 32 256
ExecTest "teamwgbarrier" 2 39 1024
ExecTest "teamsync" 2 1 1
ExecTest "teamsync" 2 16 64
ExecTest "teamsync" 2 32 256
ExecTest "teamsync" 2 39 1024
ExecTest "teamwavesync" 2 1 1
ExecTest "teamwavesync" 2 16 64
ExecTest "teamwavesync" 2 32 256
ExecTest "teamwavesync" 2 39 1024
ExecTest "teamwgsync" 2 1 1
ExecTest "teamwgsync" 2 16 64
ExecTest "teamwgsync" 2 32 256
ExecTest "teamwgsync" 2 39 1024
ExecTest "syncall" 2 1 1
ExecTest "wavesyncall" 2 1 1
ExecTest "wgsyncall" 2 1 1
ExecTest "alltoall" 2 1 64 512
ExecTest "teambroadcast" 2 1 64 32768
ExecTest "fcollect" 2 1 64 32768
ExecTest "teamreduction" 2 1 64 32768
ExecTest "alltoallmem_on_stream" 2 1 64 1048576
ExecTest "broadcastmem_on_stream" 2 1 64 1048576
ExecTest "barrier_all_on_stream" 2 1 1
}
# Miscellaneous suite: init, pingpong, pingall, the flood_* tests (2 and 8
# ranks) and the team-context infrastructure tests (2, 4 and 5 ranks).  The
# last group is bracketed by an export/unset of ROCSHMEM_MAX_NUM_CONTEXTS.
TestOther() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "init" 2 1 1
ExecTest "pingpong" 2 1 1
ExecTest "pingpong" 2 8 1
ExecTest "pingpong" 2 32 1
ExecTest "pingall" 2 1 1
ExecTest "pingall" 2 8 1
ExecTest "pingall" 2 32 1
ExecTest "flood_put" 2 64 1024
ExecTest "flood_get" 2 64 1024
ExecTest "flood_put" 8 64 1024
ExecTest "flood_putnbi" 8 64 1024
ExecTest "flood_p" 8 64 1024
ExecTest "flood_get" 8 64 1024
ExecTest "flood_getnbi" 8 64 1024
ExecTest "flood_g" 8 64 1024
# This test requires more contexts than workgroups
export ROCSHMEM_MAX_NUM_CONTEXTS=1024
ExecTest "teamctxinfra" 2 1 1
ExecTest "teamctxsingleinfra" 2 1 1
ExecTest "teamctxblockinfra" 4 1 1
ExecTest "teamctxblockinfra" 5 1 1
ExecTest "teamctxoddeveninfra" 4 1 1
ExecTest "teamctxoddeveninfra" 5 1 1
unset ROCSHMEM_MAX_NUM_CONTEXTS
}
# Stand-alone suite run by the "gda" selection.  It repeats a subset of the
# RMA, AMO, collective and miscellaneous tests from the suites above; tests
# that are commented out carry the reason given next to them (g / flood_g,
# teamreduction).  The "#TestAMO() {"-style lines below are section markers
# only, not function definitions.
# TODO: remove when GDA is feature complete
TestGDA() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "put" 2 1 1 1048576
ExecTest "put" 2 1 1024 512
ExecTest "put" 2 8 1 1048576
ExecTest "put" 2 16 128 8
ExecTest "put" 2 32 256 512
ExecTest "put" 2 64 1024 8
ExecTest "wgput" 2 1 64 1048576
ExecTest "wgput" 2 2 64 1048576
ExecTest "wgput" 2 16 64 8
ExecTest "waveput" 2 1 64 1048576
ExecTest "waveput" 2 2 64 1048576
ExecTest "waveput" 2 2 128 1048576
ExecTest "waveput" 2 16 128 8
ExecTest "defaultctxput" 2 4 128 1024
ExecTest "teamctxput" 2 4 128 1024
ExecTest "teamctxput" 2 16 256 1024
ExecTest "get" 2 1 1 1048576
ExecTest "get" 2 1 1024 512
ExecTest "get" 2 8 1 1048576
ExecTest "get" 2 16 128 8
ExecTest "get" 2 32 256 512
ExecTest "get" 2 64 1024 8
ExecTest "wgget" 2 1 64 1048576
ExecTest "wgget" 2 2 64 1048576
ExecTest "wgget" 2 16 64 8
ExecTest "waveget" 2 1 64 1048576
ExecTest "waveget" 2 2 64 1048576
ExecTest "waveget" 2 2 128 1048576
ExecTest "waveget" 2 16 128 8
ExecTest "defaultctxget" 2 4 128 1024
ExecTest "teamctxget" 2 4 128 1024
ExecTest "teamctxget" 2 16 256 1024
# ExecTest "g" 2 1 1 128
# ExecTest "g" 2 1 1024 2
# ExecTest "g" 2 8 1 32
# ExecTest "g" 2 16 128 4
ExecTest "p" 2 1 1 128
ExecTest "p" 2 1 1024 2
ExecTest "p" 2 8 1 32
ExecTest "p" 2 16 128 4
################################ Non-Blocking ################################
ExecTest "putnbi" 2 1 1 1048576
ExecTest "putnbi" 2 1 1024 512
ExecTest "putnbi" 2 8 1 1048576
ExecTest "putnbi" 2 16 128 8
ExecTest "putnbi" 2 32 256 512
ExecTest "putnbi" 2 64 1024 8
ExecTest "wgputnbi" 2 1 64 1048576
ExecTest "wgputnbi" 2 2 64 1048576
ExecTest "wgputnbi" 2 16 64 8
ExecTest "waveputnbi" 2 1 64 1048576
ExecTest "waveputnbi" 2 2 64 1048576
ExecTest "waveputnbi" 2 2 128 1048576
ExecTest "waveputnbi" 2 16 128 8
ExecTest "defaultctxputnbi" 2 4 128 1024
ExecTest "teamctxputnbi" 2 4 128 1024
ExecTest "teamctxputnbi" 2 16 256 1024
ExecTest "getnbi" 2 1 1 1048576
ExecTest "getnbi" 2 1 1024 512
ExecTest "getnbi" 2 8 1 1048576
ExecTest "getnbi" 2 16 128 8
ExecTest "getnbi" 2 32 256 512
ExecTest "getnbi" 2 64 1024 8
ExecTest "wggetnbi" 2 1 64 1048576
ExecTest "wggetnbi" 2 2 64 1048576
ExecTest "wggetnbi" 2 16 64 8
ExecTest "wavegetnbi" 2 1 64 1048576
ExecTest "wavegetnbi" 2 2 64 1048576
ExecTest "wavegetnbi" 2 2 128 1048576
ExecTest "wavegetnbi" 2 16 128 8
ExecTest "defaultctxgetnbi" 2 4 128 1024
ExecTest "teamctxgetnbi" 2 4 128 1024
ExecTest "teamctxgetnbi" 2 16 256 1024
#TestAMO() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "amo_fetch" 2 1 1
ExecTest "amo_fetch" 2 1 1024
ExecTest "amo_fetch" 2 8 1
ExecTest "amo_fetch" 2 32 128
ExecTest "amo_set" 2 1 1
ExecTest "amo_set" 2 8 1
ExecTest "amo_set" 2 32 1
ExecTest "amo_fcswap" 2 1 1
ExecTest "amo_fcswap" 2 32 1
ExecTest "amo_fcswap" 2 8 1
ExecTest "amo_finc" 2 1 1
ExecTest "amo_finc" 2 1 1024
ExecTest "amo_finc" 2 8 1
ExecTest "amo_finc" 2 32 128
ExecTest "amo_inc" 2 1 1
ExecTest "amo_inc" 2 1 1024
ExecTest "amo_inc" 2 8 1
ExecTest "amo_inc" 2 32 128
ExecTest "amo_fadd" 2 1 1
ExecTest "amo_fadd" 2 1 1024
ExecTest "amo_fadd" 2 8 1
ExecTest "amo_fadd" 2 32 128
ExecTest "amo_add" 2 1 1
ExecTest "amo_add" 2 1 1024
ExecTest "amo_add" 2 8 1
ExecTest "amo_add" 2 32 128
ExecTest "amo_fetchand" 2 1 1
ExecTest "amo_and" 2 1 1
ExecTest "amo_xor" 2 1 1
#TestColl() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "barrierall" 2 1 1
ExecTest "teambarrier" 2 1 1
ExecTest "teamsync" 2 1 1
ExecTest "syncall" 2 1 1
ExecTest "alltoall" 2 1 1 512
ExecTest "teambroadcast" 2 1 1 32768
ExecTest "fcollect" 2 1 1 32768
# deadlock on gda, size 8KB
# ExecTest "teamreduction" 2 1 1 32768
#TestOther() {
##############################################################################
# | Name | Ranks | Workgroups | Threads | Max Message Size #
##############################################################################
ExecTest "init" 2 1 1
ExecTest "pingpong" 2 1 1
ExecTest "pingpong" 2 8 1
ExecTest "pingpong" 2 32 1
ExecTest "flood_put" 2 64 1024
ExecTest "flood_get" 2 64 1024
ExecTest "flood_put" 8 64 1024
ExecTest "flood_putnbi" 8 64 1024
ExecTest "flood_p" 8 64 1024
ExecTest "flood_get" 8 64 1024
ExecTest "flood_getnbi" 8 64 1024
# ExecTest "flood_g" 8 64 1024 # _g not implemented
# This test requires more contexts than workgroups
export ROCSHMEM_MAX_NUM_CONTEXTS=1024
ExecTest "teamctxinfra" 2 1 1
ExecTest "teamctxsingleinfra" 2 1 1
ExecTest "teamctxblockinfra" 4 1 1
ExecTest "teamctxblockinfra" 5 1 1
ExecTest "teamctxoddeveninfra" 4 1 1
ExecTest "teamctxoddeveninfra" 5 1 1
unset ROCSHMEM_MAX_NUM_CONTEXTS
}
# ValidateInput ARG_COUNT
#
# Prints the usage text and terminates the script with status 1 when fewer
# than three command-line arguments were supplied; otherwise does nothing.
#
#   ARG_COUNT  number of arguments the script was invoked with (pass $#)
ValidateInput() {
  local INPUT_COUNT=$1
  if [ "$INPUT_COUNT" -lt 3 ] ; then
    echo "This script must be run with at least 3 arguments."
    # Double quotes so that the script name is expanded; with single quotes
    # the usage line showed a literal "${0}".
    echo "Usage: ${0} argument1 argument2 argument3 [argument4]"
    echo " argument1 : path to the tester driver"
    echo " argument2 : test type to run, e.g put"
    echo " argument3 : directory to put the output logs"
    echo " argument4 : path to hostfile"
    exit 1
  fi
}
# ValidateLogDir DIR
#
# Ensures the log directory exists, creating it (including missing parents)
# when necessary and reporting that on stdout.  The argument is quoted
# everywhere so that a path containing whitespace is treated as one directory
# instead of being split into several words.
ValidateLogDir() {
  if [ ! -d "$1" ]; then
    echo "LOG_DIR=$1 does not exist"
    mkdir -p "$1"
    echo "Created $1"
  fi
}
# ----------------------------------------------------------------------------
# Entry point.
#   $1  path to the tester binary
#   $2  suite name (gda, all, all-ro, rma, put, get, amo, sigops, coll, other)
#       or the name of a single test from TEST_NUMBERS
#   $3  directory that receives the per-test logs (created if missing)
#   $4  optional hostfile handed to mpirun
# Exits 0 when every executed test passed, non-zero otherwise.
# ----------------------------------------------------------------------------
APP=$1
TEST=$2
LOG_DIR=$3
HOSTFILE=$4
DRIVER_RETURN_STATUS=0
ValidateInput $#
ValidateLogDir "$LOG_DIR"

# Exact suite names.  "put" and "get" are also individual test names; as
# before, they select the suite.
SUITE_NAMES="gda all all-ro rma put get amo sigops coll other"

if [[ -n "$TEST" && " $SUITE_NAMES " != *" $TEST "* && -n "${TEST_NUMBERS[$TEST]}" ]]; then
  # An exact individual test name.  This is checked before the suffix
  # patterns below because many test names end in a suite name (barrierall,
  # syncall, alltoall, pingall, wgput, flood_get, ...) and would otherwise
  # start a whole suite instead of the one requested test.
  ##############################################################################
  # | Name | Ranks | Workgroups | Threads | Max Message Size #
  ##############################################################################
  ExecTest "$TEST" 2 1 1 8
else
  # Suite selection.  The patterns match on the suffix of $TEST, so prefixed
  # spellings that were accepted previously keep working.
  case $TEST in
  *"gda")
    TestGDA
    ;;
  *"all")
    TestRMA
    TestAMO
    TestSigOps
    TestColl
    TestOther
    ;;
  *"all-ro")
    TestRMAPut
    TestAMORO
    TestSigOpsRO
    TestColl
    TestOther
    ;;
  *"rma")
    TestRMA
    ;;
  *"put")
    TestRMAPut
    ;;
  *"get")
    TestRMAGet
    ;;
  *"amo")
    TestAMO
    ;;
  *"sigops")
    TestSigOps
    ;;
  *"coll")
    TestColl
    ;;
  *"other")
    TestOther
    ;;
  *)
    # Unknown name: ExecTest reports it and flags the run as failed.
    ##############################################################################
    # | Name | Ranks | Workgroups | Threads | Max Message Size #
    ##############################################################################
    ExecTest $TEST 2 1 1 8
    ;;
  esac
fi

# Fail if any test was recorded as failed or the last command above failed.
EXIT_STATUS=$(($DRIVER_RETURN_STATUS || $?))
if [ $EXIT_STATUS -eq 0 ]; then
  echo -e "TESTS PASSED"
else
  echo -e "TESTS FAILED: $FAILED_LIST"
fi
exit $EXIT_STATUS
+34
Visa fil
@@ -0,0 +1,34 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# argument1 - operation to run (numeric test id handed to the tester via -a)
# argument2 - loop count
# e.g. ./gdbrun 12 10 (launches test 12 ten times; 12 is pingpong according to
# the TEST_NUMBERS table of the functional-test driver script)
set -e
if [ "$#" -lt 2 ]; then
    echo "Usage: $0 <test id> <loop count>" >&2
    exit 1
fi
# Count explicitly: brace expansion is performed before parameter expansion,
# so `{1..$2}` never becomes a numeric range and a `for` loop over it would
# run exactly once regardless of the requested count.
i=1
while [ "$i" -le "$2" ]
do
    mpirun -np 2 xterm -e gdb -x gdbscript --args build/rocshmem_functional_tests -t 1 -w 1 -s 32768 -a "$1" -x 8
    test $? -eq 0 || exit 1
    i=$((i + 1))
done
@@ -0,0 +1,40 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
# GDB command file: run the program and, if it did not exit on its own,
# record a full backtrace of every thread before quitting.

# Turn off GDB's output pager.
set pagination off
# Include all frame arguments when frames are printed.
set print frame-arguments all
# Send a copy of GDB's output to log.dat.
set logging file log.dat
set logging on
# Sentinel value; the check below treats any other value as "the program
# exited" (GDB is expected to overwrite $_exitcode on program exit).
set $_exitcode = -1
run
if $_exitcode != -1
# Sentinel was replaced, so the program exited: nothing to inspect.
quit
else
# Sentinel untouched: the program stopped without exiting. Dump state.
#backtrace
# Full backtrace (including locals) for all threads.
thread apply all bt full
quit
end
+99
Visa fil
@@ -0,0 +1,99 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
#!/bin/bash
# Builds and installs the rocSHMEM dependencies from pinned commits:
# UCX first, then Open MPI configured against that UCX.
#
# Environment variables read:
#   _ROCM_DIR    ROCm prefix passed to --with-rocm (default: /opt/rocm)
#   BUILD_DIR    directory that holds the temporary source checkouts
#                (default: current directory)
#   INSTALL_DIR  installation prefix. When set, UCX and Open MPI are both
#                installed directly into it; otherwise they are installed
#                into $BUILD_DIR/install/ucx and $BUILD_DIR/install/ompi.
#
# All path expansions are quoted so that directories containing whitespace
# or glob characters are handled correctly (notably the final `rm -rf`).

set -e
set -o pipefail

if [[ -z "${_ROCM_DIR}" ]]; then
  export _ROCM_DIR=/opt/rocm
fi

# Location of dependencies source code
_BUILD_DIR="${BUILD_DIR:-$PWD}"
export _INSTALL_DIR="${INSTALL_DIR:-$_BUILD_DIR/install}"
echo "rocSHMEM dependencies UCX and Open MPI will install in $_INSTALL_DIR"

export _DEPS_SRC_DIR="$_BUILD_DIR/deps-src"
mkdir -p "$_DEPS_SRC_DIR"

#Adjust branches and installation location as necessary
export _UCX_INSTALL_DIR="${INSTALL_DIR:-$_INSTALL_DIR/ucx}"
export _UCX_REPO=https://github.com/ROCm/ucx.git
export _UCX_COMMIT_HASH=18770fdc1c3b5de202d14a088a14b734d2c4bbf3

export _OMPI_INSTALL_DIR="${INSTALL_DIR:-$_INSTALL_DIR/ompi}"
export _OMPI_REPO=https://github.com/ROCm/ompi.git
export _OMPI_COMMIT_HASH=697a596dde68815fe50db3c2a75a42ddb41b5ef4

# Step 1: Build UCX with ROCm support
cd "$_DEPS_SRC_DIR"
rm -rf ucx
git clone "$_UCX_REPO"
cd ucx
git checkout "$_UCX_COMMIT_HASH"
./autogen.sh
./contrib/configure-release --prefix="$_UCX_INSTALL_DIR" \
  --with-rocm="$_ROCM_DIR" \
  --enable-mt \
  --without-go \
  --without-java \
  --without-cuda \
  --without-knem
make -j
make install

# Step 2: Install OpenMPI with UCX support
cd "$_DEPS_SRC_DIR"
rm -rf ompi
git clone --recursive "$_OMPI_REPO"
cd ompi
git checkout "$_OMPI_COMMIT_HASH"
git submodule update --init --recursive

# Create a virtual environment for the documentation requirements. If
# ensurepip or the regular venv creation fails, fall back to a venv that
# reuses the system site-packages and does not bootstrap pip.
python3 -m ensurepip && python3 -m venv venv || python3 -m venv --system-site-packages --without-pip venv
. venv/bin/activate
python3 -m pip install -r docs/requirements.txt

./autogen.pl
./configure --prefix="$_OMPI_INSTALL_DIR" \
  --with-rocm="$_ROCM_DIR" \
  --with-ucx="$_UCX_INSTALL_DIR" \
  --disable-oshmem \
  --with-prrte=internal \
  --with-hwloc=internal \
  --with-libevent=internal \
  --without-cuda \
  --disable-mpi-fortran \
  --without-ofi
make -j
make install

# The source checkouts are only needed for the build; remove them.
rm -rf "$_DEPS_SRC_DIR"

echo "Dependencies for rocSHMEM are now installed"
echo ""
echo "UCX ($_UCX_COMMIT_HASH) Installed to $_UCX_INSTALL_DIR"
echo "OpenMPI ($_OMPI_COMMIT_HASH) Installed to $_OMPI_INSTALL_DIR"
echo ""
echo "Please update your PATH and LD_LIBRARY_PATH"
+99
Visa fil
@@ -0,0 +1,99 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
#!/bin/bash
# Print the usage summary for this driver to stdout.
# $0 is expanded inside the here-document so the text shows the name the
# script was invoked with.
display_help() {
  cat <<EOF
Usage:
 $0 binary_name all # Runs all standard tests
 $0 binary_name custom <ranks> <filter> # Runs custom test configuration

Arguments:
 binary_name: Name of the binary to run.
 all: Executes predefined test configurations.
 custom: Executes a test with custom MPI ranks and GTest filter.
 ranks: Number of MPI ranks (required for custom mode).
 filter: GTest filter string (required for custom mode).

EOF
}
# Check the argument count against the selected mode before doing anything:
# fewer than two arguments is always wrong, "all" takes exactly two and
# "custom" takes exactly four.
if (( $# < 2 )) ||
   [[ "$2" == "all" && $# -ne 2 ]] ||
   [[ "$2" == "custom" && $# -ne 4 ]]; then
  display_help
  exit 1
fi

# Script-wide state shared with run_mpirun and the mode dispatch below.
driver_return_status=0
binary_name=$1
mode=$2
timestamp=$(date "+%Y-%m-%d-%H:%M:%S")
log_file="unit_tests_${timestamp}.log"
mpi_timeout=$((20 * 60)) # 20 minutes in seconds
# Run the test binary under mpirun and record a failure if it exits non-zero.
#   $1 - number of MPI ranks
#   $2 - GTest filter string
# Reads the globals binary_name, mpi_timeout and log_file. Output of the run
# is appended to $log_file. On failure the command and the log are printed
# and driver_return_status is set to 1.
function run_mpirun {
  local np=$1
  local gtest_filter=$2

  # Printable form of the command, used only for messages.
  cmd_str="mpirun -np $np --timeout $mpi_timeout $binary_name --gtest_filter=$gtest_filter >> $log_file 2>&1"
  echo "$cmd_str"

  # Invoke mpirun directly rather than through `eval`: evaluating the
  # interpolated string would re-split it and glob-expand the '*' characters
  # of the filter, and would execute any shell syntax embedded in it.
  if ! mpirun -np "$np" --timeout "$mpi_timeout" "$binary_name" \
      "--gtest_filter=$gtest_filter" >> "$log_file" 2>&1; then
    echo "FAILED: $cmd_str" >&2
    cat "$log_file"
    driver_return_status=1
  fi
}
# Processing modes
case $mode in
  all)
    # The leading '-' below turns this list into a negative GTest filter, so
    # the fixtures named in test_with_two_pes are excluded from the
    # four-rank run.
    test_with_two_pes="IPCImplSimpleCoarseTestFixture/*:IPCImplSimpleFineTestFixture/*:IPCImplTiledFineTestFixture/*:DegenerateTiledFine.*"
    run_mpirun 4 "-$test_with_two_pes"
    #run_mpirun 2 "$test_with_two_pes"
    ;;
  custom)
    # 'ranks' must be a decimal integer greater than 1. The digits-only test
    # rejects input such as "2x", which the numeric comparison alone would
    # let through; 10# forces base 10 so a leading zero is not read as octal.
    if [[ ! "$3" =~ ^[0-9]+$ ]] || (( 10#$3 <= 1 )); then
      echo "Error: 'ranks' must be an integer greater than 1."
      display_help
      exit 1
    fi
    # Quote both arguments so a filter containing '*' or whitespace reaches
    # run_mpirun as a single, unexpanded word.
    run_mpirun "$3" "$4"
    ;;
  *)
    echo "Error: Invalid mode '$mode'." | tee -a "$log_file"
    display_help
    exit 1
    ;;
esac

echo "Tests Completed"
echo "log file: '$log_file'"
exit $driver_return_status
+83
Visa fil
@@ -0,0 +1,83 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
###############################################################################
# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
###############################################################################
# Attach the sources of this directory to the ${PROJECT_NAME} target.
# PRIVATE: the files are compiled into the target itself and are not added
# to the sources of targets that link against it.
target_sources(
${PROJECT_NAME}
PRIVATE
atomic_return.cpp
backend_bc.cpp
envvar.cpp
context_host.cpp
context_device.cpp
mpi_instance.cpp
rocshmem_gpu.cpp
rocshmem.cpp
team.cpp
team_tracker.cpp
util.cpp
wf_coal_policy.cpp
ipc_policy.cpp
)
# Compile options applied to ${PROJECT_NAME} and, being PUBLIC, to everything
# that links against it.
set(ROCSHMEM_COMPILE_FLAGS -fgpu-rdc)
# xnack allows address translation fault recovery
# required option for managed heap configs
# (currently disabled: -mxnack)

# Coverage builds add the instrumentation flags for compiling and linking.
if (BUILD_CODE_COVERAGE)
  list(APPEND ROCSHMEM_COMPILE_FLAGS -fprofile-instr-generate -fcoverage-mapping)
  target_link_options(${PROJECT_NAME} PUBLIC -fprofile-instr-generate)
endif()

target_compile_options(${PROJECT_NAME} PUBLIC ${ROCSHMEM_COMPILE_FLAGS})

#target_link_options(
#${PROJECT_NAME}
#PUBLIC
#--hip-link
#)
###############################################################################
# ROCSHMEM TARGET FOR BACKENDS
###############################################################################
# Backend directories are added only when the matching USE_* option is on.
if (USE_GDA)
  add_subdirectory(gda)
endif()

if (USE_RO)
  add_subdirectory(reverse_offload)
endif()

if (USE_IPC)
  add_subdirectory(ipc)
endif()

# Directories that are always part of the build, in their original order.
foreach(_rocshmem_subdir IN ITEMS host containers memory sync bootstrap)
  add_subdirectory(${_rocshmem_subdir})
endforeach()

# Optional tools.
if (BUILD_TOOLS)
  add_subdirectory(tools)
endif()
+287
Visa fil
@@ -0,0 +1,287 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_ASSEMBLY_HPP_
#define LIBRARY_SRC_ASSEMBLY_HPP_
#include <hip/hip_runtime.h>
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
namespace rocshmem {
/* Stringify `x` and hand it to _Pragma so a pragma can be produced from
 * inside another macro. */
#define DO_PRAGMA(x) _Pragma(#x)
/* Expand the code passed after `warnoption` between a diagnostic push and
 * pop, with `warnoption` (stringified) ignored in between. */
#define NOWARN(warnoption, ...) \
DO_PRAGMA(GCC diagnostic push) \
DO_PRAGMA(GCC diagnostic ignored #warnoption) \
__VA_ARGS__ \
DO_PRAGMA(GCC diagnostic pop)
/* Emit an `sfence` instruction with a compiler-level memory clobber.
 * NOTE(review): `sfence` is an x86 mnemonic, so this is presumably meant for
 * host code only -- confirm. */
#define SFENCE() asm volatile("sfence" ::: "memory")
/**
 * Load one unsigned byte from `src` with an explicit load instruction that
 * carries the architecture's cache-control modifiers, then wait for the load
 * to complete.
 *
 * Architectures with a dedicated sequence: gfx90a, gfx1100 (glc slc),
 * gfx942, gfx950 (sc0 sc1) and gfx1201 (scope:SCOPE_SYS). On any other
 * target a plain volatile load is used so that the function never returns
 * an uninitialized value.
 *
 * @param src address of the byte to read
 * @return the byte read, widened to int
 */
__device__ __forceinline__ int uncached_load_ubyte(uint8_t* src) {
  int ret;
#if defined(__gfx90a__) || defined(__gfx1100__)
  asm volatile(
      "global_load_ubyte %0 %1 off glc slc \n"
      "s_waitcnt vmcnt(0)"
      : "=v"(ret)
      : "v"(src));
#elif defined(__gfx942__) || defined(__gfx950__)
  asm volatile(
      "global_load_ubyte %0 %1 off sc0 sc1 \n"
      "s_waitcnt vmcnt(0)"
      : "=v"(ret)
      : "v"(src));
#elif defined(__gfx1201__)
  asm volatile(
      "global_load_u8 %0 %1 off scope:SCOPE_SYS \n"
      "s_wait_loadcnt 0x0"
      : "=v"(ret)
      : "v"(src));
#else
  // No dedicated instruction sequence for this target (e.g. gfx906/gfx908):
  // fall back to a volatile read instead of returning an indeterminate value.
  ret = *reinterpret_cast<volatile uint8_t*>(src);
#endif
  return ret;
}
/**
 * Re-read one signed byte from `read_value` into `*assigned_value` using an
 * explicit load instruction followed by a wait on the load counter.
 *
 * Instruction used per target:
 *   gfx90a, gfx1100 : global_load_sbyte ... glc slc
 *   gfx942, gfx950  : global_load_sbyte ... sc0 sc1
 *   gfx1201         : global_load_i8 ... scope:SCOPE_SYS
 * The gfx906 and gfx908 branches are empty, so on those targets (and on any
 * target not listed) the function does nothing and `*assigned_value` is left
 * unchanged.
 *
 * @param assigned_value destination that receives the loaded value
 * @param read_value     address of the byte to load
 */
__device__ __forceinline__ void refresh_volatile_sbyte(volatile int *assigned_value,
volatile char *read_value) {
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
asm volatile(
"global_load_sbyte %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
asm volatile(
"global_load_sbyte %0 %1 off sc0 sc1\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx1201__)
asm volatile(
"global_load_i8 %0 %1 off scope:SCOPE_SYS \n"
"s_wait_loadcnt 0x0"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
}
/**
 * Re-read a 64-bit value from `read_value` into `*assigned_value` using an
 * explicit load instruction followed by a wait on the load counter.
 *
 * Instruction used per target:
 *   gfx90a, gfx1100 : global_load_dwordx2 ... glc slc
 *   gfx942, gfx950  : global_load_dwordx2 ... sc0 sc1
 *   gfx1201         : global_load_b64 ... scope:SCOPE_SYS
 * The gfx906 and gfx908 branches are empty, so on those targets (and on any
 * target not listed) the function does nothing and `*assigned_value` is left
 * unchanged.
 *
 * @param assigned_value destination that receives the loaded value
 * @param read_value     address of the 64-bit value to load
 */
__device__ __forceinline__ void refresh_volatile_dwordx2(volatile uint64_t *assigned_value,
volatile uint64_t *read_value) {
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
asm volatile(
"global_load_dwordx2 %0 %1 off sc0 sc1\n "
"s_waitcnt vmcnt(0)"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
#if defined(__gfx1201__)
asm volatile(
"global_load_b64 %0 %1 off scope:SCOPE_SYS \n"
"s_wait_loadcnt 0x0"
: "=v"(*assigned_value)
: "v"(read_value));
#endif
}
/* Ignore the warning about deprecated volatile.
 * The only usage of volatile is to force the compiler to generate
 * the assembly instruction. If volatile is omitted, the compiler
 * will NOT generate the non-temporal load or the waitcnt.
 *
 * uncached_load<T>(src): load a 4- or 8-byte T from `src` with an explicit
 * load instruction (dword/dwordx2 with glc slc on gfx90a/gfx1100, with
 * sc0 sc1 on gfx942/gfx950, b32/b64 with scope:SCOPE_SYS on gfx1201) and
 * wait for it to complete before returning the value.
 *
 * NOTE(review): `ret` is only written inside the architecture-specific
 * branches of the 4- and 8-byte cases. For any other sizeof(T), or on a
 * target whose branch is empty (gfx906, gfx908) or absent, the function
 * returns `ret` without having assigned it.
 */
// clang-format off
NOWARN(-Wdeprecated-volatile,
template <typename T> __device__ __forceinline__ T uncached_load(T* src) {
T ret;
switch (sizeof(T)) {
case 4:
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
asm volatile(
"global_load_dword %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
asm volatile(
"global_load_dword %0 %1 off sc0 sc1 \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx1201__)
asm volatile(
"global_load_b32 %0 %1 off scope:SCOPE_SYS \n"
"s_wait_loadcnt 0x0"
: "=v"(ret)
: "v"(src));
#endif
break;
case 8:
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
asm volatile(
"global_load_dwordx2 %0 %1 off sc0 sc1 \n"
"s_waitcnt vmcnt(0)"
: "=v"(ret)
: "v"(src));
#endif
#if defined(__gfx1201__)
asm volatile(
"global_load_b64 %0 %1 off scope:SCOPE_SYS \n"
"s_wait_loadcnt 0x0"
: "=v"(ret)
: "v"(src));
#endif
break;
default:
break;
}
return ret;
}
)
// clang-format on
/**
 * Flush hook. Every architecture branch is currently empty: the write-back
 * instructions for gfx90a, gfx942 and gfx950 are commented out, and nothing
 * is emitted when USE_HDP_FLUSH is defined. The function therefore has no
 * effect on any target.
 */
__device__ __forceinline__ void __roc_flush() {
#if !defined(USE_HDP_FLUSH)
#if defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
  // asm volatile("s_dcache_wb;");
  // asm volatile("buffer_wbl2;");
#endif
#endif
}
/**
 * Store `size` bytes read from `val` to `dst` with an explicit flat store
 * instruction carrying the architecture's cache-control modifiers
 * (glc slc on gfx90a/gfx1100, sc0 sc1 on gfx942/gfx950, scope:SCOPE_SYS on
 * gfx1201).
 *
 * Only sizes 2, 4 and 8 are handled; any other size is ignored. The gfx906
 * and gfx908 branches are empty, so nothing is stored on those targets.
 *
 * @param val  address of the value to store (read as int16/int32/int64)
 * @param dst  destination address
 * @param size number of bytes to store: 2, 4 or 8
 */
__device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
                                          int size) {
  switch (size) {
    case 2: {
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
      int16_t val16{*(reinterpret_cast<int16_t*>(val))};
      asm volatile("flat_store_short %0 %1 glc slc" : : "v"(dst), "v"(val16));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
      int16_t val16{*(reinterpret_cast<int16_t*>(val))};
      asm volatile("flat_store_short %0 %1 sc0 sc1" : : "v"(dst), "v"(val16));
#endif
#if defined(__gfx1100__)
      // The asm operand is a 32-bit value on this target, but the source
      // holds only two bytes: read 16 bits and widen, rather than reading
      // four bytes through an int32_t* and running past the source.
      int32_t val32{*(reinterpret_cast<int16_t*>(val))};
      asm volatile("flat_store_short %0 %1 glc slc" : : "v"(dst), "v"(val32));
#endif
#if defined(__gfx1201__)
      // Same as above: read exactly two bytes from the source, then widen.
      int32_t val32{*(reinterpret_cast<int16_t*>(val))};
      asm volatile("flat_store_b16 %0 %1 scope:SCOPE_SYS" : : "v"(dst), "v"(val32));
#endif
      break;
    }
    case 4: {
      int32_t val32{*(reinterpret_cast<int32_t*>(val))};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
      asm volatile("flat_store_dword %0 %1 glc slc" : : "v"(dst), "v"(val32));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
      asm volatile("flat_store_dword %0 %1 sc0 sc1" : : "v"(dst), "v"(val32));
#endif
#if defined(__gfx1201__)
      asm volatile("flat_store_b32 %0 %1 scope:SCOPE_SYS" : : "v"(dst), "v"(val32));
#endif
      break;
    }
    case 8: {
      int64_t val64{*(reinterpret_cast<int64_t*>(val))};
#if defined(__gfx906__)
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__) || defined(__gfx1100__)
      asm volatile("flat_store_dwordx2 %0 %1 glc slc" : : "v"(dst), "v"(val64));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
      asm volatile("flat_store_dwordx2 %0 %1 sc0 sc1" : : "v"(dst), "v"(val64));
#endif
#if defined(__gfx1201__)
      asm volatile("flat_store_b64 %0 %1 scope:SCOPE_SYS" : : "v"(dst), "v"(val64));
#endif
      break;
    }
    default:
      break;
  }
}
} // namespace rocshmem
#endif // LIBRARY_SRC_ASSEMBLY_HPP_
+143
Visa fil
@@ -0,0 +1,143 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_ATOMIC_HPP
#define LIBRARY_SRC_ATOMIC_HPP
#include <hip/hip_runtime.h>
namespace rocshmem {
namespace detail {
namespace atomic {
/** Memory scopes, each equal to the corresponding __HIP_MEMORY_SCOPE_* value. */
enum rocshmem_memory_scope {
  memory_scope_thread = __HIP_MEMORY_SCOPE_SINGLETHREAD,
  memory_scope_wavefront = __HIP_MEMORY_SCOPE_WAVEFRONT,
  memory_scope_workgroup = __HIP_MEMORY_SCOPE_WORKGROUP,
  memory_scope_agent = __HIP_MEMORY_SCOPE_AGENT,
  memory_scope_system = __HIP_MEMORY_SCOPE_SYSTEM,
};
/** Memory orderings, each equal to the corresponding __ATOMIC_* value. */
enum rocshmem_memory_order {
  memory_order_relaxed = __ATOMIC_RELAXED,
  memory_order_consume = __ATOMIC_CONSUME,
  memory_order_acquire = __ATOMIC_ACQUIRE,
  memory_order_release = __ATOMIC_RELEASE,
  memory_order_acq_rel = __ATOMIC_ACQ_REL,
  memory_order_seq_cst = __ATOMIC_SEQ_CST
};
/**
 * Bundle of memory orderings handed to the atomic wrappers in this header.
 * Each wrapper reads the member that matches its operation.
 *
 * The compare-exchange failure orderings default to acquire: a failed
 * compare-exchange performs only a load, so release and acq_rel are not
 * valid failure orderings.
 */
struct rocshmem_memory_orders {
  rocshmem_memory_order load {memory_order_acquire};
  rocshmem_memory_order store {memory_order_release};
  rocshmem_memory_order atomic {memory_order_acq_rel};
  rocshmem_memory_order weak_cas_success {memory_order_acq_rel};
  rocshmem_memory_order weak_cas_failure {memory_order_acquire};
  rocshmem_memory_order strong_cas_success {memory_order_acq_rel};
  rocshmem_memory_order strong_cas_failure {memory_order_acquire};
};
/**
 * Atomically load `*address` at scope `s` using the ordering `o.load`.
 *
 * @return the value read
 */
template <typename T, rocshmem_memory_scope s>
__host__ __device__
T load(const T* address, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.load;
  return __hip_atomic_load(address, order, s);
}
/** Atomically store `value` to `*address` at scope `s` using the ordering `o.store`. */
template <typename T, rocshmem_memory_scope s>
__host__ __device__
void store(T* address, const T value, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.store;
  __hip_atomic_store(address, value, order, s);
}
/**
 * Wrapper around __hip_atomic_compare_exchange_weak that takes the success
 * and failure orderings from `o.weak_cas_success` / `o.weak_cas_failure` and
 * the scope from `s`.
 *
 * NOTE(review): the call below forwards only `expected`, `desired`, the two
 * orderings and the scope. No pointer to the atomic object is passed, and the
 * signature has no parameter that could supply one; `expected` is also passed
 * as a reference rather than an address. Confirm against the builtin's
 * argument list before instantiating this template.
 */
template <typename T, rocshmem_memory_scope s>
__host__ __device__
bool compare_exchange_weak(T& expected, T desired, rocshmem_memory_orders o) {
return __hip_atomic_compare_exchange_weak(expected, desired, o.weak_cas_success, o.weak_cas_failure, s);
}
/**
 * Wrapper around __hip_atomic_compare_exchange_strong that takes the success
 * and failure orderings from `o.strong_cas_success` / `o.strong_cas_failure`
 * and the scope from `s`.
 *
 * NOTE(review): the call below forwards only `expected`, `desired`, the two
 * orderings and the scope. No pointer to the atomic object is passed, and the
 * signature has no parameter that could supply one; `expected` is also passed
 * as a reference rather than an address. Confirm against the builtin's
 * argument list before instantiating this template.
 */
template <typename T, rocshmem_memory_scope s>
__host__ __device__
bool compare_exchange_strong(T& expected, T desired, rocshmem_memory_orders o) {
return __hip_atomic_compare_exchange_strong(expected, desired, o.strong_cas_success, o.strong_cas_failure, s);
}
/**
 * Atomically add `arg` to `*obj` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_add
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_add(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_add(obj, arg, order, s);
}
/**
 * Atomically subtract `arg` from `*obj` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_sub
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_sub(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_sub(obj, arg, order, s);
}
/**
 * Atomically AND `arg` into `*obj` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_and
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_and(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_and(obj, arg, order, s);
}
/**
 * Atomically OR `arg` into `*obj` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_or
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_or(T* obj, U arg, rocshmem_memory_orders o) {
  // Pass the ordering member, like the other fetch_* wrappers in this
  // header do; the whole rocshmem_memory_orders struct is not a valid
  // memory-order argument.
  return __hip_atomic_fetch_or(obj, arg, o.atomic, s);
}
/**
 * Atomically XOR `arg` into `*obj` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_xor
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_xor(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_xor(obj, arg, order, s);
}
/**
 * Atomic max of `*obj` and `arg` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_max
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_max(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_max(obj, arg, order, s);
}
/**
 * Atomic min of `*obj` and `arg` at scope `s` with ordering `o.atomic`.
 *
 * @return the result of __hip_atomic_fetch_min
 */
template <class T, class U, rocshmem_memory_scope s>
__host__ __device__
T fetch_min(T* obj, U arg, rocshmem_memory_orders o) {
  const rocshmem_memory_order order = o.atomic;
  return __hip_atomic_fetch_min(obj, arg, order, s);
}
/**
 * Issue the thread fence that matches scope `s`, selected at compile time:
 *   memory_scope_workgroup -> __threadfence_block()
 *   memory_scope_agent     -> __threadfence()
 *   memory_scope_system    -> __threadfence_system()
 * For any other scope no fence is issued.
 */
template <rocshmem_memory_scope s>
__device__
void threadfence() {
  if constexpr (s == memory_scope_workgroup) {
    __threadfence_block();
  }
  if constexpr (s == memory_scope_agent) {
    __threadfence();
  }
  if constexpr (s == memory_scope_system) {
    __threadfence_system();
  }
}
} // namespace atomic
} // namespace detail
} // namespace rocshmem
#endif // LIBRARY_SRC_ATOMIC_HPP
+86
Visa fil
@@ -0,0 +1,86 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "atomic_return.hpp"
#include <cassert>
#include "constants.hpp"
namespace rocshmem {
/**
 * Allocate the atomic-return control struct together with the region it
 * points at, zero the region, and hand the struct back through `atomic_ret`.
 *
 * The region holds max_nb_atomic * num_wg 64-bit slots. It is allocated with
 * hipDeviceMallocUncached when HIP_SUPPORTS_MALLOC_UNCACHED is defined and
 * with hipDeviceMallocFinegrained otherwise.
 *
 * @param atomic_ret receives the address of the newly allocated struct
 * @param num_wg     number of workgroups the region is sized for
 */
void allocate_atomic_region(atomic_ret_t** atomic_ret, int num_wg) {
#ifdef HIP_SUPPORTS_MALLOC_UNCACHED
  const unsigned int region_flags = hipDeviceMallocUncached;
#else
  const unsigned int region_flags = hipDeviceMallocFinegrained;
#endif
  size_t region_bytes{max_nb_atomic * num_wg * sizeof(uint64_t)};

  /* Control struct, allocated with hipMalloc. */
  atomic_ret_t* control{nullptr};
  CHECK_HIP(
      hipMalloc(reinterpret_cast<void**>(&control), sizeof(atomic_ret_t)));

  /* Region itself; its address is stored in control->atomic_base_ptr. */
  void** region_slot{reinterpret_cast<void**>(&control->atomic_base_ptr)};
  CHECK_HIP(hipExtMallocWithFlags(region_slot, region_bytes, region_flags));

  /* Clear every slot before publishing the struct to the caller. */
  CHECK_HIP(hipMemset(control->atomic_base_ptr, 0, region_bytes));

  *atomic_ret = control;
}
/**
 * Reserve sizeof(int64_t) * MAX_WG_SIZE * num_wg bytes on the symmetric heap,
 * return the block through `g_ret`, and then wait on a barrier over
 * `thread_comm`.
 *
 * @param heap_handle symmetric heap to allocate from
 * @param thread_comm communicator used for the closing barrier
 * @param num_wg      number of workgroups the block is sized for
 * @param g_ret       receives the start of the allocated block
 */
void init_g_ret(SymmetricHeap* heap_handle, MPI_Comm thread_comm, int num_wg,
                char** g_ret) {
  size_t bytes_needed{sizeof(int64_t) * MAX_WG_SIZE * num_wg};

  /* Carve the block out of the symmetric heap. */
  void* heap_block{nullptr};
  heap_handle->malloc(&heap_block, bytes_needed);
  assert(heap_block);

  *g_ret = static_cast<char*>(heap_block);

  /* Do not continue until every processing element has reached this point. */
  mpilib_ftable_.Barrier(thread_comm);
}
} // namespace rocshmem
+50
Visa fil
@@ -0,0 +1,50 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_ATOMIC_RETURN_HPP_
#define LIBRARY_SRC_ATOMIC_RETURN_HPP_
#include <hip/hip_runtime.h>
#include "memory/symmetric_heap.hpp"
#include "util.hpp"
namespace rocshmem {
/** Multiplier used when sizing the atomic return region (slots per workgroup). */
const int max_nb_atomic{4096};

/** Control structure for the atomic return region. */
struct atomic_ret_t {
  /* Start of the atomic return region. */
  uint64_t* atomic_base_ptr;
  /* NOTE(review): presumably the local memory key for the region -- confirm. */
  uint32_t atomic_lkey;
  /* NOTE(review): usage not visible in this header -- confirm. */
  uint64_t atomic_counter;
};

/** Allocate and zero the atomic return region; result is stored in *atomic_ret. */
void allocate_atomic_region(atomic_ret_t** atomic_ret, int num_wg);

/** Allocate the g_ret buffer on the symmetric heap; result is stored in *g_ret. */
void init_g_ret(SymmetricHeap* heap_handle, MPI_Comm thread_comm, int num_wg,
                char** g_ret);
} // namespace rocshmem
#endif // LIBRARY_SRC_ATOMIC_RETURN_HPP_
+301
Visa fil
@@ -0,0 +1,301 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "backend_bc.hpp"
#include "backend_type.hpp"
#include "context_incl.hpp"
#if defined(USE_GDA)
#include "gda/backend_gda.hpp"
#endif
#if defined(USE_RO)
#include "reverse_offload/backend_ro.hpp"
#endif
#if defined(USE_IPC)
#include "ipc/backend_ipc.hpp"
#endif
#include <cassert>
namespace rocshmem {
/*
 * Abort the process with a message on stderr when the MPI call `cmd` does
 * not return MPI_SUCCESS.
 *
 * `cmd` is parenthesized so that any expression can be passed, and the body
 * is wrapped in do { } while (0) so that the macro behaves like a single
 * statement (e.g. directly before an `else`).
 */
#define NET_CHECK(cmd)                                       \
  do {                                                       \
    if ((cmd) != MPI_SUCCESS) {                              \
      fprintf(stderr, "Unrecoverable error: MPI Failure\n"); \
      abort();                                               \
    }                                                        \
  } while (0)
/**
 * Construct a Backend on top of an MPI communicator: the heap is built from
 * `comm`, the common device-side setup is run, and the communicator-derived
 * state is initialized.
 */
Backend::Backend(MPI_Comm comm) : heap(comm, nullptr) {
  init();
  init_mpi_once(comm);

  /* Write 0 to the done_init flag allocated by init(). */
  done_init[0] = 0;
}
/**
 * Construct a Backend on top of a TCP bootstrap instead of MPI: the heap is
 * built with MPI_COMM_NULL, the common device-side setup is run, and the PE
 * rank and count are taken from `bootstrap`.
 */
Backend::Backend(TcpBootstrap* bootstrap) : heap(MPI_COMM_NULL, bootstrap) {
  init();

  /* No MPI communicator in this mode; remember the bootstrap instead. */
  backend_comm = MPI_COMM_NULL;
  backend_bootstr = bootstrap;

  my_pe = bootstrap->getRank();
  num_pes = bootstrap->getNranks();

  /* Write 0 to the done_init flag allocated by init(). */
  done_init[0] = 0;
}
void Backend::init(void) {
CHECK_HIP(hipGetDevice(&hip_dev_id));
int num_cus{};
CHECK_HIP(hipDeviceGetAttribute(&num_cus, hipDeviceAttributeMultiprocessorCount, hip_dev_id));
/*
* Initialize 'print_lock' global and copy to the device memory space.
*/
CHECK_HIP(hipMalloc(&print_lock, sizeof(*print_lock)));
*print_lock = 0;
int* print_lock_addr{nullptr};
CHECK_HIP(hipGetSymbolAddress(reinterpret_cast<void**>(&print_lock_addr),
HIP_SYMBOL(print_lock)));
CHECK_HIP(hipMemcpy(print_lock_addr, &print_lock, sizeof(print_lock),
hipMemcpyDefault));
/*
* Copy this Backend object to 'backend_device_proxy' global in the
* device memory space to provide a device-side handle to Backend.
*/
int* device_backend_proxy_addr{nullptr};
CHECK_HIP(
hipGetSymbolAddress(reinterpret_cast<void**>(&device_backend_proxy_addr),
HIP_SYMBOL(device_backend_proxy)));
Backend* this_temp_addr{this};
CHECK_HIP(hipMemcpy(device_backend_proxy_addr, &this_temp_addr, sizeof(this),
hipMemcpyDefault));
CHECK_HIP(
hipHostMalloc(reinterpret_cast<void**>(&done_init), sizeof(uint8_t)));
}
/**
 * Duplicate `comm` (MPI_COMM_WORLD when `comm` is MPI_COMM_NULL) into
 * backend_comm and read this PE's rank and the PE count from the duplicate.
 */
void Backend::init_mpi_once(MPI_Comm comm) {
  if (comm == MPI_COMM_NULL) {
    comm = MPI_COMM_WORLD;
  }

  NET_CHECK(mpilib_ftable_.Comm_dup(comm, &backend_comm));

  /* Both queries go to the duplicated communicator, not the original. */
  NET_CHECK(mpilib_ftable_.Comm_size(backend_comm, &num_pes));
  NET_CHECK(mpilib_ftable_.Comm_rank(backend_comm, &my_pe));
}
/**
 * Remember `ctx` so that destroy_remaining_ctxs() can destroy it later.
 *
 * TODO: Don't track CTX_PRIVATE when we support it
 * since destroying CTX_PRIVATE is the user's
 * responsibility.
 */
void Backend::track_ctx(Context* ctx) {
  list_of_ctxs.emplace_back(ctx);
}
/**
 * @brief Removes @p ctx from the list of tracked contexts.
 *
 * Passing a context that is not tracked is a caller error: debug builds
 * trip the assertion. Builds that compile assertions out leave the list
 * untouched instead of erasing the end iterator (undefined behavior).
 */
void Backend::untrack_ctx(Context* ctx) {
  /* Locate this ctx in the vector */
  const auto pos = std::find(list_of_ctxs.begin(), list_of_ctxs.end(), ctx);
  assert(pos != list_of_ctxs.end());
  if (pos == list_of_ctxs.end()) {
    return;  // only reachable when assertions are disabled
  }

  /* Remove the ctx from the vector */
  list_of_ctxs.erase(pos);
}
/**
 * @brief Destroys every context still present in the tracking list.
 *
 * Contexts are handed to ctx_destroy() starting from the back of the list;
 * each entry is popped after its ctx_destroy() call returns.
 */
void Backend::destroy_remaining_ctxs() {
  for (; !list_of_ctxs.empty(); list_of_ctxs.pop_back()) {
    ctx_destroy(list_of_ctxs.back());
  }
}
/**
 * @brief Releases resources acquired by the base class.
 *
 * Frees the 'print_lock' allocation and, when a communicator is held
 * (i.e. backend_comm is not MPI_COMM_NULL), frees that communicator.
 *
 * NOTE(review): the 'done_init' flag allocated with hipHostMalloc in init()
 * is not released here - confirm whether a derived class frees it.
 */
Backend::~Backend() {
  CHECK_HIP(hipFree(print_lock));

  const bool holds_comm = (backend_comm != MPI_COMM_NULL);
  if (holds_comm) {
    NET_CHECK(mpilib_ftable_.Comm_free(&backend_comm));
  }
}
/**
* @brief Prints the API invocation counters of this PE to stdout.
*
* Output order: the PE id, the device-side counters (globalStats), the
* host-side counters (globalHostStats), and finally whatever the derived
* class prints from dump_backend_stats().
*/
void Backend::dump_stats() {
printf("PE %d\n", my_pe);
/* ---- Device-side counters ---- */
const auto& device_stats{globalStats};
printf("DEVICE STATS\n");
printf("Puts (Blocking/P/Nbi) %llu/%llu/%llu\n",
device_stats.getStat(NUM_PUT), device_stats.getStat(NUM_P),
device_stats.getStat(NUM_PUT_NBI));
printf("WG_Puts (Blocking/Nbi) %llu/%llu\n", device_stats.getStat(NUM_PUT_WG),
device_stats.getStat(NUM_PUT_NBI_WG));
printf("WAVE_Puts (Blocking/Nbi) %llu/%llu\n",
device_stats.getStat(NUM_PUT_WAVE),
device_stats.getStat(NUM_PUT_NBI_WAVE));
printf("Gets (Blocking/G/Nbi) %llu/%llu/%llu\n",
device_stats.getStat(NUM_GET), device_stats.getStat(NUM_G),
device_stats.getStat(NUM_GET_NBI));
printf("WG_Gets (Blocking/Nbi) %llu/%llu\n", device_stats.getStat(NUM_GET_WG),
device_stats.getStat(NUM_GET_NBI_WG));
printf("WAVE_Gets (Blocking/Nbi) %llu/%llu\n",
device_stats.getStat(NUM_GET_WAVE),
device_stats.getStat(NUM_GET_NBI_WAVE));
printf("Fences %llu\n", device_stats.getStat(NUM_FENCE));
printf("Quiets %llu\n", device_stats.getStat(NUM_QUIET));
printf("PE Quiets %llu\n", device_stats.getStat(NUM_PE_QUIET));
printf("ToAll %llu\n", device_stats.getStat(NUM_TO_ALL));
printf("BarrierAll %llu\n", device_stats.getStat(NUM_BARRIER_ALL));
printf("WAVE_BarrierAll %llu\n", device_stats.getStat(NUM_BARRIER_ALL_WAVE));
printf("WG_BarrierAll %llu\n", device_stats.getStat(NUM_BARRIER_ALL_WG));
printf("Barrier %llu\n", device_stats.getStat(NUM_BARRIER));
printf("WAVE_Barrier %llu\n", device_stats.getStat(NUM_BARRIER_WAVE));
printf("WG_Barrier %llu\n", device_stats.getStat(NUM_BARRIER_WG));
printf("Wait Until %llu\n", device_stats.getStat(NUM_WAIT_UNTIL));
printf("Wait Until Any %llu\n", device_stats.getStat(NUM_WAIT_UNTIL_ANY));
printf("Wait Until All %llu\n", device_stats.getStat(NUM_WAIT_UNTIL_ALL));
printf("Wait Until Some %llu\n", device_stats.getStat(NUM_WAIT_UNTIL_SOME));
printf("Wait Until All Vector %llu\n",
device_stats.getStat(NUM_WAIT_UNTIL_ALL_VECTOR));
printf("Wait Until Any Vector %llu\n",
device_stats.getStat(NUM_WAIT_UNTIL_ANY_VECTOR));
printf("Wait Until Some Vector %llu\n",
device_stats.getStat(NUM_WAIT_UNTIL_SOME_VECTOR));
printf("Finalizes %llu\n", device_stats.getStat(NUM_FINALIZE));
printf("Coalesced %llu\n", device_stats.getStat(NUM_MSG_COAL));
printf("Atomic_FAdd %llu\n", device_stats.getStat(NUM_ATOMIC_FADD));
printf("Atomic_FCswap %llu\n", device_stats.getStat(NUM_ATOMIC_FCSWAP));
printf("Atomic_FInc %llu\n", device_stats.getStat(NUM_ATOMIC_FINC));
printf("Atomic_Fetch %llu\n", device_stats.getStat(NUM_ATOMIC_FETCH));
printf("Atomic_Add %llu\n", device_stats.getStat(NUM_ATOMIC_ADD));
printf("Atomic_Set %llu\n", device_stats.getStat(NUM_ATOMIC_SET));
printf("Atomic_Cswap %llu\n", device_stats.getStat(NUM_ATOMIC_CSWAP));
printf("Atomic_Inc %llu\n", device_stats.getStat(NUM_ATOMIC_INC));
printf("Tests %llu\n", device_stats.getStat(NUM_TEST));
printf("SHMEM_PTR %llu\n", device_stats.getStat(NUM_SHMEM_PTR));
printf("SyncAll %llu\n", device_stats.getStat(NUM_SYNC_ALL));
printf("WAVE_SyncAll %llu\n", device_stats.getStat(NUM_SYNC_ALL_WAVE));
printf("WG_SyncAll %llu\n", device_stats.getStat(NUM_SYNC_ALL_WG));
printf("Sync %llu\n", device_stats.getStat(NUM_SYNC));
printf("WAVE_Sync %llu\n", device_stats.getStat(NUM_SYNC_WAVE));
printf("WG_Sync %llu\n", device_stats.getStat(NUM_SYNC_WG));
/* ---- Host-side counters ---- */
const auto& host_stats{globalHostStats};
printf("HOST STATS\n");
printf("Puts (Blocking/P/Nbi) %llu/%llu/%llu\n",
host_stats.getStat(NUM_HOST_PUT), host_stats.getStat(NUM_HOST_P),
host_stats.getStat(NUM_HOST_PUT_NBI));
/* Unlike the other rows, this one wraps the values in parentheses. */
printf("Gets (Blocking/G/Nbi) (%llu/%llu/%llu)\n",
host_stats.getStat(NUM_HOST_GET), host_stats.getStat(NUM_HOST_G),
host_stats.getStat(NUM_HOST_GET_NBI));
printf("Fences %llu\n", host_stats.getStat(NUM_HOST_FENCE));
printf("Quiets %llu\n", host_stats.getStat(NUM_HOST_QUIET));
printf("ToAll %llu\n", host_stats.getStat(NUM_HOST_TO_ALL));
printf("BarrierAll %llu\n", host_stats.getStat(NUM_HOST_BARRIER_ALL));
printf("Wait Until %llu\n", host_stats.getStat(NUM_HOST_WAIT_UNTIL));
printf("Wait Until Any %llu\n", host_stats.getStat(NUM_HOST_WAIT_UNTIL_ANY));
printf("Wait Until All %llu\n", host_stats.getStat(NUM_HOST_WAIT_UNTIL_ALL));
printf("Wait Until Some %llu\n",
host_stats.getStat(NUM_HOST_WAIT_UNTIL_SOME));
printf("Wait Until All Vector %llu\n",
host_stats.getStat(NUM_HOST_WAIT_UNTIL_ALL_VECTOR));
printf("Wait Until Any Vector %llu\n",
host_stats.getStat(NUM_HOST_WAIT_UNTIL_ANY_VECTOR));
printf("Wait Until Some Vector %llu\n",
host_stats.getStat(NUM_HOST_WAIT_UNTIL_SOME_VECTOR));
printf("Finalizes %llu\n", host_stats.getStat(NUM_HOST_FINALIZE));
printf("Atomic_FAdd %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_FADD));
printf("Atomic_FCswap %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_FCSWAP));
printf("Atomic_FInc %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_FINC));
printf("Atomic_Fetch %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_FETCH));
printf("Atomic_Add %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_ADD));
/*
* NOTE(review): this is the only host row indexed with a device-side
* enumerator (NUM_ATOMIC_SET) instead of a NUM_HOST_* one. It looks like a
* copy/paste slip that reads an unrelated host counter slot - confirm
* whether a NUM_HOST_ATOMIC_SET enumerator exists and should be used here.
*/
printf("Atomic_Set %llu\n", host_stats.getStat(NUM_ATOMIC_SET));
printf("Atomic_Cswap %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_CSWAP));
printf("Atomic_Inc %llu\n", host_stats.getStat(NUM_HOST_ATOMIC_INC));
printf("Tests %llu\n", host_stats.getStat(NUM_HOST_TEST));
printf("SHMEM_PTR %llu\n", host_stats.getStat(NUM_HOST_SHMEM_PTR));
printf("SyncAll %llu\n", host_stats.getStat(NUM_HOST_SYNC_ALL));
/* Let the derived backend append its own statistics. */
dump_backend_stats();
}
/**
 * @brief Clears every statistics counter owned by the backend.
 *
 * Resets the device-side and host-side counter sets held by the base class
 * and then asks the derived class to reset its own counters through
 * reset_backend_stats().
 */
void Backend::reset_stats() {
  // Base-class counters: device side first, then host side.
  globalStats.resetStats();
  globalHostStats.resetStats();

  // Counters kept by the concrete backend.
  reset_backend_stats();
}
/**
 * @brief Device-side context creation, statically dispatched to the derived
 *        backend.
 *
 * When all of USE_GDA, USE_RO and USE_IPC are defined the target is chosen
 * at run time from 'type' (any value other than GDA_BACKEND / RO_BACKEND is
 * routed to the IPC backend). Otherwise the first of USE_GDA, USE_RO,
 * USE_IPC that is defined fixes the target at compile time.
 *
 * @param[in] option Forwarded unchanged to the derived create_ctx().
 * @param[in] ctx    Forwarded unchanged to the derived create_ctx().
 *
 * @return Whatever the derived create_ctx() returns; false when no backend
 *         was compiled in.
 */
__device__ bool Backend::create_ctx(int64_t option, rocshmem_ctx_t* ctx) {
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
  switch (this->type) {
    case BackendType::GDA_BACKEND:
      return static_cast<GDABackend*>(this)->create_ctx(option, ctx);
    case BackendType::RO_BACKEND:
      return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
    case BackendType::IPC_BACKEND:
    default:
      return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
  }
#elif defined(USE_GDA)
  return static_cast<GDABackend*>(this)->create_ctx(option, ctx);
#elif defined(USE_RO)
  return static_cast<ROBackend*>(this)->create_ctx(option, ctx);
#elif defined(USE_IPC)
  return static_cast<IPCBackend*>(this)->create_ctx(option, ctx);
#else
  // No backend selected at build time: return a definite value instead of
  // flowing off the end of a non-void function (undefined behavior).
  (void)option;
  (void)ctx;
  return false;
#endif
}
/**
 * @brief Device-side context destruction, statically dispatched to the
 *        derived backend.
 *
 * With USE_GDA, USE_RO and USE_IPC all defined, the target is selected from
 * 'type' at run time and every value other than GDA_BACKEND / RO_BACKEND is
 * routed to the IPC backend. Otherwise the first of USE_GDA, USE_RO,
 * USE_IPC that is defined fixes the target at compile time.
 *
 * @param[in] ctx Forwarded unchanged to the derived destroy_ctx().
 */
__device__ void Backend::destroy_ctx(rocshmem_ctx_t* ctx) {
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
  if (this->type == BackendType::GDA_BACKEND) {
    static_cast<GDABackend*>(this)->destroy_ctx(ctx);
  } else if (this->type == BackendType::RO_BACKEND) {
    static_cast<ROBackend*>(this)->destroy_ctx(ctx);
  } else {
    // IPC_BACKEND and any unrecognized value.
    static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
  }
#elif defined(USE_GDA)
  static_cast<GDABackend*>(this)->destroy_ctx(ctx);
#elif defined(USE_RO)
  static_cast<ROBackend*>(this)->destroy_ctx(ctx);
#elif defined(USE_IPC)
  static_cast<IPCBackend*>(this)->destroy_ctx(ctx);
#endif
}
} // namespace rocshmem
+332
Visa fil
@@ -0,0 +1,332 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_BACKEND_BC_HPP_
#define LIBRARY_SRC_BACKEND_BC_HPP_
/**
* @file backend_bc.hpp
* Defines the Backend base class.
*
* The backend base class sets up most of the host-side library resources.
* It is the top-level interface for these resources.
*/
#include <vector>
#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir)
#include "rocshmem/rocshmem.hpp"
#include "mpi_instance.hpp"
#include "backend_type.hpp"
#include "ipc_policy.hpp"
#include "memory/symmetric_heap.hpp"
#include "stats.hpp"
#include "team_tracker.hpp"
#include "bootstrap/bootstrap.hpp"
namespace rocshmem {
class Team;
class TeamInfo;
/**
* @class Backend backend_bc.hpp
* @brief Container class for the persistent state used by the library.
*
* Backend is populated by host-side initialization and allocation calls.
* It uses this state to populate Context objects which the GPU may use to
* perform networking operations.
*
* The rocshmem.cpp implementation file wraps many of the Backend public
* members to implement the library's public API.
*/
class Backend {
public:
friend Context;
/**
* @brief Constructor using an MPI communicator.
*
* @param[in] comm Communicator used to build the symmetric heap and to
* determine the PE id and the number of PEs.
*
* @note Implementation may reduce the number of workgroups if the
* number exceeds hardware limits.
*/
explicit Backend(MPI_Comm comm);
/**
* @brief Constructor using a TCP bootstrap object instead of MPI.
*
* @param[in] bootstrap Bootstrap object that supplies the PE id and the
* number of PEs; the pointer is kept in backend_bootstr.
*/
explicit Backend(TcpBootstrap* bootstrap);
/**
* @brief Destructor.
*/
virtual ~Backend();
/**
* @brief Device-side context creation.
*
* Forwards to the create_ctx() of the derived backend class through a
* static_cast (no virtual call is involved).
*
* @param[in] option Forwarded to the derived backend.
* @param[in] ctx Forwarded to the derived backend.
*
* @return Value returned by the derived backend.
*/
__device__ bool create_ctx(int64_t option, rocshmem_ctx_t* ctx);
/**
* @brief Device-side context destruction.
*
* Forwards to the destroy_ctx() of the derived backend class through a
* static_cast (no virtual call is involved).
*
* @param[in] ctx Forwarded to the derived backend.
*/
__device__ void destroy_ctx(rocshmem_ctx_t* ctx);
/**
* @brief Create a new team object and initialize it.
*
* @param[in] parent_team Pointer to the parent team object.
* @param[in] team_info_wrt_parent TeamInfo object wrt parent team.
* @param[in] team_info_wrt_world TeamInfo object wrt TEAM_WORLD.
* @param[in] num_pes Number of PEs in this team.
* @param[in] my_pe_in_new_team Index of this PE in the new team.
* @param[in] team_comm MPI communicator for this team.
*
* @param[out] new_team pointer to the new team.
*/
virtual void create_new_team(Team* parent_team,
TeamInfo* team_info_wrt_parent,
TeamInfo* team_info_wrt_world, int num_pes,
int my_pe_in_new_team, MPI_Comm team_comm,
rocshmem_team_t* new_team) = 0;
/**
* @brief Destruct a team
*
* @param[in] team Handle to the team to destroy.
*/
virtual void team_destroy(rocshmem_team_t team) = 0;
/**
* @brief Reports processing element number id.
*
* @return Unique numeric identifier for each processing element.
*/
__host__ __device__ int getMyPE() const { return my_pe; }
/**
* @brief Reports number of processing elements.
*
* @return Number of active processing elements tracked by library.
*/
__host__ __device__ int getNumPEs() const { return num_pes; }
/**
* @brief Allocates and initializes device-side library state.
*
* Preallocates a single private context for this workgroup (thread-block)
* and binds it to the WGState instance.
*
* The implementation carves the allocation out of the "extern __shared__"
* partition and then builds the context object in that memory.
*
* @return void
*/
__device__ void create_wg_state();
/**
* @brief Frees device-side library resources.
*
* @return void
*/
__device__ void finalize_wg_state();
/**
* @brief Dumps statistics for public API invocations.
*
* @note Implementation may dump additional statistics from backend
* derived classes when calling this function. If so, the method,
* dump_backend_stats, will be used as the interface for the
* additional statistics.
*/
void dump_stats();
/**
* @brief Resets statistics for public API invocations.
*
* @note Implementation may reset additional statistics from backend
* derived classes when calling this function. If so, the method,
* reset_backend_stats, will be used as the interface for the
* additional statistics.
*/
void reset_stats();
/**
* @brief Abort the application.
*
* @param[in] status Exit code.
*
* @return void.
*
* @note This routine terminates the entire application.
*/
virtual void global_exit(int status) = 0;
/**
* @brief Creates a new OpenSHMEM context.
*
* @param[in] options Options for context creation
* @param[out] ctx Address of the pointer to the new context
*
* @return void.
*
* NOTE(review): an earlier revision of this comment promised "zero on
* success, nonzero otherwise", but the function returns void - confirm how
* failure is meant to be reported (presumably through *ctx).
*/
virtual void ctx_create(int64_t options, void** ctx) = 0;
/**
* @brief Destroys a context.
*
* @param[in] ctx Context handle.
*
* @return void.
*/
virtual void ctx_destroy(Context* ctx) = 0;
/**
* @brief High level device stats that do not depend on backend type.
*/
ROCStats globalStats{};
/**
* @brief High level host stats that do not depend on backend type.
*/
ROCHostStats globalHostStats{};
/**
* @brief Number of processing elements running in job.
*
* @todo Change to size_t.
*/
int num_pes{0};
/**
* @brief Unique numeric identifier ranging from 0 (inclusive) to
* num_pes (exclusive) [0 ... num_pes).
*
* @todo Change to size_t and set invalid entry to max size.
*/
int my_pe{-1};
/**
* @brief Indicates when init is done on the CPU. Non-blocking init is only
* available with GPU-IB.
*
* The flag is a single host-allocated byte; the base-class constructors
* write 0 into it.
*/
uint8_t* done_init{nullptr};
/**
* @brief Communicator owned by the backend.
*
* Holds a duplicate of the communicator passed to the MPI constructor, or
* MPI_COMM_NULL when the backend was built from a TcpBootstrap. The
* destructor frees it when it is not MPI_COMM_NULL.
*
* @todo document where this is used and try to coalesce this into another
* class
*/
MPI_Comm backend_comm;
/**
* @brief Bootstrap object passed to the TcpBootstrap constructor
* (nullptr otherwise).
*
* @todo document where this is used
*/
TcpBootstrap *backend_bootstr{nullptr};
/**
* @brief Object contains the interface and internal data structures
* needed to allocate/free memory on the symmetric heap.
*/
SymmetricHeap heap;
/**
* @brief Determines which device to launch device kernels onto.
*
* Multi-device nodes can specify which one they would like to use.
*/
int hip_dev_id{0};
/**
* @brief Add ctx to the list of user-created ctxs
*/
void track_ctx(Context* ctx);
/**
* @brief Remove ctx from the list of user-created ctxs
*/
void untrack_ctx(Context* ctx);
/**
* @brief Destroy (via ctx_destroy) and remove all ctxs from the list of
* user-created ctxs
*/
void destroy_remaining_ctxs();
/**
* @brief Compile-time configuration policy for intra-node shared memory
* accesses.
*
* The configuration option "USE_IPC" can be enabled to allow shared
* memory accesses to the symmetric heap from processing elements
* co-located on the same node.
*/
IpcImpl ipcImpl{};
/**
* @brief Maintains information about teams
*/
TeamTracker team_tracker{};
/**
* @brief Reports the derived-class tag stored in 'type'.
*/
BackendType get_backend_type() { return type; }
protected:
/**
* @brief Required to support static inheritance for device calls.
*
* The Context DISPATCH implementation requires this member.
* The implementation needs to know the derived class type to
* issue a static_cast.
*
* GPU devices do not support virtual functions. Therefore, we cannot
* rely on the normal inheritance mechanism to tailor behavior for
* derived backend types.
*
* NOTE(review): there is no default initializer and the base-class
* constructors do not assign it - presumably every derived class sets it;
* confirm.
*/
BackendType type;
/**
* @brief Dumps derived class statistics.
*/
virtual void dump_backend_stats() = 0;
/**
* @brief Resets derived class statistics.
*/
virtual void reset_backend_stats() = 0;
private:
/**
* @brief initialization code used by all constructors
*/
void init (void);
/**
* @brief List of ctxs created by the user.
*/
std::vector<Context*> list_of_ctxs{};
/**
* @brief Initialize the MPI-related state.
*
* Duplicates the given communicator into backend_comm and queries
* num_pes and my_pe from it.
*
* Backend relies on MPI to exchange meta data across PEs.
*/
void init_mpi_once(MPI_Comm comm);
};
/**
* @brief Global handle used by the device to access the backend.
*
* Declared in constant memory; the definition lives in another translation
* unit. Host code fills it with the address of the Backend object during
* backend initialization.
*/
extern __constant__ Backend* device_backend_proxy;
} // namespace rocshmem
#endif // LIBRARY_SRC_BACKEND_BC_HPP_
+249
Visa fil
@@ -0,0 +1,249 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_BACKEND_TYPE_HPP_
#define LIBRARY_SRC_BACKEND_TYPE_HPP_
/**
* @file backend_type.hpp
* Defines the Backend derived class types and contains the DISPATCH macros.
*
* The type information is required to be known at compile time because
* we use static dispatch to produce compile time polymorphism.
*
* The device cannot use runtime polymorphism because calls through virtual
* functions are not supported at this time.
*/
#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir)
namespace rocshmem {
/**
* @brief Enumerates the Backend derived classes.
*
* @note Derived classes which use Backend as a base class must add
* themselves to this enum class to support static polymorphism.
*/
//enum class BackendType { GDA_BACKEND, RO_BACKEND, IPC_BACKEND };
/**
* @brief Helper macro for some dispatch calls
*
* Expands to its two arguments separated by a comma, which allows a
* comma-containing token sequence to be passed as one macro argument.
*/
#define PAIR(A, B) A, B
/**
* @brief Device static dispatch method call.
*
* Expands to a call of Func on 'this' after a static_cast to the concrete
* device context type.
*
* - With USE_GDA, USE_RO and USE_IPC all defined, the type is selected at
*   run time from this->btype; any value other than GDA_BACKEND or
*   RO_BACKEND is routed to IPCContext (shared with the default label).
* - Otherwise the first of USE_GDA, USE_RO, USE_IPC that is defined selects
*   the type at compile time.
* - If none of them is defined, the macro is left undefined.
*
* The expansion is already a complete statement (a switch block or an
* expression ending in ';') and is not wrapped in do { } while (0).
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define DISPATCH(Func) \
switch(this->btype) { \
case BackendType::GDA_BACKEND: \
static_cast<GDAContext *>(this)->Func; \
break; \
case BackendType::RO_BACKEND: \
static_cast<ROContext *>(this)->Func; \
break; \
case BackendType::IPC_BACKEND: \
default: \
static_cast<IPCContext *>(this)->Func; \
break; \
}
#elif defined(USE_GDA)
#define DISPATCH(Func) \
static_cast<GDAContext *>(this)->Func;
#elif defined(USE_RO)
#define DISPATCH(Func) \
static_cast<ROContext *>(this)->Func;
#elif defined(USE_IPC)
#define DISPATCH(Func) \
static_cast<IPCContext *>(this)->Func;
#endif
/**
* @brief Device static dispatch method call with a return value.
*
* Same backend selection as DISPATCH, but the result of Func is stored in
* an 'auto' local and returned. Every path of the expansion ends in a
* 'return' statement, so nothing placed after the macro in the enclosing
* function is executed.
*
* If none of USE_GDA, USE_RO, USE_IPC is defined, the macro is left
* undefined.
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define DISPATCH_RET(Func) \
if (this->btype == BackendType::GDA_BACKEND) { \
auto ret1 = static_cast<GDAContext *>(this)->Func; \
return ret1; \
} else if(this->btype == BackendType::RO_BACKEND) { \
auto ret2 = static_cast<ROContext *>(this)->Func; \
return ret2; \
} else { \
auto ret3 = static_cast<IPCContext *>(this)->Func; \
return ret3; \
}
#elif defined(USE_GDA)
#define DISPATCH_RET(Func) \
auto ret_val = static_cast<GDAContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#define DISPATCH_RET(Func) \
auto ret_val = static_cast<ROContext *>(this)->Func; \
return ret_val;
#elif defined(USE_IPC)
#define DISPATCH_RET(Func) \
auto ret_val = static_cast<IPCContext *>(this)->Func; \
return ret_val;
#endif
/**
* @brief Device static dispatch method call with a return type of pointer.
*
* Same backend selection as DISPATCH. The result of Func is assigned to a
* local 'void *ret_val' (initialized to nullptr) which is then returned, so
* the expansion declares a name 'ret_val' in the enclosing scope and ends
* the enclosing function.
*
* If none of USE_GDA, USE_RO, USE_IPC is defined, the macro is left
* undefined.
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
switch(this->btype) { \
case BackendType::GDA_BACKEND: \
ret_val = static_cast<GDAContext *>(this)->Func; \
break; \
case BackendType::RO_BACKEND: \
ret_val = static_cast<ROContext *>(this)->Func; \
break; \
case BackendType::IPC_BACKEND: \
default: \
ret_val = static_cast<IPCContext *>(this)->Func; \
break; \
} \
return ret_val;
#elif defined(USE_GDA)
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<GDAContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<ROContext *>(this)->Func; \
return ret_val;
#elif defined(USE_IPC)
#define DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<IPCContext *>(this)->Func; \
return ret_val;
#endif
/**
* @brief Host static dispatch method call.
*
* Host counterpart of DISPATCH: expands to a call of Func on 'this' after a
* static_cast to GDAHostContext, ROHostContext or IPCHostContext. With all
* three of USE_GDA, USE_RO and USE_IPC defined the type is chosen from
* this->btype (unrecognized values go to IPCHostContext); otherwise the
* first defined of USE_GDA, USE_RO, USE_IPC decides. If none is defined,
* the macro is left undefined.
*
* @note There is no need to lock-unlock on host since we are using
* MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
* threading semantics of collectives in OpenSHMEM match those of MPI.
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define HOST_DISPATCH(Func) \
switch(this->btype) { \
case BackendType::GDA_BACKEND: \
static_cast<GDAHostContext *>(this)->Func; \
break; \
case BackendType::RO_BACKEND: \
static_cast<ROHostContext *>(this)->Func; \
break; \
case BackendType::IPC_BACKEND: \
default: \
static_cast<IPCHostContext *>(this)->Func; \
break; \
}
#elif defined(USE_GDA)
#define HOST_DISPATCH(Func) static_cast<GDAHostContext *>(this)->Func;
#elif defined(USE_RO)
#define HOST_DISPATCH(Func) static_cast<ROHostContext *>(this)->Func;
#elif defined(USE_IPC)
#define HOST_DISPATCH(Func) static_cast<IPCHostContext *>(this)->Func;
#endif
/**
* @brief Host static dispatch method call with return value.
*
* Host counterpart of DISPATCH_RET: the result of Func is stored in an
* 'auto' local and returned, so every path of the expansion ends the
* enclosing function. If none of USE_GDA, USE_RO, USE_IPC is defined, the
* macro is left undefined.
*
* @note There is no need to lock-unlock on host since we are using
* MPI_THREAD_MULTIPLE (for RMA and AMO operations) and the ordering and
* threading semantics of collectives in OpenSHMEM match those of MPI.
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define HOST_DISPATCH_RET(Func) \
if (this->btype == BackendType::GDA_BACKEND) { \
auto ret1 = static_cast<GDAHostContext *>(this)->Func; \
return ret1; \
} else if (this->btype == BackendType::RO_BACKEND) { \
auto ret2 = static_cast<ROHostContext *>(this)->Func; \
return ret2; \
} else { \
auto ret3 = static_cast<IPCHostContext *>(this)->Func; \
return ret3; \
}
#elif defined(USE_GDA)
#define HOST_DISPATCH_RET(Func) \
auto ret_val = static_cast<GDAHostContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#define HOST_DISPATCH_RET(Func) \
auto ret_val = static_cast<ROHostContext *>(this)->Func; \
return ret_val;
#elif defined(USE_IPC)
#define HOST_DISPATCH_RET(Func) \
auto ret_val = static_cast<IPCHostContext *>(this)->Func; \
return ret_val;
#endif
/**
* @brief Host static dispatch method call with a return type of pointer.
*
* Host counterpart of DISPATCH_RET_PTR: the result of Func is assigned to a
* local 'void *ret_val' (initialized to nullptr) which is then returned, so
* the expansion declares a name 'ret_val' in the enclosing scope and ends
* the enclosing function. If none of USE_GDA, USE_RO, USE_IPC is defined,
* the macro is left undefined.
*/
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
#define HOST_DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
switch(this->btype) { \
case BackendType::GDA_BACKEND: \
ret_val = static_cast<GDAHostContext *>(this)->Func; \
break; \
case BackendType::RO_BACKEND: \
ret_val = static_cast<ROHostContext *>(this)->Func; \
break; \
case BackendType::IPC_BACKEND: \
default: \
ret_val = static_cast<IPCHostContext *>(this)->Func; \
break; \
} \
return ret_val;
#elif defined(USE_GDA)
#define HOST_DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<GDAHostContext *>(this)->Func; \
return ret_val;
#elif defined(USE_RO)
#define HOST_DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<ROHostContext *>(this)->Func; \
return ret_val;
#elif defined(USE_IPC)
#define HOST_DISPATCH_RET_PTR(Func) \
void *ret_val{nullptr}; \
ret_val = static_cast<IPCHostContext *>(this)->Func; \
return ret_val;
#endif
} // namespace rocshmem
#endif // LIBRARY_SRC_BACKEND_TYPE_HPP_
@@ -0,0 +1,34 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
###############################################################################
# ADD THE SOURCE FILES OF THE CURRENT DIRECTORY TO THE ROCSHMEM TARGET
#
# The files are attached to the ${PROJECT_NAME} target as PRIVATE sources.
###############################################################################
target_sources(
${PROJECT_NAME}
PRIVATE
socket.cpp
bootstrap.cpp
utils.cpp
)
@@ -0,0 +1,702 @@
/******************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include <sys/resource.h>
#include <cstring>
#include <thread>
#include <unordered_map>
#include <vector>
#include "bootstrap.hpp"
#include "utils.hpp"
#include "util.hpp"
#include "socket.hpp"
namespace rocshmem {
/**
 * @brief Raises the soft limit on open file descriptors to the hard limit.
 *
 * Best effort: when either getrlimit() or setrlimit() fails, a debug
 * message is emitted and the limit is left as it was.
 */
static void setFilesLimit() {
  rlimit nofile;

  const bool queried = (getrlimit(RLIMIT_NOFILE, &nofile) == 0);
  if (!queried) {
    DPRINTF("getrlimit failed\n");
    return;
  }

  // Soft limit := hard limit.
  nofile.rlim_cur = nofile.rlim_max;

  const bool applied = (setrlimit(RLIMIT_NOFILE, &nofile) == 0);
  if (!applied) {
    DPRINTF("setrlimit failed\n");
  }
}
/*
 * Socket Interface Selection type.
 *
 * Both enumerators are negative sentinels.
 * NOTE(review): presumably they are passed where a non-negative interface
 * index would otherwise be given (findSubnetIf: pick an interface on the
 * matching subnet, dontCareIf: accept any interface) - confirm against the
 * callers, none of which are visible here.
 */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
/*
 * Per-rank record made of a rank id, the total rank count and two socket
 * addresses.
 * NOTE(review): presumably this is the message each rank sends to the
 * bootstrap root during connection setup - confirm against the code that
 * sends/receives it.
 */
struct ExtInfo {
int rank;                           /* id of the rank described by this record */
int nRanks;                         /* total number of ranks */
SocketAddress extAddressListenRoot; /* listen address, root-facing (see listenSockRoot_ - TODO confirm) */
SocketAddress extAddressListen;     /* listen address, peer-facing (see listenSock_ - TODO confirm) */
};
/**
 * @brief Barrier among the ranks listed in @p ranks.
 *
 * A dummy int is first sent to every listed rank other than the caller
 * (tag 0), then one dummy int is received from each of those ranks (tag 0).
 *
 * @param[in] ranks Ranks taking part in the barrier.
 */
void Bootstrap::groupBarrier(const std::vector<int>& ranks) {
  const int self = this->getRank();
  int token = 0;

  // Phase 1: notify every other participant.
  for (size_t i = 0; i < ranks.size(); ++i) {
    const int peer = ranks[i];
    if (peer == self) {
      continue;
    }
    this->send(static_cast<void*>(&token), sizeof(token), peer, 0);
  }

  // Phase 2: wait for the notification of every other participant.
  for (size_t i = 0; i < ranks.size(); ++i) {
    const int peer = ranks[i];
    if (peer == self) {
      continue;
    }
    this->recv(static_cast<void*>(&token), sizeof(token), peer, 0);
  }
}
/**
 * @brief In-place all-gather among the ranks listed in @p ranks.
 *
 * @p allData is treated as ranks.size() consecutive slices of @p size bytes,
 * slice i belonging to ranks[i]. The slices travel around a ring: in step s
 * the caller sends the slice it obtained in the previous step (its own slice
 * in step 0) to its successor in @p ranks and receives the next slice from
 * its predecessor, using s as the tag.
 *
 * The process aborts when the calling rank is not contained in @p ranks.
 *
 * @param[in,out] allData Buffer of ranks.size() * size bytes.
 * @param[in]     size    Size of one slice in bytes.
 * @param[in]     ranks   Participating ranks.
 */
void Bootstrap::groupAllGather(void* allData, int size, const std::vector<int>& ranks) {
  char* base = static_cast<char*>(allData);
  int rank = this->getRank();
  int nRanks = ranks.size();

  // Position of the calling rank inside 'ranks' (-1: not a member).
  int self_idx = -1;
  for (int idx = 0; idx < nRanks; ++idx) {
    if (ranks[idx] == rank) {
      self_idx = idx;
      break;
    }
  }
  if (self_idx == -1) {
    printf("Bootstrap::groupAllGather: called with process that is not in list of ranks. Aborting\n");
    abort();
  }

  DPRINTF("groupAllGather: rank %d nranks %d size %d\n", rank, nRanks, size);

  // Ring neighbours, expressed as indices into 'ranks'.
  const int next_idx = (self_idx + 1 + nRanks) % nRanks;
  const int prev_idx = (self_idx - 1 + nRanks) % nRanks;

  for (int step = 0; step < nRanks - 1; ++step) {
    // Slice forwarded in this step and slice that arrives in this step.
    const size_t send_slice = (self_idx - step + nRanks) % nRanks;
    const size_t recv_slice = (self_idx - step - 1 + nRanks) % nRanks;

    this->send(base + send_slice * size, size, ranks[next_idx], step);
    this->recv(base + recv_slice * size, size, ranks[prev_idx], step);
  }

  DPRINTF("groupAllGather: rank %d nranks %d size %d - DONE\n", rank, nRanks, size);
}
/**
 * @brief In-place all-to-all among the ranks listed in @p ranks.
 *
 * @p allData is treated as ranks.size() consecutive slices of @p size bytes.
 * Slice i is sent to ranks[i]; afterwards slice i holds the data received
 * from ranks[i]. The slice belonging to the caller itself is left untouched.
 *
 * The exchange is pairwise: in step s the caller sends to the rank s
 * positions ahead of it in @p ranks and receives from the rank s positions
 * behind it, using s as the tag.
 *
 * The process aborts when the calling rank is not contained in @p ranks.
 *
 * @param[in,out] allData Buffer of ranks.size() * size bytes.
 * @param[in]     size    Size of one slice in bytes.
 * @param[in]     ranks   Participating ranks.
 */
void Bootstrap::groupAlltoall(void* allData, int size, const std::vector<int>& ranks) {
  char* data = static_cast<char*>(allData);
  int num_pes = ranks.size();
  int rank = this->getRank();
  int rank_pos = -1;

  // Confirm that rank is in the vector of ranks
  for (int i = 0; i < num_pes; i++) {
    if (rank == ranks[i]) {
      rank_pos = i;
      break;
    }
  }
  if (rank_pos == -1) {
    printf("Bootstrap::groupAlltoall: called with process that is not in list of ranks. Aborting\n");
    abort();
  }

  DPRINTF("groupAlltoall: rank %d nranks %d size %d\n", rank, num_pes, size);

  // Byte offsets are computed in size_t so that size * num_pes cannot
  // overflow int.
  const size_t chunk = static_cast<size_t>(size);

  // Since this is an in-place algorithm, allocate a temporary, zero-filled
  // receive buffer. The vector owns the storage, so it is released on every
  // exit path, including an exception thrown by send()/recv().
  std::vector<char> recv_buf(chunk * static_cast<size_t>(num_pes), 0);

  // Perform pairwise exchange - local copy is omitted
  for (int step = 1; step < num_pes; step++) {
    int sendto = (rank_pos + step) % num_pes;
    int recvfrom = (rank_pos + num_pes - step) % num_pes;

    char* tmpsend = data + static_cast<size_t>(sendto) * chunk;
    char* tmprecv = recv_buf.data() + static_cast<size_t>(recvfrom) * chunk;

    this->send(tmpsend, size, ranks[sendto], step /* used as tag */);
    this->recv(tmprecv, size, ranks[recvfrom], step);
  }

  // Since this is an in-place all-to-all, copy data back into the user buffer
  for (int peer = 0; peer < num_pes; peer++) {
    if (peer == rank_pos || chunk == 0) continue;
    const size_t offset = static_cast<size_t>(peer) * chunk;
    std::memcpy(data + offset, recv_buf.data() + offset, chunk);
  }

  DPRINTF("groupAlltoall: rank %d nranks %d size %d DONE \n", rank, num_pes, size);
}
/**
 * @brief Sends a byte vector to @p peer as two messages.
 *
 * The element count is sent first as a size_t with tag @p tag, followed by
 * the payload with tag @p tag + 1.
 */
void Bootstrap::send(const std::vector<char>& data, int peer, int tag) {
  // Message 1: length header.
  size_t length = data.size();
  send(static_cast<void*>(&length), sizeof(size_t), peer, tag);

  // Message 2: payload. The raw-buffer overload takes a non-const pointer,
  // hence the const_cast.
  void* payload = static_cast<void*>(const_cast<char*>(data.data()));
  send(payload, data.size(), peer, tag + 1);
}
/**
 * @brief Receives a byte vector from @p peer as two messages.
 *
 * The element count is received first as a size_t with tag @p tag; @p data
 * is resized to that count and then filled by a second message with tag
 * @p tag + 1.
 */
void Bootstrap::recv(std::vector<char>& data, int peer, int tag) {
  // Message 1: length header.
  size_t length;
  recv(static_cast<void*>(&length), sizeof(size_t), peer, tag);

  // Message 2: payload, received straight into the resized vector.
  data.resize(length);
  recv(static_cast<void*>(data.data()), data.size(), peer, tag + 1);
}
/*
 * Internal representation of a unique id: a magic number plus a socket
 * address. Objects of this type are copied byte-wise into a
 * rocshmem_uniqueid_t, which is why the static_assert below requires it to
 * be no larger than that type.
 */
struct UniqueIdInternal {
uint64_t magic;            /* filled with random data when a new id is created */
union SocketAddress addr;  /* network address stored in the id */
};
static_assert(sizeof(UniqueIdInternal) <= sizeof(rocshmem_uniqueid_t), "UniqueIdInternal is too large to fit into rocshmem_uniqueid_t");
/*
 * Implementation class of TcpBootstrap (nested as TcpBootstrap::Impl).
 *
 * NOTE(review): the descriptions below are derived from the declarations
 * only; most member function bodies are outside this part of the file.
 */
class TcpBootstrap::Impl {
public:
/* Creates a new unique id from the local network interface address. */
static rocshmem_uniqueid_t createUniqueId();
/* Converts the internal id representation into the public id type. */
static rocshmem_uniqueid_t getUniqueId(const UniqueIdInternal& uniqueId);
/* Stores the rank of this process and the total number of ranks. */
Impl(int rank, int nRanks);
~Impl();
/* Two ways to initialize: from a unique id or from a string descriptor. */
void initialize(const rocshmem_uniqueid_t& uniqueId, int64_t timeoutSec);
void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec);
void establishConnections(int64_t timeoutSec);
rocshmem_uniqueid_t getUniqueId() const;
/* Accessors for the job geometry. */
int getRank();
int getNranks();
int getNranksPerNode();
std::vector<int> getLocalRanks();
/* Communication primitives; 'size' is presumably in bytes - TODO confirm. */
void allGather(void* allData, int size);
void send(void* data, int size, int peer, int tag);
void recv(void* data, int size, int peer, int tag);
void barrier();
void close();
private:
/* NOTE: the declaration order of the data members is also their
 * initialization order in the constructor - keep it in sync with the
 * constructor's initializer list. */
UniqueIdInternal uniqueId_;
int rank_;
int nRanks_;
int nRanksPerNode_;
bool netInitialized;
std::unique_ptr<Socket> listenSockRoot_;
std::unique_ptr<Socket> listenSock_;
std::unique_ptr<Socket> ringRecvSocket_;
std::unique_ptr<Socket> ringSendSocket_;
std::vector<SocketAddress> peerCommAddresses_; /* one entry per rank */
std::vector<int> barrierArr_;                  /* one entry per rank */
std::unique_ptr<uint32_t> abortFlagStorage_;
volatile uint32_t* abortFlag_;
std::thread rootThread_;
SocketAddress netIfAddr_;
/* Socket caches keyed by a pair of ints - presumably (peer, tag), matching
 * the parameters of getPeerSendSocket/getPeerRecvSocket; TODO confirm. */
std::unordered_map<std::pair<int, int>, std::shared_ptr<Socket>, PairHash> peerSendSockets_;
std::unordered_map<std::pair<int, int>, std::shared_ptr<Socket>, PairHash> peerRecvSockets_;
std::vector<int> localRanks_;
/* Raw transfer helpers operating on an already connected socket. */
void netSend(Socket* sock, const void* data, int size);
void netRecv(Socket* sock, void* data, int size);
std::shared_ptr<Socket> getPeerSendSocket(int peer, int tag);
std::shared_ptr<Socket> getPeerRecvSocket(int peer, int tag);
static void assignPortToUniqueId(UniqueIdInternal& uniqueId);
static void netInit(std::string ipPortPair, std::string interface, SocketAddress& netIfAddr);
/* Root-side bootstrap logic. */
void bootstrapCreateRoot();
void bootstrapRoot();
void getRemoteAddresses(Socket* listenSock, std::vector<SocketAddress>& rankAddresses,
std::vector<SocketAddress>& rankAddressesRoot, int& rank);
void sendHandleToPeer(int peer, const std::vector<SocketAddress>& rankAddresses,
const std::vector<SocketAddress>& rankAddressesRoot);
};
/// Create a new unique id: a random magic value plus the address of a local
/// interface with a port obtained by binding a temporary socket.
rocshmem_uniqueid_t TcpBootstrap::Impl::createUniqueId() {
  // Zero both locals up front: netInit() returns early on failure without
  // writing the address, and the id is later copied byte-wise to the caller.
  UniqueIdInternal uniqueId;
  std::memset(&uniqueId, 0, sizeof(uniqueId));
  SocketAddress netIfAddr;
  std::memset(&netIfAddr, 0, sizeof(netIfAddr));
  netInit("", "", netIfAddr);
  // Size taken from the local object being filled; this is a static function
  // and has no instance member to refer to.
  getRandomData(&uniqueId.magic, sizeof(uniqueId.magic));
  std::memcpy(&uniqueId.addr, &netIfAddr, sizeof(SocketAddress));
  assignPortToUniqueId(uniqueId);
  return getUniqueId(uniqueId);
}
/// Pack an internal unique id into the opaque public handle.
/// The static_assert next to UniqueIdInternal guarantees the id fits.
rocshmem_uniqueid_t TcpBootstrap::Impl::getUniqueId(const UniqueIdInternal& uniqueId) {
  rocshmem_uniqueid_t ret;
  // The public type may be larger than the internal one; clear it first so the
  // bytes past the copied prefix are deterministic rather than uninitialised.
  std::memset(&ret, 0, sizeof(ret));
  std::memcpy(&ret, &uniqueId, sizeof(uniqueId));
  return ret;
}
/// Record rank/world size and allocate per-rank tables. No networking happens
/// here; sockets are created by initialize().
TcpBootstrap::Impl::Impl(int rank, int nRanks)
    : rank_(rank),
      nRanks_(nRanks),
      nRanksPerNode_(0),  // 0 == not computed yet, see getNranksPerNode()
      netInitialized(false),
      peerCommAddresses_(nRanks),  // value-initialised addresses
      barrierArr_(nRanks),         // value-initialised (zero) ints
      abortFlagStorage_(std::make_unique<uint32_t>(0)),
      abortFlag_(abortFlagStorage_.get()) {
}
/// Return the unique id stored in this instance as a public handle.
rocshmem_uniqueid_t TcpBootstrap::Impl::getUniqueId() const {
  return getUniqueId(uniqueId_);
}
/// Rank given to the constructor.
int TcpBootstrap::Impl::getRank() {
  return rank_;
}
/// Total number of ranks given to the constructor.
int TcpBootstrap::Impl::getNranks() {
  return nRanks_;
}
/// Copy of the local-rank list. The list is only filled by getNranksPerNode(),
/// so it is empty until that has been called.
std::vector<int> TcpBootstrap::Impl::getLocalRanks() {
  std::vector<int> ranksOnThisNode(localRanks_);
  return ranksOnThisNode;
}
void TcpBootstrap::Impl::initialize(const rocshmem_uniqueid_t& uniqueId, int64_t timeoutSec) {
if (!netInitialized) {
netInit("", "", netIfAddr_);
netInitialized = true;
}
std::memcpy(&uniqueId_, &uniqueId, sizeof(uniqueId_));
if (rank_ == 0) {
bootstrapCreateRoot();
}
char line[MAX_IF_NAME_SIZE + 1];
SocketToString(&uniqueId_.addr, line);
DPRINTF("rank %d nranks %d - connecting to %s\n", rank_, nRanks_, line);
establishConnections(timeoutSec);
}
/// Initialize from a string of the form "ip:port" or "interface:ip:port".
///
/// The string is treated as a trio exactly when it contains two ':'
/// characters; the text before the first ':' is then the interface name.
void TcpBootstrap::Impl::initialize(const std::string& ifIpPortTrio, int64_t timeoutSec) {
  // Count the separators to tell a pair from a trio.
  int colonCount = 0;
  for (size_t pos = 0; pos < ifIpPortTrio.size(); ++pos) {
    if (ifIpPortTrio[pos] == ':') ++colonCount;
  }

  std::string interface = "";
  std::string ipPortPair = ifIpPortTrio;
  if (colonCount == 2) {
    // Split "<interface>:<ip>:<port>" at the first colon.
    const auto firstColon = ifIpPortTrio.find_first_of(':');
    interface = ifIpPortTrio.substr(0, firstColon);
    ipPortPair = ifIpPortTrio.substr(firstColon + 1);
  }

  if (!netInitialized) {
    netInit(ipPortPair, interface, netIfAddr_);
    netInitialized = true;
  }

  // Fixed magic for the string-based path; the address starts as the local
  // interface address and is then overwritten from the ip:port text.
  uniqueId_.magic = 0xdeadbeef;
  std::memcpy(&uniqueId_.addr, &netIfAddr_, sizeof(SocketAddress));
  SocketGetAddrFromString(&uniqueId_.addr, ipPortPair.c_str());

  if (rank_ == 0) {
    bootstrapCreateRoot();
  }
  establishConnections(timeoutSec);
}
/// Raise the abort flag, then wait for the root thread (if one was started).
TcpBootstrap::Impl::~Impl() {
  if (abortFlag_ != nullptr) {
    *abortFlag_ = 1;
  }
  if (!rootThread_.joinable()) {
    return;
  }
  rootThread_.join();
}
/// Accept one rank's check-in on the root listening socket and record the two
/// addresses it advertises.
///
/// @param listenSock        Root listening socket to accept on.
/// @param rankAddresses     Per-rank peer listen addresses (output, indexed by rank).
/// @param rankAddressesRoot Per-rank root-contact addresses (output, indexed by
///                          rank); an all-zero entry means "not yet checked in".
/// @param rank              Set to the rank that checked in, or to -1 when the
///                          check-in was rejected.
void TcpBootstrap::Impl::getRemoteAddresses(Socket* listenSock, std::vector<SocketAddress>& rankAddresses,
                                            std::vector<SocketAddress>& rankAddressesRoot, int& rank) {
  // Report failure by default so the caller never reads an indeterminate rank.
  rank = -1;

  // Zero the message buffer so that a short or failed receive is seen as
  // all-zero data instead of uninitialised memory.
  ExtInfo info;
  std::memset(&info, 0, sizeof(info));
  SocketAddress zero;
  std::memset(&zero, 0, sizeof(SocketAddress));
  {
    Socket sock(nullptr, ROCSHMEM_SOCKET_MAGIC, SocketTypeUnknown, abortFlag_);
    sock.accept(listenSock);
    netRecv(&sock, &info, sizeof(info));
  }
  if (this->nRanks_ != info.nRanks) {
    ERROR("Bootstrap Root : mismatch in rank count from procs %d : %d\n", this->nRanks_, info.nRanks);
    return;
  }
  // info.rank arrives over the network and is used as a vector index below.
  if (info.rank < 0 || info.rank >= this->nRanks_) {
    ERROR("Bootstrap Root : received out-of-range rank %d (nranks %d)\n", info.rank, this->nRanks_);
    return;
  }
  if (std::memcmp(&zero, &rankAddressesRoot[info.rank], sizeof(SocketAddress)) != 0) {
    ERROR("Bootstrap Root : rank %d of %d has already checked in\n", info.rank, this->nRanks_);
    return;
  }
  // Save the connection handle for that rank
  rankAddressesRoot[info.rank] = info.extAddressListenRoot;
  rankAddresses[info.rank] = info.extAddressListen;
  rank = info.rank;
}
void TcpBootstrap::Impl::sendHandleToPeer(int peer, const std::vector<SocketAddress>& rankAddresses,
const std::vector<SocketAddress>& rankAddressesRoot) {
int next = (peer + 1) % nRanks_;
Socket sock(&rankAddressesRoot[peer], uniqueId_.magic, SocketTypeBootstrap, abortFlag_);
sock.connect();
netSend(&sock, &rankAddresses[next], sizeof(SocketAddress));
}
/// Bind a temporary socket on the id's address and store the address the
/// socket reports afterwards back into the id.
void TcpBootstrap::Impl::assignPortToUniqueId(UniqueIdInternal& uniqueId) {
  // The socket only lives for the duration of this call.
  Socket probe(&uniqueId.addr, uniqueId.magic, SocketTypeBootstrap);
  probe.bind();
  uniqueId.addr = probe.getAddr();
}
void TcpBootstrap::Impl::bootstrapCreateRoot() {
listenSockRoot_ = std::make_unique<Socket>(&uniqueId_.addr, uniqueId_.magic, SocketTypeBootstrap, abortFlag_, 0);
listenSockRoot_->bindAndListen();
uniqueId_.addr = listenSockRoot_->getAddr();
rootThread_ = std::thread([this]() {
// try {
bootstrapRoot();
//} catch (const std::exception& e) {
//if (abortFlag_ && *abortFlag_) r;
//throw e;
//}
});
}
/// Body of the root thread.
///
/// Phase 1: accept check-ins until every rank has reported its addresses (or
/// the abort flag is raised). Phase 2: tell each rank the listen address of
/// the next rank so the all-gather ring can be formed.
void TcpBootstrap::Impl::bootstrapRoot() {
  int numCollected = 0;
  std::vector<SocketAddress> rankAddresses(nRanks_, SocketAddress());
  // for initial rank <-> root information exchange
  std::vector<SocketAddress> rankAddressesRoot(nRanks_, SocketAddress());
  // getRemoteAddresses() detects "not yet checked in" by comparing an entry
  // against all-zero bytes, so make every byte zero explicitly.
  std::memset(rankAddresses.data(), 0, sizeof(SocketAddress) * nRanks_);
  std::memset(rankAddressesRoot.data(), 0, sizeof(SocketAddress) * nRanks_);
  setFilesLimit();
  DPRINTF("BEGIN bootstrapRoot\n");
  /* Receive addresses from all ranks */
  do {
    // -1 marks "no valid check-in"; getRemoteAddresses() only overwrites it
    // with a real rank when the check-in was accepted.
    int rank = -1;
    getRemoteAddresses(listenSockRoot_.get(), rankAddresses, rankAddressesRoot, rank);
    if (rank < 0) {
      // A rejected check-in (rank-count mismatch, duplicate) must not count
      // towards completion, otherwise the ring would be built with a missing
      // address. The loop condition below still honours the abort flag.
      continue;
    }
    ++numCollected;
    DPRINTF("Received connect from rank %d total %d/%d\n", rank, numCollected, nRanks_);
  } while (numCollected < nRanks_ && (!abortFlag_ || *abortFlag_ == 0));
  if (abortFlag_ && *abortFlag_) {
    DPRINTF("ABORTED\n");
    return;
  }
  DPRINTF("COLLECTED ALL %d HANDLES\n", nRanks_);
  // Send the connect handle for the next rank in the AllGather ring
  for (int peer = 0; peer < nRanks_; ++peer) {
    sendHandleToPeer(peer, rankAddresses, rankAddressesRoot);
  }
  DPRINTF("DONE bootstrapRoot\n");
}
/// Select the local network interface and address used for bootstrap sockets.
///
/// @param ipPortPair Remote "ip:port"; when non-empty and no interface is
///                   given, the local interface on the same subnet is chosen.
/// @param interface  Explicit interface name, or "" to auto-select.
/// @param netIfAddr  Receives the chosen local address. Left untouched when no
///                   interface is found (an error is logged).
void TcpBootstrap::Impl::netInit(std::string ipPortPair, std::string interface,
                                 SocketAddress& netIfAddr) {
  // Zero-filled so the name is always NUL-terminated, even if the lookup
  // helpers fill the whole MAX_IF_NAME_SIZE bytes with strncpy.
  char netIfName[MAX_IF_NAME_SIZE + 1];
  std::memset(netIfName, 0, sizeof(netIfName));
  if (!ipPortPair.empty()) {
    if (interface != "") {
      // we know the <interface>
      int ret = FindInterfaces(netIfName, &netIfAddr, MAX_IF_NAME_SIZE, 1, interface.c_str());
      if (ret <= 0) {
        ERROR("NET/Socket : No interface named %s found\n", interface.c_str());
        return;
      }
    } else {
      // we do not know the <interface> try to match it next
      SocketAddress remoteAddr;
      SocketGetAddrFromString(&remoteAddr, ipPortPair.c_str());
      if (FindInterfaceMatchSubnet(netIfName, &netIfAddr, &remoteAddr, MAX_IF_NAME_SIZE, 1) <= 0) {
        ERROR("NET/Socket : No usable listening interface found\n");
        return;
      }
    }
  } else {
    int ret = FindInterfaces(netIfName, &netIfAddr, MAX_IF_NAME_SIZE, 1);
    if (ret <= 0) {
      ERROR("TcpBootstrap : no socket interface found\n");
      return;
    }
  }
  // Room for " <ifname>:" (MAX_IF_NAME_SIZE + 2 chars), the socket string and
  // the terminating NUL.
  // NOTE(review): assumes SOCKET_NAME_MAXLEN bounds SocketToString() output - confirm.
  char line[SOCKET_NAME_MAXLEN + MAX_IF_NAME_SIZE + 3];
  std::snprintf(line, sizeof(line), " %s:", netIfName);
  SocketToString(&netIfAddr, line + strlen(line));
  DPRINTF("TcpBootstrap : Using%s", line);
}
// Evaluate `__exp`; an Error whose code is ErrorCode::Timeout is rethrown as an
// Error with the message "TcpBootstrap connection timeout", any other Error is
// rethrown unchanged.
// The macro deliberately ends without a semicolon so that `TIMEOUT(x);` is
// exactly one statement and can be used in an unbraced if/else.
#define TIMEOUT(__exp)                                                      \
  do {                                                                      \
    try {                                                                   \
      __exp;                                                                \
    } catch (const Error& e) {                                              \
      if (e.getErrorCode() == ErrorCode::Timeout) {                         \
        throw Error("TcpBootstrap connection timeout", ErrorCode::Timeout); \
      }                                                                     \
      throw;                                                                \
    }                                                                       \
  } while (0)
/// Connect this rank into the bootstrap network.
///
/// Steps, in order:
///   1. open the listening socket peers will connect to;
///   2. send this rank's listen addresses to the root;
///   3. receive the address of the next rank in the ring from the root;
///   4. connect to the next rank and accept the connection from the previous;
///   5. all-gather every rank's listen address into peerCommAddresses_.
///
/// @param timeoutSec Overall budget in seconds; a negative value disables the
///                   timeout.
void TcpBootstrap::Impl::establishConnections(int64_t timeoutSec) {
  const int64_t connectionTimeoutUs = timeoutSec * 1000000;
  Timer timer;
  SocketAddress nextAddr;
  // The whole struct is sent as raw bytes; zero it so padding is not
  // transmitted uninitialised.
  ExtInfo info;
  std::memset(&info, 0, sizeof(info));
  DPRINTF("establishConnections: rank %d nranks %d\n", rank_, nRanks_);

  // Remaining budget in microseconds. The return type is spelled out so all
  // three return statements agree regardless of how int64_t is defined.
  auto getLeftTime = [&]() -> int64_t {
    if (connectionTimeoutUs < 0) {
      // no timeout: always return a large number
      return int64_t(1e9);
    }
    int64_t timeout = connectionTimeoutUs - timer.elapsed();
    if (timeout <= 0) {
      ERROR("TcpBootstrap connection timeout\n");
      return int64_t(-1);
    }
    return timeout;
  };

  info.rank = rank_;
  info.nRanks = nRanks_;
  uint64_t magic = uniqueId_.magic;

  // Create socket for other ranks to contact me
  listenSock_ = std::make_unique<Socket>(&netIfAddr_, magic, SocketTypeBootstrap, abortFlag_);
  listenSock_->bindAndListen();
  info.extAddressListen = listenSock_->getAddr();

  {
    // Create socket for root to contact me
    Socket lsock(&netIfAddr_, magic, SocketTypeBootstrap, abortFlag_);
    lsock.bindAndListen();
    info.extAddressListenRoot = lsock.getAddr();

    // stagger connection times to avoid an overload of the root
    // (despite its name the delay is deterministic: `rank` milliseconds)
    auto randomSleep = [](int rank) {
      timespec tv;
      tv.tv_sec = rank / 1000;
      tv.tv_nsec = 1000000 * (rank % 1000);
      DPRINTF("rank %d delaying connection to root by %ld sec %ld nsec\n", rank,
              tv.tv_sec, tv.tv_nsec);
      (void)nanosleep(&tv, NULL);
    };
    if (nRanks_ > 128) {
      randomSleep(rank_);
    }

    // send info on my listening socket to root
    {
      Socket sock(&uniqueId_.addr, magic, SocketTypeBootstrap, abortFlag_);
      sock.connect(getLeftTime());
      netSend(&sock, &info, sizeof(info));
    }

    // get info on my "next" rank in the bootstrap ring from root
    {
      Socket sock(nullptr, ROCSHMEM_SOCKET_MAGIC, SocketTypeUnknown, abortFlag_);
      sock.accept(&lsock, getLeftTime());
      netRecv(&sock, &nextAddr, sizeof(SocketAddress));
    }
  }

  ringSendSocket_ = std::make_unique<Socket>(&nextAddr, magic, SocketTypeBootstrap, abortFlag_);
  ringSendSocket_->connect(getLeftTime());

  // Accept the connect request from the previous rank in the AllGather ring
  ringRecvSocket_ = std::make_unique<Socket>(nullptr, ROCSHMEM_SOCKET_MAGIC, SocketTypeUnknown,
                                             abortFlag_);
  ringRecvSocket_->accept(listenSock_.get(), getLeftTime());

  // AllGather all listen handlers
  peerCommAddresses_[rank_] = listenSock_->getAddr();
  allGather(peerCommAddresses_.data(), sizeof(SocketAddress));
  DPRINTF("rank %d nranks %d - DONE\n", rank_, nRanks_);
}
int TcpBootstrap::Impl::getNranksPerNode() {
if (nRanksPerNode_ > 0) return nRanksPerNode_;
int nRanksPerNode = 0;
bool useIpv4 = peerCommAddresses_[rank_].sa.sa_family == AF_INET;
for (int i = 0; i < nRanks_; i++) {
if (useIpv4) {
if (peerCommAddresses_[i].sin.sin_addr.s_addr ==
peerCommAddresses_[rank_].sin.sin_addr.s_addr) {
localRanks_.push_back(i);
nRanksPerNode++;
}
} else {
if (std::memcmp(&(peerCommAddresses_[i].sin6.sin6_addr),
&(peerCommAddresses_[rank_].sin6.sin6_addr),
sizeof(in6_addr)) == 0) {
localRanks_.push_back(i);
nRanksPerNode++;
}
}
}
nRanksPerNode_ = nRanksPerNode;
return nRanksPerNode_;
}
void TcpBootstrap::Impl::allGather(void* allData, int size) {
char* data = static_cast<char*>(allData);
int rank = rank_;
int nRanks = nRanks_;
DPRINTF("allGather: rank %d nranks %d size %d\n", rank, nRanks, size);
/* Simple ring based AllGather
* At each step i receive data from (rank-i-1) from left
* and send previous step's data from (rank-i) to right
*/
for (int i = 0; i < nRanks - 1; i++) {
size_t rSlice = (rank - i - 1 + nRanks) % nRanks;
size_t sSlice = (rank - i + nRanks) % nRanks;
// Send slice to the right
netSend(ringSendSocket_.get(), data + sSlice * size, size);
// Recv slice from the left
netRecv(ringRecvSocket_.get(), data + rSlice * size, size);
}
DPRINTF("allGather: rank %d nranks %d size %d - DONE\n", rank, nRanks, size);
}
std::shared_ptr<Socket> TcpBootstrap::Impl::getPeerSendSocket(int peer, int tag) {
auto it = peerSendSockets_.find(std::make_pair(peer, tag));
if (it != peerSendSockets_.end()) {
return it->second;
}
auto sock = std::make_shared<Socket>(&peerCommAddresses_[peer], uniqueId_.magic,
SocketTypeBootstrap, abortFlag_);
sock->connect();
netSend(sock.get(), &rank_, sizeof(int));
netSend(sock.get(), &tag, sizeof(int));
peerSendSockets_[std::make_pair(peer, tag)] = sock;
return sock;
}
std::shared_ptr<Socket> TcpBootstrap::Impl::getPeerRecvSocket(int peer, int tag) {
auto it = peerRecvSockets_.find(std::make_pair(peer, tag));
if (it != peerRecvSockets_.end()) {
return it->second;
}
for (;;) {
auto sock = std::make_shared<Socket>(nullptr, ROCSHMEM_SOCKET_MAGIC, SocketTypeUnknown,
abortFlag_);
sock->accept(listenSock_.get());
int recvPeer, recvTag;
netRecv(sock.get(), &recvPeer, sizeof(int));
netRecv(sock.get(), &recvTag, sizeof(int));
peerRecvSockets_[std::make_pair(recvPeer, recvTag)] = sock;
if (recvPeer == peer && recvTag == tag) {
return sock;
}
}
}
/// Send one length-prefixed message: the int `size` first, then `size` bytes
/// of `data`.
void TcpBootstrap::Impl::netSend(Socket* sock, const void* data, int size) {
  // Length header.
  sock->send(&size, sizeof(size));
  // Socket::send takes a non-const pointer.
  void* payload = const_cast<void*>(data);
  sock->send(payload, size);
}
/// Receive one length-prefixed message written by netSend().
///
/// @param sock Connected socket to read from.
/// @param data Destination buffer.
/// @param size Capacity of `data` in bytes. A message announcing more than
///             `size` bytes, or a negative length, is rejected with an error
///             and no payload is read.
void TcpBootstrap::Impl::netRecv(Socket* sock, void* data, int size) {
  // Initialised so a header read that does not fill the value is seen as an
  // empty message rather than an indeterminate length.
  int recvSize = 0;
  sock->recv(&recvSize, sizeof(int));
  if (recvSize < 0) {
    // The length comes from the wire; never hand a negative size to recv().
    ERROR("Invalid message size : received length %d\n", recvSize);
    return;
  }
  if (recvSize > size) {
    ERROR("Message truncated : received %d bytes instead of %d\n", recvSize, size);
    return;
  }
  // recvSize is now within [0, size].
  sock->recv(data, recvSize);
}
void TcpBootstrap::Impl::send(void* data, int size, int peer, int tag) {
auto sock = getPeerSendSocket(peer, tag);
netSend(sock.get(), data, size);
}
void TcpBootstrap::Impl::recv(void* data, int size, int peer, int tag) {
auto sock = getPeerRecvSocket(peer, tag);
netRecv(sock.get(), data, size);
}
/// Barrier implemented as a ring all-gather of one int per rank.
void TcpBootstrap::Impl::barrier() {
  allGather(barrierArr_.data(), sizeof(int));
}
/// Release every socket held by this instance: both listeners, both ring
/// sockets and the cached point-to-point sockets.
void TcpBootstrap::Impl::close() {
  // Listening sockets.
  listenSockRoot_.reset();
  listenSock_.reset();
  // Ring sockets.
  ringRecvSocket_.reset();
  ringSendSocket_.reset();
  // Cached point-to-point sockets.
  peerSendSockets_.clear();
  peerRecvSockets_.clear();
}
/// Create a new unique id (forwards to the implementation class).
rocshmem_uniqueid_t TcpBootstrap::createUniqueId() {
  return Impl::createUniqueId();
}
/// Construct the bootstrap for `rank` out of `nRanks`; allocates the
/// implementation object.
TcpBootstrap::TcpBootstrap(int rank, int nRanks)
    : pimpl_(std::make_unique<Impl>(rank, nRanks)) {
}
/// Unique id currently stored in the implementation.
rocshmem_uniqueid_t TcpBootstrap::getUniqueId() const {
  return pimpl_->getUniqueId();
}
/// Rank of this process (forwarded to the implementation).
int TcpBootstrap::getRank() {
  return pimpl_->getRank();
}
/// Total number of ranks (forwarded to the implementation).
int TcpBootstrap::getNranks() {
  return pimpl_->getNranks();
}
/// Number of ranks sharing this rank's address (forwarded to the implementation).
int TcpBootstrap::getNranksPerNode() {
  return pimpl_->getNranksPerNode();
}
/// List of local ranks (forwarded to the implementation).
std::vector<int> TcpBootstrap::getLocalRanks() {
  return pimpl_->getLocalRanks();
}
void TcpBootstrap::send(void* data, int size, int peer, int tag) {
pimpl_->send(data, size, peer, tag);
}
void TcpBootstrap::recv(void* data, int size, int peer, int tag) {
pimpl_->recv(data, size, peer, tag);
}
/// In-place all-gather of `size` bytes per rank (forwarded to the implementation).
void TcpBootstrap::allGather(void* allData, int size) {
  pimpl_->allGather(allData, size);
}
/// Initialize from a unique id handle (forwarded to the implementation).
void TcpBootstrap::initialize(rocshmem_uniqueid_t uniqueId, int64_t timeoutSec) {
  Impl& impl = *pimpl_;
  impl.initialize(uniqueId, timeoutSec);
}
void TcpBootstrap::initialize(const std::string& ipPortPair, int64_t timeoutSec) {
pimpl_->initialize(ipPortPair, timeoutSec);
}
/// Synchronize all ranks (forwarded to the implementation).
void TcpBootstrap::barrier() {
  pimpl_->barrier();
}
/// Close all sockets before the implementation object itself is destroyed.
TcpBootstrap::~TcpBootstrap() {
  pimpl_->close();
}
} // namespace rocshmem
@@ -0,0 +1,150 @@
/******************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef ROCSHMEM_BOOTSTRAP_HPP_
#define ROCSHMEM_BOOTSTRAP_HPP_
#include <array>
#include <bitset>
#include <future>
#include <memory>
#include <string>
#include <vector>
#include "rocshmem/rocshmem_common.hpp"
namespace rocshmem {
/// Return a version string.
std::string version();
/// Base class for bootstraps.
///
/// Declares the point-to-point and collective primitives a concrete bootstrap
/// must implement (pure virtuals), plus group and std::vector helpers that are
/// implemented outside this header.
class Bootstrap {
public:
Bootstrap(){};
virtual ~Bootstrap() = default;
/// Rank of the calling process.
virtual int getRank() = 0;
/// Total number of ranks.
virtual int getNranks() = 0;
/// Number of ranks per node.
virtual int getNranksPerNode() = 0;
/// Ranks that are local to the calling process.
virtual std::vector<int> getLocalRanks() = 0;
/// Send `size` bytes from `data` to `peer`, matched on the receiver by `tag`.
virtual void send(void* data, int size, int peer, int tag) = 0;
/// Receive `size` bytes into `data` from `peer`, matching the sender's `tag`.
virtual void recv(void* data, int size, int peer, int tag) = 0;
/// All-gather of `size` bytes per rank, in place in `allData`.
virtual void allGather(void* allData, int size) = 0;
/// Synchronize all ranks.
virtual void barrier() = 0;
/// Barrier restricted to `ranks`.
/// NOTE(review): semantics inferred from the name; definition not in this header.
void groupBarrier(const std::vector<int>& ranks);
/// All-gather restricted to `ranks`.
/// NOTE(review): semantics inferred from the name; definition not in this header.
void groupAllGather(void* allData, int size, const std::vector<int>& ranks);
/// In-place all-to-all among `ranks`: the implementation copies the received
/// blocks back into the caller's buffer.
void groupAlltoall(void* allData, int size, const std::vector<int>& ranks);
/// Send a byte vector: its length goes out on `tag`, its contents on `tag + 1`.
void send(const std::vector<char>& data, int peer, int tag);
/// Receive a byte vector sent by the overload above; `data` is resized to the
/// announced length.
void recv(std::vector<char>& data, int peer, int tag);
};
/// A native implementation of the bootstrap using TCP sockets.
///
/// All work is delegated to a private implementation class (pimpl).
class TcpBootstrap : public Bootstrap {
public:
/// Create a random unique ID.
/// @return The created unique ID.
static rocshmem_uniqueid_t createUniqueId();
/// Constructor.
/// @param rank The rank of the process.
/// @param nRanks The total number of ranks.
TcpBootstrap(int rank, int nRanks);
/// Destructor. Closes all sockets held by the implementation.
~TcpBootstrap();
/// Return the unique ID stored in the @ref TcpBootstrap.
/// @return The unique ID stored in the @ref TcpBootstrap.
rocshmem_uniqueid_t getUniqueId() const;
/// Initialize the @ref TcpBootstrap with a given unique ID.
/// @param uniqueId The unique ID to initialize the @ref TcpBootstrap with.
/// @param timeoutSec The connection timeout in seconds.
void initialize(rocshmem_uniqueid_t uniqueId, int64_t timeoutSec = 30);
/// Initialize the @ref TcpBootstrap with a string formatted as "ip:port" or "interface:ip:port".
/// @param ifIpPortTrio The string formatted as "ip:port" or "interface:ip:port".
/// @param timeoutSec The connection timeout in seconds.
void initialize(const std::string& ifIpPortTrio, int64_t timeoutSec = 30);
/// Return the rank of the process.
int getRank() override;
/// Return the total number of ranks.
int getNranks() override;
/// Return the total number of ranks per node.
int getNranksPerNode() override;
/// Send data to another process.
///
/// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
/// senderRank, tag)`.
///
/// @param data The data to send.
/// @param size The size of the data to send.
/// @param peer The rank of the process to send the data to.
/// @param tag The tag to send the data with.
void send(void* data, int size, int peer, int tag) override;
/// Receive data from another process.
///
/// Data sent via `send(senderBuff, size, receiverRank, tag)` can be received via `recv(receiverBuff, size,
/// senderRank, tag)`.
///
/// @param data The buffer to write the received data to.
/// @param size The size of the data to receive.
/// @param peer The rank of the process to receive the data from.
/// @param tag The tag to receive the data with.
void recv(void* data, int size, int peer, int tag) override;
/// Provide the list of ranks that are local to the calling process.
std::vector<int> getLocalRanks() override;
/// Gather data from all processes.
///
/// When called by rank `r`, this sends data from `allData[r * size]` to `allData[(r + 1) * size - 1]` to all other
/// ranks. The data sent by rank `r` is received into `allData[r * size]` of other ranks.
///
/// @param allData The buffer to write the received data to.
/// @param size The size of the data each rank sends.
void allGather(void* allData, int size) override;
/// Synchronize all processes.
void barrier() override;
private:
// The internal implementation.
class Impl;
// Pointer to the internal implementation.
std::unique_ptr<Impl> pimpl_;
};
} // namespace rocshmem
#endif // ROCSHMEM_BOOTSTRAP_HPP_
+790
Visa fil
@@ -0,0 +1,790 @@
/******************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include <errno.h>
#include <ifaddrs.h>
#include <net/if.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fstream>
#include <cstring>
#include "envvar.hpp"
#include "socket.hpp"
#include "utils.hpp"
#include "util.hpp"
namespace rocshmem {
#define ROCSHMEM_SOCKET_SEND 0
#define ROCSHMEM_SOCKET_RECV 1
/* Format a string representation of a (union SocketAddress *)
 * socket address using getnameinfo()
 *
 * Output: "IPv4/IPv6 address<port>"
 *
 * Returns NULL when `addr` or `buf` is NULL; otherwise returns `buf`. `buf` is
 * set to the empty string when the address family is neither AF_INET nor
 * AF_INET6, or when getnameinfo() fails.
 * The caller must supply a buffer large enough for the host and service
 * strings plus the two delimiter characters and the terminator.
 */
const char* SocketToString(union SocketAddress* addr, char* buf,
                           const int numericHostForm /*= 1*/) {
  if (buf == NULL || addr == NULL) return NULL;
  struct sockaddr* saddr = &addr->sa;
  if (saddr->sa_family != AF_INET && saddr->sa_family != AF_INET6) {
    buf[0] = '\0';
    return buf;
  }
  char host[NI_MAXHOST], service[NI_MAXSERV];
  /* NI_NUMERICHOST: If set, then the numeric form of the hostname is returned.
   * (When not set, this will still happen in case the node's name cannot be determined.)
   */
  int flag = NI_NUMERICSERV | (numericHostForm ? NI_NUMERICHOST : 0);
  if (getnameinfo(saddr, sizeof(union SocketAddress), host, NI_MAXHOST, service, NI_MAXSERV,
                  flag) != 0) {
    // host/service are not filled in on failure; formatting them would read
    // uninitialised memory.
    buf[0] = '\0';
    return buf;
  }
  sprintf(buf, "%s<%s>", host, service);
  return buf;
}
// Read the TCP FIN timeout from procfs; same value as
// ($ cat /proc/sys/net/ipv4/tcp_fin_timeout). Returns -1 if the file cannot
// be opened.
static int getTcpFinTimeout() {
  std::ifstream procFile("/proc/sys/net/ipv4/tcp_fin_timeout");
  if (procFile.is_open()) {
    int seconds;
    procFile >> seconds;
    return seconds;
  }
  ERROR("open /proc/sys/net/ipv4/tcp_fin_timeout failed errno %d\n", errno);
  return -1;
}
// Port of a socket address in host byte order. Any family other than AF_INET
// is read through the IPv6 view of the union.
static uint16_t socketToPort(union SocketAddress* addr) {
  const bool isIpv4 = (addr->sa.sa_family == AF_INET);
  if (isIpv4) {
    return ntohs(addr->sin.sin_port);
  }
  return ntohs(addr->sin6.sin6_port);
}
/* Socket family requested through the environment, letting the user force
 * IPv4 or IPv6 interface selection. */
static int envSocketFamily(void) {
  // The envvar enum is defined directly from the AF_* constants, so the cast
  // yields a value usable as a socket family.
  const auto requestedFamily = envvar::bootstrap::socket_family.get_value();
  return static_cast<int>(requestedFamily);
}
static int findInterfaces(const char* prefixList, char* names, union SocketAddress* addrs,
int sock_family, int maxIfNameSize, int maxIfs) {
#ifdef DEBUG
char line[SOCKET_NAME_MAXLEN + 1];
#endif
struct netIf userIfs[MAX_IFS];
bool searchNot = prefixList && prefixList[0] == '^';
if (searchNot) prefixList++;
bool searchExact = prefixList && prefixList[0] == '=';
if (searchExact) prefixList++;
int nUserIfs = parseStringList(prefixList, userIfs, MAX_IFS);
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && found < maxIfs; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6) continue;
DPRINTF("Found interface %s:%s\n", interface->ifa_name,
SocketToString((union SocketAddress*)interface->ifa_addr, line));
/* Allow the caller to force the socket family type */
if (sock_family != AF_UNSPEC && family != sock_family) continue;
/* We also need to skip IPv6 loopback interfaces */
if (family == AF_INET6) {
struct sockaddr_in6* sa = (struct sockaddr_in6*)(interface->ifa_addr);
if (IN6_IS_ADDR_LOOPBACK(&sa->sin6_addr)) continue;
}
// check against user specified interfaces
if (!(matchIfList(interface->ifa_name, -1, userIfs, nUserIfs, searchExact) ^ searchNot)) {
continue;
}
// Check that this interface has not already been saved
// getifaddrs() normal order appears to be; IPv4, IPv6 Global, IPv6 Link
bool duplicate = false;
for (int i = 0; i < found; i++) {
if (strcmp(interface->ifa_name, names + i * maxIfNameSize) == 0) {
duplicate = true;
break;
}
}
if (!duplicate) {
// Store the interface name
strncpy(names + found * maxIfNameSize, interface->ifa_name, maxIfNameSize);
// Store the IP address
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
std::memcpy(addrs + found, interface->ifa_addr, salen);
found++;
}
}
freeifaddrs(interfaces);
return found;
}
static bool matchSubnet(struct ifaddrs local_if, union SocketAddress* remote) {
/* Check family first */
int family = local_if.ifa_addr->sa_family;
if (family != remote->sa.sa_family) {
return false;
}
if (family == AF_INET) {
struct sockaddr_in* local_addr = (struct sockaddr_in*)(local_if.ifa_addr);
struct sockaddr_in* mask = (struct sockaddr_in*)(local_if.ifa_netmask);
struct sockaddr_in& remote_addr = remote->sin;
struct in_addr local_subnet, remote_subnet;
local_subnet.s_addr = local_addr->sin_addr.s_addr & mask->sin_addr.s_addr;
remote_subnet.s_addr = remote_addr.sin_addr.s_addr & mask->sin_addr.s_addr;
return (local_subnet.s_addr ^ remote_subnet.s_addr) ? false : true;
} else if (family == AF_INET6) {
struct sockaddr_in6* local_addr = (struct sockaddr_in6*)(local_if.ifa_addr);
struct sockaddr_in6* mask = (struct sockaddr_in6*)(local_if.ifa_netmask);
struct sockaddr_in6& remote_addr = remote->sin6;
struct in6_addr& local_in6 = local_addr->sin6_addr;
struct in6_addr& mask_in6 = mask->sin6_addr;
struct in6_addr& remote_in6 = remote_addr.sin6_addr;
bool same = true;
int len = 16; // IPv6 address is 16 unsigned char
for (int c = 0; c < len; c++) { // Network byte order is big-endian
char c1 = local_in6.s6_addr[c] & mask_in6.s6_addr[c];
char c2 = remote_in6.s6_addr[c] & mask_in6.s6_addr[c];
if (c1 ^ c2) {
same = false;
break;
}
}
// At last, we need to compare scope id
// Two Link-type addresses can have the same subnet address even though they are not in the same scope
// For Global type, this field is 0, so a comparison wouldn't matter
same &= (local_addr->sin6_scope_id == remote_addr.sin6_scope_id);
return same;
} else {
ERROR("Net : Unsupported address family type\n");
return false;
}
}
int FindInterfaceMatchSubnet(char* ifNames, union SocketAddress* localAddrs, union SocketAddress* remoteAddr,
int ifNameMaxSize, int maxIfs) {
#ifdef DEBUG
char line[SOCKET_NAME_MAXLEN + 1];
#endif
char line_a[SOCKET_NAME_MAXLEN + 1];
int found = 0;
struct ifaddrs *interfaces, *interface;
getifaddrs(&interfaces);
for (interface = interfaces; interface && !found; interface = interface->ifa_next) {
if (interface->ifa_addr == NULL) continue;
/* We only support IPv4 & IPv6 */
int family = interface->ifa_addr->sa_family;
if (family != AF_INET && family != AF_INET6) continue;
// check against user specified interfaces
if (!matchSubnet(*interface, remoteAddr)) {
continue;
}
// Store the local IP address
int salen = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
std::memcpy(localAddrs + found, interface->ifa_addr, salen);
// Store the interface name
strncpy(ifNames + found * ifNameMaxSize, interface->ifa_name, ifNameMaxSize);
DPRINTF("NET : Found interface %s:%s in the same subnet as remote address %s\n",
interface->ifa_name, SocketToString(localAddrs + found, line), SocketToString(remoteAddr, line_a));
found++;
if (found == maxIfs) break;
}
if (found == 0) {
ERROR("Net : No interface found in the same subnet as remote address %s\n",
SocketToString(remoteAddr, line_a));
}
freeifaddrs(interfaces);
return found;
}
void SocketGetAddrFromString(union SocketAddress* ua, const char* ip_port_pair) {
if (!(ip_port_pair && strlen(ip_port_pair) > 1)) {
ERROR("Net : string is null\n");
return;
}
bool ipv6 = ip_port_pair[0] == '[';
/* Construct the sockaddress structure */
if (!ipv6) {
struct netIf ni;
// parse <ip_or_hostname>:<port> string, expect one pair
if (parseStringList(ip_port_pair, &ni, 1) != 1) {
ERROR("Net : No valid <IPv4_or_hostname>:<port> pair found\n");
return;
}
struct addrinfo hints, *p;
int rv;
memset(&hints, 0, sizeof(hints));
hints.ai_family = AF_UNSPEC;
hints.ai_socktype = SOCK_STREAM;
if ((rv = getaddrinfo(ni.prefix, NULL, &hints, &p)) != 0) {
ERROR("Net : error encountered when getting address info : %s\n", gai_strerror(rv));
return;
}
// use the first
if (p->ai_family == AF_INET) {
struct sockaddr_in& sin = ua->sin;
std::memcpy(&sin, p->ai_addr, sizeof(struct sockaddr_in));
sin.sin_family = AF_INET; // IPv4
// inet_pton(AF_INET, ni.prefix, &(sin.sin_addr)); // IP address
sin.sin_port = htons(ni.port); // port
} else if (p->ai_family == AF_INET6) {
struct sockaddr_in6& sin6 = ua->sin6;
std::memcpy(&sin6, p->ai_addr, sizeof(struct sockaddr_in6));
sin6.sin6_family = AF_INET6; // IPv6
sin6.sin6_port = htons(ni.port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = 0; // should be global scope, set to 0
} else {
ERROR("Net : unsupported IP family\n");
return;
}
freeaddrinfo(p); // all done with this structure
} else {
int i, j = -1, len = strlen(ip_port_pair);
for (i = 1; i < len; i++) {
if (ip_port_pair[i] == '%') j = i;
if (ip_port_pair[i] == ']') break;
}
if (i == len) {
ERROR("Net : No valid [IPv6]:port pair found\n");
return;
}
bool global_scope = (j == -1 ? true : false); // If no % found, global scope; otherwise, link scope
char ip_str[NI_MAXHOST], port_str[NI_MAXSERV], if_name[IFNAMSIZ];
memset(ip_str, '\0', sizeof(ip_str));
memset(port_str, '\0', sizeof(port_str));
memset(if_name, '\0', sizeof(if_name));
strncpy(ip_str, ip_port_pair + 1, global_scope ? i - 1 : j - 1);
strncpy(port_str, ip_port_pair + i + 2, len - i - 1);
int port = atoi(port_str);
// If not global scope, we need the intf name
if (!global_scope)
strncpy(if_name, ip_port_pair + j + 1, i - j - 1);
struct sockaddr_in6& sin6 = ua->sin6;
sin6.sin6_family = AF_INET6; // IPv6
inet_pton(AF_INET6, ip_str, &(sin6.sin6_addr)); // IP address
sin6.sin6_port = htons(port); // port
sin6.sin6_flowinfo = 0; // needed by IPv6, but possibly obsolete
sin6.sin6_scope_id = global_scope ? 0 : if_nametoindex(if_name); // 0 if global scope; intf index if link scope
}
}
/* Pick the network interface(s) to use for bootstrap sockets.
 *
 * Priority: an explicit `inputIfName`, then the interface name configured
 * through the environment, then automatic selection (anything except
 * docker/lo first, then docker, then lo).
 *
 * Returns the number of interfaces written to ifNames / ifAddrs.
 */
int FindInterfaces(char* ifNames, union SocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs,
                   const char* inputIfName) {
  static int shownIfName = 0;
  // Allow user to force the INET socket family selection
  const int sock_family = envSocketFamily();
  // User specified interface
  const std::string& socketIfname = envvar::bootstrap::socket_ifname;

  if (inputIfName) {
    DPRINTF("using iterface %s", inputIfName);
    return findInterfaces(inputIfName, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
  }

  if (!(socketIfname == "")) {
    // Specified by user : find or fail (announced only on the first call)
    if (shownIfName++ == 0) DPRINTF ("ROCSHMEM_SOCKET_IFNAME set to %s", socketIfname.c_str());
    return findInterfaces(socketIfname.c_str(), ifNames, ifAddrs, sock_family,
                          ifNameMaxSize, maxIfs);
  }

  // Try to automatically pick the right one: each filter is only tried when
  // the previous ones found nothing.
  const char* const fallbackFilters[] = {"^docker,lo", "docker", "lo"};
  int nIfs = 0;
  for (const char* filter : fallbackFilters) {
    if (nIfs != 0) break;
    nIfs = findInterfaces(filter, ifNames, ifAddrs, sock_family, ifNameMaxSize, maxIfs);
  }
  return nIfs;
}
/**
 * @brief Construct a socket object and, when an address is given, open its descriptor.
 *
 * @param[in] addr      Peer/local address to copy, or nullptr to create an
 *                      address-less object (no descriptor is opened).
 * @param[in] magic     Magic number exchanged during the connect/accept handshake.
 * @param[in] type      Socket type exchanged during the connect/accept handshake.
 * @param[in] abortFlag Optional flag polled by blocking loops to abort early.
 * @param[in] asyncFlag Non-zero makes connect()/accept() return after one progress step.
 *
 * The descriptor is switched to non-blocking mode when either @p asyncFlag or
 * @p abortFlag is set. Failures are reported through ERROR().
 */
Socket::Socket(const SocketAddress* addr, uint64_t magic, enum SocketType type, volatile uint32_t* abortFlag,
               int asyncFlag) {
  fd_ = -1;
  acceptFd_ = -1;
  connectRetries_ = 0;
  acceptRetries_ = 0;
  abortFlag_ = abortFlag;
  asyncFlag_ = asyncFlag;
  state_ = SocketStateInitialized;
  magic_ = magic;
  type_ = type;
  // Always give salen_ a defined value: without an address it would otherwise
  // stay uninitialised while still being readable through getSalen().
  salen_ = 0;

  if (addr) {
    /* IPv4/IPv6 support */
    std::memcpy(&addr_, addr, sizeof(union SocketAddress));
    const int family = addr_.sa.sa_family;
    if (family != AF_INET && family != AF_INET6) {
      char line[SOCKET_NAME_MAXLEN + 1];
      ERROR("SocketInit: connecting to address %s with family %d is neither AF_INET(%d) nor AF_INET6(%d)\n",
            SocketToString(&addr_, line), family, (int)AF_INET, (int)AF_INET6);
      return;
    }
    salen_ = (family == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);

    /* Connect to a hostname / port */
    fd_ = ::socket(family, SOCK_STREAM, 0);
    if (fd_ == -1) {
      ERROR("socket creation failed %d\n", errno);
      return;
    }
  } else {
    memset(&addr_, 0, sizeof(union SocketAddress));
  }

  /* Set socket as non-blocking if async or if we need to be able to abort */
  if ((asyncFlag_ || abortFlag_) && fd_ >= 0) {
    int flags = fcntl(fd_, F_GETFL);
    if (flags == -1) {
      ERROR("fcntl(F_GETFL) failed errno %d\n", errno);
      return;
    }
    if (fcntl(fd_, F_SETFL, flags | O_NONBLOCK) == -1) {
      ERROR("fcntl(F_SETFL) failed errno %d\n", errno);
      return;
    }
  }
}
/// Destructor: releases the descriptor (if any) by delegating to close().
Socket::~Socket() {
  this->close();
}
void Socket::bind() {
if (fd_ == -1) {
ERROR("file descriptor is -1\n");
return;
}
if (socketToPort(&addr_)) {
// Port is forced by env. Make sure we get the port.
int opt = 1;
#if defined(SO_REUSEPORT)
if (::setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)) != 0) {
ERROR("::setsockopt(SO_REUSEADDR | SO_REUSEPORT) failed errno %d\n", errno);
return;
}
#else
if (::setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) != 0) {
ERROR("setsockopt(SO_REUSEADDR) failed errno %d\n", errno);
return;
}
#endif
}
int finTimeout = getTcpFinTimeout();
int retrySecs = finTimeout + 1;
int remainSecs = retrySecs;
// addr port should be 0 (Any port)
while (::bind(fd_, &addr_.sa, salen_) != 0) {
// upon EADDRINUSE, retry up to for (finTimeout + 1) seconds
if (errno != EADDRINUSE) {
ERROR("bind failed errno %d\n", errno);
return;
}
if (remainSecs > 0) {
DPRINTF("No available ephemeral ports found, will retry after 1 second");
sleep(1);
remainSecs--;
} else {
ERROR("No available ephemeral ports found for %d seconds \n", retrySecs);
return;
}
}
/* Get the assigned Port */
socklen_t size = salen_;
if (::getsockname(fd_, &addr_.sa, &size) != 0) {
ERROR("getsockname failed errno %d\n", errno);
return;
}
state_ = SocketStateBound;
}
/**
 * @brief Bind the socket and put it into listening mode.
 *
 * On success the state becomes SocketStateReady; a listen() failure is
 * reported through ERROR().
 */
void Socket::bindAndListen() {
#ifdef DEBUG
  char line[SOCKET_NAME_MAXLEN + 1];
#endif
  bind();
  DPRINTF("Listening on socket %s\n", SocketToString(&addr_, line));

  // NB: the backlog is silently truncated by the kernel to the value in
  // /proc/sys/net/core/somaxconn.
  const int requestedBacklog = 16384;
  const int rc = ::listen(fd_, requestedBacklog);
  if (rc != 0) {
    ERROR("listen failed errno %d\n", errno);
    return;
  }
  state_ = SocketStateReady;
}
// Connects this socket to the address stored in addr_.
//
// The socket must hold a valid descriptor and be in SocketStateInitialized.
// TCP_NODELAY is enabled first; if that fails the function returns without
// changing the state. The connection is then driven through progressState().
//
// timeout: compared against Timer::elapsed(), which reports microseconds;
//          a value <= 0 disables the timeout check.
//
// With asyncFlag_ set, a single progress step is performed and the caller is
// expected to call again; otherwise the loop runs until the socket leaves the
// connecting states or *abortFlag_ becomes non-zero.
void Socket::connect(int64_t timeout) {
#ifdef DEBUG
char line[SOCKET_NAME_MAXLEN + 1];
#endif
Timer timer;
const int one = 1;
if (fd_ == -1) {
ERROR("file descriptor is -1\n");
return;
}
if (state_ != SocketStateInitialized) {
ERROR("wrong socket state %d\n", state_);
return;
}
DPRINTF("Connecting to socket %s \n", SocketToString(&addr_, line));
if (setsockopt(fd_, IPPROTO_TCP, TCP_NODELAY, (char*)&one, sizeof(int)) != 0) {
DPRINTF("setsockopt(TCP_NODELAY) failed, errno %d\n", errno);
return;
}
state_ = SocketStateConnecting;
// Drive the connect state machine; one iteration only in async mode.
do {
progressState();
if (timeout > 0 && timer.elapsed() > timeout) {
ERROR("connect timeout\n");
return;
}
} while (asyncFlag_ == 0 && (abortFlag_ == NULL || *abortFlag_ == 0) &&
(state_ == SocketStateConnecting || state_ == SocketStateConnectPolling || state_ == SocketStateConnected));
if (abortFlag_ && *abortFlag_ != 0) {
ERROR("aborted\n");
return;
}
}
// Accepts one incoming connection from listenSocket into this object.
//
// listenSocket must be non-null and in SocketStateReady. On the first call
// (acceptFd_ == -1) this object copies the listener's configuration and
// remembers the listening descriptor in acceptFd_; tryAccept() later replaces
// fd_ with the accepted descriptor.
//
// timeout: compared against Timer::elapsed(), which reports microseconds;
//          a value <= 0 disables the timeout check.
//
// With asyncFlag_ set, a single progress step is performed per call;
// otherwise the loop runs until the socket leaves the accepting states or
// *abortFlag_ becomes non-zero.
void Socket::accept(const Socket* listenSocket, int64_t timeout) {
Timer timer;
if (listenSocket == NULL) {
ERROR("listenSocket is NULL\n");
return;
}
if (listenSocket->getState() != SocketStateReady) {
ERROR("listenSocket is in error state %u\n", listenSocket->getState());
return;
}
// First call only: inherit settings from the listening socket.
if (acceptFd_ == -1) {
fd_ = listenSocket->getFd();
connectRetries_ = listenSocket->getConnectRetries();
acceptRetries_ = listenSocket->getAcceptRetries();
abortFlag_ = listenSocket->getAbortFlag();
asyncFlag_ = listenSocket->getAsyncFlag();
magic_ = listenSocket->getMagic();
type_ = listenSocket->getType();
addr_ = listenSocket->getAddr();
salen_ = listenSocket->getSalen();
acceptFd_ = listenSocket->getFd();
state_ = SocketStateAccepting;
}
// Drive the accept state machine; one iteration only in async mode.
do {
progressState();
if (timeout > 0 && timer.elapsed() > timeout) {
ERROR("accept timeout\n");
return;
}
} while (asyncFlag_ == 0 && (abortFlag_ == NULL || *abortFlag_ == 0) &&
(state_ == SocketStateAccepting || state_ == SocketStateAccepted));
if (abortFlag_ && *abortFlag_ != 0) {
ERROR("aborted\n");
return;
}
}
/**
 * @brief Send exactly @p size bytes starting at @p ptr.
 *
 * The socket must be in SocketStateReady; any other state is reported
 * through ERROR().
 */
void Socket::send(void* ptr, int size) {
  int bytesDone = 0;
  if (state_ == SocketStateReady) {
    socketWait(ROCSHMEM_SOCKET_SEND, ptr, size, &bytesDone);
    return;
  }
  ERROR("socket state (%d) is not ready\n", state_);
  return;
}
void Socket::recv(void* ptr, int size) {
int offset = 0;
if (state_ != SocketStateReady) {
ERROR("socket state (%d) is not read\n", state_);
return;
}
socketWait(ROCSHMEM_SOCKET_RECV, ptr, size, &offset);
}
// Receives up to `size` bytes into `ptr`, stopping early when the peer closes
// the connection.
//
// ptr:    destination buffer.
// size:   maximum number of bytes to read.
// closed: set to 1 when ::recv() returns 0 (peer closed), 0 otherwise.
//
// The loop also ends when ::recv() fails with EINTR/EWOULDBLOCK/EAGAIN (or the
// socket is already closed), because `bytes` is then reset to 0. The number of
// bytes actually received is not reported to the caller.
void Socket::recvUntilEnd(void* ptr, int size, int* closed) {
int offset = 0;
*closed = 0;
if (state_ != SocketStateReady) {
ERROR("socket state (%d) is not ready in recvUntilEnd\n", state_);
return;
}
int bytes = 0;
char* data = (char*)ptr;
do {
bytes = ::recv(fd_, data + (offset), size - (offset), 0);
if (bytes == 0) {
*closed = 1;
return;
}
if (bytes == -1) {
// Retryable errors, or any error after the socket was closed, are treated
// as "no data" rather than as a failure.
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN && state_ != SocketStateClosed) {
ERROR("recv until end failed errno %d\n", errno);
return;
} else {
bytes = 0;
}
}
(offset) += bytes;
if (abortFlag_ && *abortFlag_ != 0) {
ERROR("aborted\n");
return;
}
} while (bytes > 0 && (offset) < size);
}
/// Closes the descriptor when one is open and marks the socket as closed.
/// Safe to call repeatedly: later calls only re-assert the closed state.
void Socket::close() {
  const bool hasDescriptor = (fd_ >= 0);
  if (hasDescriptor) {
    ::close(fd_);
  }
  state_ = SocketStateClosed;
  fd_ = -1;
}
// Advances the accept/connect state machine by running the handler for the
// current state.
//
// The checks are sequential rather than exclusive on purpose: when a handler
// changes state_, a later check in the same call can pick up the new state
// (e.g. Accepting -> Accepted, or Connecting -> ConnectPolling -> Connected).
void Socket::progressState() {
if (state_ == SocketStateAccepting) {
tryAccept();
}
if (state_ == SocketStateAccepted) {
finalizeAccept();
}
if (state_ == SocketStateConnecting) {
startConnect();
}
if (state_ == SocketStateConnectPolling) {
pollConnect();
}
if (state_ == SocketStateConnected) {
finalizeConnect();
}
}
// Attempts one ::accept() on the listening descriptor acceptFd_.
//
// On success fd_ holds the new connection, addr_ the peer address, and the
// state becomes SocketStateAccepted. EAGAIN/EWOULDBLOCK means no connection is
// pending: sleep SLEEP_INT microseconds and leave the state unchanged so the
// caller retries. Any other error is reported through ERROR().
void Socket::tryAccept() {
socklen_t socklen = sizeof(union SocketAddress);
fd_ = ::accept(acceptFd_, &addr_.sa, &socklen);
if (fd_ != -1) {
state_ = SocketStateAccepted;
} else if (errno != EAGAIN && errno != EWOULDBLOCK) {
ERROR("accept failed (fd %d) errno %d\n", acceptFd_, errno);
} else {
usleep(SLEEP_INT);
// Only log every 1000th retry to avoid flooding the debug output.
if (++acceptRetries_ % 1000 == 0)
DPRINTF("tryAccept: Call to try accept returned %s, retrying", strerror(errno));
}
}
void Socket::finalizeAccept() {
uint64_t magic;
enum SocketType type;
int received = 0;
socketProgress(ROCSHMEM_SOCKET_RECV, &magic, sizeof(magic), &received);
if (received == 0) return;
socketWait(ROCSHMEM_SOCKET_RECV, &magic, sizeof(magic), &received);
if (magic != magic_) {
ERROR("finalizeAccept: wrong magic %lx != %lx\n", magic, magic_);
::close(fd_);
fd_ = -1;
// Ignore spurious connection and accept again
state_ = SocketStateAccepting;
return;
} else {
received = 0;
socketWait(ROCSHMEM_SOCKET_RECV, &type, sizeof(type), &received);
if (type != type_) {
state_ = SocketStateError;
::close(fd_);
fd_ = -1;
ERROR("wrong socket type %d != %d \n", type, type_);
return;
} else {
state_ = SocketStateReady;
}
}
}
void Socket::startConnect() {
/* blocking/non-blocking connect() is determined by asyncFlag. */
int ret = ::connect(fd_, &addr_.sa, salen_);
if (ret == 0) {
state_ = SocketStateConnected;
return;
} else if (errno == EINPROGRESS) {
state_ = SocketStateConnectPolling;
return;
} else if (errno == ECONNREFUSED || errno == ETIMEDOUT) {
usleep(SLEEP_INT);
if (++connectRetries_ % 1000 == 0) DPRINTF("Call to connect returned %s, retrying", strerror(errno));
return;
} else {
char line[SOCKET_NAME_MAXLEN + 1];
state_ = SocketStateError;
ERROR("connect to %s failed, errno %d\n", SocketToString(&addr_, line), errno);
return;
}
}
void Socket::pollConnect() {
struct pollfd pfd;
int timeout = 1, ret;
socklen_t rlen = sizeof(int);
memset(&pfd, 0, sizeof(struct pollfd));
pfd.fd = fd_;
pfd.events = POLLOUT;
ret = ::poll(&pfd, 1, timeout);
if (ret == -1) {
ERROR("poll failed errno %d\n", errno);
return;
}
if (ret == 0) return;
/* check socket status */
if ((ret == 1 && (pfd.revents & POLLOUT)) == 0) {
ERROR("poll failed\n");
return;
}
if (getsockopt(fd_, SOL_SOCKET, SO_ERROR, (void*)&ret, &rlen) == -1) {
ERROR("getsockopt failed, errno %d\n", errno);
return;
}
if (ret == 0) {
state_ = SocketStateConnected;
} else if (ret == ECONNREFUSED || ret == ETIMEDOUT) {
if (++connectRetries_ % 1000 == 0) {
DPRINTF("Call to connect returned %s, retrying", strerror(errno));
}
usleep(SLEEP_INT);
::close(fd_);
fd_ = ::socket(addr_.sa.sa_family, SOCK_STREAM, 0);
state_ = SocketStateConnecting;
} else if (ret != EINPROGRESS) {
state_ = SocketStateError;
ERROR("connect failed \n");
return;
}
}
// Completes the connect handshake: sends magic_ followed by type_ to the peer
// and moves the socket to SocketStateReady.
//
// The first send is a non-blocking probe; if nothing could be sent yet the
// function returns with the state unchanged so the caller retries.
void Socket::finalizeConnect() {
int sent = 0;
socketProgress(ROCSHMEM_SOCKET_SEND, &magic_, sizeof(magic_), &sent);
if (sent == 0) return;
// Finish the (possibly partial) magic, then send the type from offset 0.
socketWait(ROCSHMEM_SOCKET_SEND, &magic_, sizeof(magic_), &sent);
sent = 0;
socketWait(ROCSHMEM_SOCKET_SEND, &type_, sizeof(type_), &sent);
state_ = SocketStateReady;
}
void Socket::socketProgressOpt(int op, void* ptr, int size, int* offset, int block, int* closed) {
int bytes = 0;
*closed = 0;
char* data = (char*)ptr;
do {
if (op == ROCSHMEM_SOCKET_RECV) bytes = ::recv(fd_, data + (*offset), size - (*offset), block ? 0 : MSG_DONTWAIT);
if (op == ROCSHMEM_SOCKET_SEND)
bytes = ::send(fd_, data + (*offset), size - (*offset), block ? MSG_NOSIGNAL : MSG_DONTWAIT | MSG_NOSIGNAL);
if (op == ROCSHMEM_SOCKET_RECV && bytes == 0) {
*closed = 1;
return;
}
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
ERROR("recv failed, errno %d\n", errno);
return;
} else {
bytes = 0;
}
}
(*offset) += bytes;
if (abortFlag_ && *abortFlag_ != 0) {
ERROR("aborted\n");
return;
}
} while (bytes > 0 && (*offset) < size);
}
/**
 * @brief Non-blocking progress step that treats a closed connection as an error.
 *
 * Forwards to socketProgressOpt() with block = 0 and raises ERROR() when the
 * peer has closed the connection.
 */
void Socket::socketProgress(int op, void* ptr, int size, int* offset) {
  int peerClosed;
  socketProgressOpt(op, ptr, size, offset, 0, &peerClosed);
  if (!peerClosed) {
    return;
  }
  char line[SOCKET_NAME_MAXLEN + 1];
  ERROR("connection closed by remote peer %s\n", SocketToString(&addr_, line, 0));
  return;
}
/// Repeats socketProgress() until *offset has reached @p size, i.e. until the
/// whole buffer has been sent or received.
void Socket::socketWait(int op, void* ptr, int size, int* offset) {
  for (;;) {
    if (*offset >= size) break;
    socketProgress(op, ptr, size, offset);
  }
}
} // namespace rocshmem
+137
Visa fil
@@ -0,0 +1,137 @@
/******************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef ROCSHMEM_SOCKET_H_
#define ROCSHMEM_SOCKET_H_
#include <arpa/inet.h>
#include <fcntl.h>
#include <netdb.h>
#include <netinet/tcp.h>
#include <poll.h>
#include <stddef.h>
#include <sys/socket.h>
namespace rocshmem {
#define MAX_IFS 16
#define MAX_IF_NAME_SIZE 16
#define SLEEP_INT 1000 // connection retry sleep interval in usec
#define SOCKET_NAME_MAXLEN (NI_MAXHOST + NI_MAXSERV)
#define ROCSHMEM_SOCKET_MAGIC 0x564ab9f2fc4b9d6cULL
/* Common socket address storage structure for IPv4/IPv6.
 * All members alias the same storage; `sa.sa_family` tells which of the
 * concrete views (`sin` for AF_INET, `sin6` for AF_INET6) is meaningful. */
union SocketAddress {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
};
/* Life-cycle states of a Socket.
 *   Initialized    - constructed, nothing attempted yet
 *   Accepting      - waiting for an incoming connection
 *   Accepted       - connection accepted, handshake (magic/type) pending
 *   Connecting     - connect() about to be (re)issued
 *   ConnectPolling - non-blocking connect() in progress
 *   Connected      - TCP connection established, handshake pending
 *   Bound          - bind() completed
 *   Ready          - listening, or handshake completed: usable for send/recv
 *   Closed         - close() was called
 *   Error          - unrecoverable failure
 *   Num            - number of states (not a state itself)
 */
enum SocketState {
SocketStateNone = 0,
SocketStateInitialized = 1,
SocketStateAccepting = 2,
SocketStateAccepted = 3,
SocketStateConnecting = 4,
SocketStateConnectPolling = 5,
SocketStateConnected = 6,
SocketStateBound = 7,
SocketStateReady = 8,
SocketStateClosed = 9,
SocketStateError = 10,
SocketStateNum = 11
};
/* Purpose tag of a Socket. The connecting side sends its type during the
 * handshake and the accepting side rejects the connection when it differs
 * from its own. */
enum SocketType {
SocketTypeUnknown = 0,
SocketTypeBootstrap = 1,
SocketTypeProxy = 2,
SocketTypeNetSocket = 3,
SocketTypeNetIb = 4
};
const char* SocketToString(union SocketAddress* addr, char* buf, const int numericHostForm = 1);
void SocketGetAddrFromString(union SocketAddress* ua, const char* ip_port_pair);
int FindInterfaceMatchSubnet(char* ifNames, union SocketAddress* localAddrs, union SocketAddress* remoteAddr,
int ifNameMaxSize, int maxIfs);
int FindInterfaces(char* ifNames, union SocketAddress* ifAddrs, int ifNameMaxSize, int maxIfs,
const char* inputIfName = nullptr);
/* TCP socket wrapper implementing a small connect/accept state machine with a
 * magic-number + type handshake. Failures are reported through ERROR(). */
class Socket {
public:
// Opens a descriptor for `addr` when it is non-null. The descriptor is made
// non-blocking when `abortFlag` or `asyncFlag` is set.
Socket(const SocketAddress* addr = nullptr, uint64_t magic = ROCSHMEM_SOCKET_MAGIC,
enum SocketType type = SocketTypeUnknown, volatile uint32_t* abortFlag = nullptr, int asyncFlag = 0);
// Calls close().
~Socket();
// Binds to the stored address and refreshes it with the assigned port.
void bind();
// bind() followed by listen(); the socket ends up in SocketStateReady.
void bindAndListen();
// Connects to the stored address; `timeout` <= 0 disables the timeout check.
void connect(int64_t timeout = -1);
// Accepts one connection from `listenSocket`; `timeout` <= 0 disables the
// timeout check.
void accept(const Socket* listenSocket, int64_t timeout = -1);
// Sends / receives exactly `size` bytes; the socket must be ready.
void send(void* ptr, int size);
void recv(void* ptr, int size);
// Receives up to `size` bytes; `*closed` is set to 1 if the peer closed.
void recvUntilEnd(void* ptr, int size, int* closed);
// Closes the descriptor (if open) and marks the socket closed.
void close();
// Plain accessors for the private fields below.
int getFd() const { return fd_; }
int getAcceptFd() const { return acceptFd_; }
int getConnectRetries() const { return connectRetries_; }
int getAcceptRetries() const { return acceptRetries_; }
volatile uint32_t* getAbortFlag() const { return abortFlag_; }
int getAsyncFlag() const { return asyncFlag_; }
enum SocketState getState() const { return state_; }
uint64_t getMagic() const { return magic_; }
enum SocketType getType() const { return type_; }
SocketAddress getAddr() const { return addr_; }
int getSalen() const { return salen_; }
private:
// State handlers dispatched by progressState().
void tryAccept();
void finalizeAccept();
void startConnect();
void pollConnect();
void finalizeConnect();
void progressState();
// Low-level transfer helpers: one progress step (optionally blocking),
// one non-blocking step with close detection, and "loop until done".
void socketProgressOpt(int op, void* ptr, int size, int* offset, int block, int* closed);
void socketProgress(int op, void* ptr, int size, int* offset);
void socketWait(int op, void* ptr, int size, int* offset);
int fd_;                        // connection (or listening) descriptor, -1 if none
int acceptFd_;                  // listening descriptor used by accept(), -1 if unset
int connectRetries_;            // number of connect retries so far
int acceptRetries_;             // number of accept retries so far
volatile uint32_t* abortFlag_;  // optional external abort request
int asyncFlag_;                 // non-zero: connect()/accept() do one step per call
enum SocketState state_;
uint64_t magic_;                // handshake magic number
enum SocketType type_;          // handshake socket type
union SocketAddress addr_;
int salen_;                     // length of the address stored in addr_
};
} // namespace rocshmem
#endif // ROCSHMEM_SOCKET_H_
+277
Visa fil
@@ -0,0 +1,277 @@
/******************************************************************************
* Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include <unistd.h>
#include <signal.h>
#include <chrono>
#include <cstring>
#include <fstream>
#include <memory>
#include <string>
#include <iostream>
#include "envvar.hpp"
#include "utils.hpp"
#include "util.hpp"
constexpr char HOSTID_FILE[32] = "/proc/sys/kernel/random/boot_id";
// Compares an interface name against a reference.
//   matchExact == true : the two strings must be identical;
//   matchExact == false: `ref` only has to be a prefix of `string`.
static bool matchIf(const char* string, const char* ref, bool matchExact) {
  if (matchExact) {
    // Include the terminating '\0' so that a mere prefix does not match.
    return strncmp(string, ref, strlen(string) + 1) == 0;
  }
  return strncmp(string, ref, strlen(ref)) == 0;
}
// Two ports match when they are equal or when either one is the wildcard -1.
static bool matchPort(const int port1, const int port2) {
  const bool eitherIsWildcard = (port1 == -1) || (port2 == -1);
  return eitherIsWildcard || port1 == port2;
}
namespace rocshmem {
/**
 * @brief Format a packed PCI bus id as "dddd:bb:dd.f".
 *
 * Bit layout of @p id: bits 0-3 function, bits 4-11 device, bits 12-19 bus,
 * remaining upper bits domain.
 */
std::string int64ToBusId(int64_t id) {
  const auto domain = (id) >> 20;
  const auto bus = (id & 0xff000) >> 12;
  const auto device = (id & 0xff0) >> 4;
  const auto function = (id & 0xf);

  char text[20];
  std::snprintf(text, sizeof(text), "%04lx:%02lx:%02lx.%01lx", domain, bus, device, function);
  return std::string(text);
}
/**
 * @brief Parse a PCI bus id string (e.g. "0000:ff:01.0") into a packed integer.
 *
 * ':' and '.' separators are skipped; parsing stops at the first character
 * that is not a hexadecimal digit or once 16 digits have been collected.
 */
int64_t busIdToInt64(const std::string busId) {
  char digits[17];  // Longest possible int64 hex string + null terminator.
  const size_t capacity = sizeof(digits) - 1;
  size_t count = 0;

  for (const char ch : busId) {
    if (count >= capacity) break;
    if (ch == '.' || ch == ':') continue;

    const bool isDecimal = (ch >= '0' && ch <= '9');
    const bool isUpperHex = (ch >= 'A' && ch <= 'F');
    const bool isLowerHex = (ch >= 'a' && ch <= 'f');
    if (!(isDecimal || isUpperHex || isLowerHex)) break;

    digits[count++] = ch;
  }
  digits[count] = '\0';
  return std::strtol(digits, NULL, 16);
}
/**
 * @brief Hash the first @p n bytes of @p string.
 *
 * Based on DJB2a: starting from 5381, each step computes
 * result = (result * 33) ^ byte.
 */
uint64_t getHash(const char* string, int n) {
  uint64_t digest = 5381;
  int idx = 0;
  while (idx < n) {
    digest = (digest * 33) ^ string[idx];
    ++idx;
  }
  return digest;
}
/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
*
* $(hostname)$(cat /proc/sys/kernel/random/boot_id)
*
* This string can be overridden by using the ROCSHMEM_HOSTID env var.
*/
uint64_t computeHostHash(void) {
const size_t hashLen = 1024;
char hostHash[hashLen];
memset(hostHash, 0, hashLen);
std::string hostName = getHostName(hashLen, '\0');
strncpy(hostHash, hostName.c_str(), hostName.size());
const std::string& hostid = envvar::bootstrap::hostid;
if (!hostid.empty()) {
strncpy(hostHash, hostid.c_str(), hashLen);
} else if (hostName.size() < hashLen) {
std::ifstream file(HOSTID_FILE, std::ios::binary);
if (file.is_open()) {
file.read(hostHash + hostName.size(), hashLen - hostName.size());
}
}
// Make sure the string is terminated
hostHash[sizeof(hostHash) - 1] = '\0';
DPRINTF("unique hostname '%s'", hostHash);
return getHash(hostHash, strlen(hostHash));
}
// Returns the host hash (see computeHostHash()). The value is cached in a
// thread_local variable, so it is computed on first use in each thread.
uint64_t getHostHash(void) {
thread_local std::unique_ptr<uint64_t> hostHash = std::make_unique<uint64_t>(computeHostHash());
// avoid crash on static destruction: if the cached pointer has already been
// released, recompute the value instead of dereferencing null
if (hostHash == nullptr) {
hostHash = std::make_unique<uint64_t>(computeHostHash());
}
return *hostHash;
}
/* Generate a hash of the unique identifying string for this process
 * that will be unique for both bare-metal and container instances.
 * Equivalent to a hash of:
 *
 * $$ $(readlink /proc/self/ns/pid)
 */
uint64_t computePidHash(void) {
  char identity[1024];

  // Start off with our pid ($$)
  std::snprintf(identity, sizeof(identity), "%ld", (long)getpid());
  int pidChars = strlen(identity);

  // Append the PID namespace link target; readlink() does not terminate the
  // string, and a failure simply contributes nothing.
  int linkChars = readlink("/proc/self/ns/pid", identity + pidChars, sizeof(identity) - 1 - pidChars);
  if (linkChars < 0) linkChars = 0;
  identity[pidChars + linkChars] = '\0';

  DPRINTF("unique PID '%s'", identity);
  return getHash(identity, strlen(identity));
}
// Returns the process hash (see computePidHash()). The value is cached in a
// thread_local variable, so it is computed on first use in each thread.
uint64_t getPidHash(void) {
thread_local std::unique_ptr<uint64_t> pidHash = std::make_unique<uint64_t>(computePidHash());
// avoid crash on static destruction: if the cached pointer has already been
// released, recompute the value instead of dereferencing null
if (pidHash == nullptr) {
pidHash = std::make_unique<uint64_t>(computePidHash());
}
return *pidHash;
}
int parseStringList(const char* string, netIf* ifList, int maxList) {
if (!string) return 0;
const char* ptr = string;
int ifNum = 0;
int ifC = 0;
char c;
do {
c = *ptr;
if (c == ':') {
if (ifC > 0) {
ifList[ifNum].prefix[ifC] = '\0';
ifList[ifNum].port = atoi(ptr + 1);
ifNum++;
ifC = 0;
}
while (c != ',' && c != '\0') c = *(++ptr);
} else if (c == ',' || c == '\0') {
if (ifC > 0) {
ifList[ifNum].prefix[ifC] = '\0';
ifList[ifNum].port = -1;
ifNum++;
ifC = 0;
}
} else {
ifList[ifNum].prefix[ifC] = c;
ifC++;
}
ptr++;
} while (ifNum < maxList && c);
return ifNum;
}
bool matchIfList(const char* string, int port, netIf* ifList, int listSize, bool matchExact) {
// Make an exception for the case where no user list is defined
if (listSize == 0) return true;
for (int i = 0; i < listSize; i++) {
if (matchIf(string, ifList[i].prefix, matchExact) && matchPort(port, ifList[i].port)) {
return true;
}
}
return false;
}
/* Fill `buffer` with `bytes` bytes of random data read from /dev/urandom.
 * A request for zero bytes is a no-op; any failure is reported via ERROR(). */
void getRandomData(void* buffer, size_t bytes) {
  if (bytes == 0) return;

  const size_t expectedItems = 1UL;
  FILE* urandom = fopen("/dev/urandom", "r");

  // The whole request is read as a single item so a short read counts as failure.
  const bool ok = buffer != NULL && urandom != NULL && fread(buffer, bytes, expectedItems, urandom) == expectedItems;
  if (!ok) {
    ERROR("Failed to read random data\n");
    return;
  }
  if (urandom) fclose(urandom);
}
} // namespace rocshmem
// SIGALRM handler installed by Timer::set(): reports the timeout on stderr and
// aborts the process.
//
// Only async-signal-safe calls are used here (signal, write, abort); stdio
// functions such as fprintf must not be called from a signal handler.
static void sigalrmTimeoutHandler(int) {
  signal(SIGALRM, SIG_IGN);
  static const char message[] = "Timer timed out\n";
  // Best effort: there is nothing useful to do if the write itself fails.
  ssize_t written = write(STDERR_FILENO, message, sizeof(message) - 1);
  (void)written;
  abort();
}
namespace rocshmem {
/// Starts the timer; a positive @p timeout (seconds) also arms a SIGALRM alarm.
Timer::Timer(int timeout) {
  this->set(timeout);
}
/// Cancels a pending alarm and restores the default SIGALRM disposition, but
/// only when this timer armed one (timeout_ > 0).
Timer::~Timer() {
  if (timeout_ <= 0) return;
  alarm(0);
  signal(SIGALRM, SIG_DFL);
}
/// Returns the time elapsed since the last set()/reset(), in microseconds.
int64_t Timer::elapsed() const {
  const auto now = std::chrono::steady_clock::now();
  const auto delta = std::chrono::duration_cast<std::chrono::microseconds>(now - start_);
  return delta.count();
}
/// (Re)starts the timer. A positive @p timeout installs the SIGALRM handler
/// and schedules an alarm after that many seconds; otherwise no alarm is used.
void Timer::set(int timeout) {
  timeout_ = timeout;
  const bool armAlarm = (timeout > 0);
  if (armAlarm) {
    signal(SIGALRM, sigalrmTimeoutHandler);
    alarm(timeout);
  }
  // Taken last so that elapsed() does not include the setup above.
  start_ = std::chrono::steady_clock::now();
}
void Timer::reset() { set(timeout_); }
/// Prints "<name> : <elapsed microseconds>" to stdout.
void Timer::print(const std::string& name) {
  const auto micros = elapsed();
  printf("%s : %ld\n", name.c_str(), micros);
}
/// Starts a timer (without alarm) labelled @p name; the elapsed time is
/// printed when the object is destroyed.
ScopedTimer::ScopedTimer(const std::string& name)
    : Timer(),
      name_(name) {
}
/// Prints the elapsed time under the label given at construction.
ScopedTimer::~ScopedTimer() {
  this->print(name_);
}
/**
 * @brief Return the host name, cut at the first occurrence of @p delim.
 *
 * @param[in] maxlen Size passed to gethostname(); the result holds at most
 *                   maxlen - 1 characters. Non-positive values yield "".
 * @param[in] delim  Character at which the name is cut (e.g. '.' to drop the
 *                   domain part, '\0' to keep the whole name).
 *
 * @return The (possibly truncated) host name. A gethostname() failure is
 *         reported through ERROR().
 */
std::string getHostName(int maxlen, const char delim) {
  // Guard against sizes that would make the buffer allocation nonsensical.
  if (maxlen <= 0) return std::string();

  // One extra zero byte guarantees termination even if gethostname() truncates.
  std::string hostname(static_cast<size_t>(maxlen) + 1, '\0');
  if (gethostname(&hostname[0], maxlen) != 0) {
    ERROR("gethostname failed\n");
    // A std::string must never be built from nullptr; return an empty one.
    return std::string();
  }

  const size_t limit = static_cast<size_t>(maxlen - 1);
  size_t end = 0;
  while (end < limit && hostname[end] != delim && hostname[end] != '\0') ++end;
  return hostname.substr(0, end);
}
} // namespace rocshmem
+102
Visa fil
@@ -0,0 +1,102 @@
/******************************************************************************
* Copyright (c) 2016-2022, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) Microsoft Corporation.
* Modifications Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef ROCSHMEM_UTILS_HPP_
#define ROCSHMEM_UTILS_HPP_
#include <chrono>
#include <cstdint>
#include <cstdio>
// Prints a printf-style message to stderr and aborts the process; it never
// returns to the caller. The macro expands to a bare braced block.
#define ERROR(...) { fprintf(stderr, __VA_ARGS__); abort(); }
namespace rocshmem {
/// Stopwatch based on std::chrono::steady_clock, with an optional
/// alarm()-based timeout.
struct Timer {
/// Time point of the last set()/reset().
std::chrono::steady_clock::time_point start_;
/// Timeout passed to alarm(); values <= 0 mean no alarm is armed.
int timeout_;
/// Starts the timer; see set() for the meaning of @p timeout.
Timer(int timeout = -1);
/// Cancels the alarm if one was armed.
~Timer();
/// Returns the elapsed time in microseconds.
int64_t elapsed() const;
/// Restarts the timer and, if @p timeout > 0, arms a SIGALRM alarm.
void set(int timeout);
/// Restarts the timer with the current timeout.
void reset();
/// Prints "<name> : <elapsed microseconds>" to stdout.
void print(const std::string& name);
};
/// Timer that prints its elapsed time, labelled with name_, when destroyed.
struct ScopedTimer : public Timer {
/// Label used when printing.
const std::string name_;
ScopedTimer(const std::string& name);
~ScopedTimer();
};
std::string getHostName(int maxlen, const char delim);
// PCI Bus ID <-> int64 conversion functions
std::string int64ToBusId(int64_t id);
int64_t busIdToInt64(const std::string busId);
uint64_t getHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();
void getRandomData(void* buffer, size_t bytes);
/// One entry of a parsed interface list (see parseStringList()).
struct netIf {
/// Interface name or name prefix, NUL-terminated.
char prefix[64];
/// Port given after ':' in the list, or -1 when none was specified.
int port;
};
int parseStringList(const char* string, struct netIf* ifList, int maxList);
bool matchIfList(const char* string, int port, struct netIf* ifList, int listSize, bool matchExact);
/// Mixes the std::hash of @p v into the running hash value @p hash.
template <class T>
inline void hashCombine(std::size_t& hash, const T& v) {
  const std::size_t valueHash = std::hash<T>{}(v);
  const std::size_t mixed = valueHash + 0x9e3779b9 + (hash << 6) + (hash >> 2);
  hash ^= mixed;
}
/// Hash functor for std::pair, usable as the hasher of unordered containers.
/// Combines the hashes of both members, first then second.
struct PairHash {
  template <typename T, typename U>
  std::size_t operator()(const std::pair<T, U>& x) const {
    std::size_t seed = 0;
    hashCombine(seed, x.first);
    hashCombine(seed, x.second);
    return seed;
  }
};
} // namespace rocshmem
#endif // ROCSHMEM_UTILS_HPP_
+68
Visa fil
@@ -0,0 +1,68 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONSTANTS_HPP_
#define LIBRARY_SRC_CONSTANTS_HPP_
/**
* @file constants.hpp
*
* @brief Contains global constants for rocSHMEM library
*/
namespace rocshmem {
/**
* @brief Minimum object alignment (in bytes) for symmetric heap.
*
* @note Cache line size on most systems is either 64 or 128.
*/
inline const unsigned ALIGNMENT{128};
/**
* @brief Constant number which holds maximum workgroup size.
*
* @todo Move this constant into a class that specifically holds device
* hardware information. If this device class existed, we could consolidate
* the various flavours of the Instinct cards into their own groups and then
* set these hard-coded fields by querying the rocm runtime during our
* library initialization.
*/
inline const unsigned MAX_WG_SIZE{1024};
/**
* @brief Constant number which holds the wavefront size
*
* @note Wavefront size on most systems is either 32 or 64.
*
* The value is chosen at compile time: 64 when one of the architecture
* macros below is defined, 32 otherwise.
* NOTE(review): these macros are presumably only defined while compiling
* device code for the matching target, in which case host code would see 32;
* confirm before relying on this constant in host code.
*/
#if defined(__gfx90a__) || defined(__gfx942__) || defined (__gfx950__)
inline const int WF_SIZE{64};
#else
inline const int WF_SIZE{32};
#endif
} // namespace rocshmem
#endif // LIBRARY_SRC_CONSTANTS_HPP_
@@ -0,0 +1,33 @@
###############################################################################
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to
# deal in the Software without restriction, including without limitation the
# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
# sell copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
###############################################################################
###############################################################################
# ADD ROCSHMEM TARGET FOR FILES IN CURRENT DIRECTORY
###############################################################################
# Add this directory's sources to the ${PROJECT_NAME} target. PRIVATE: they
# are compiled into the target itself and not propagated to its consumers.
target_sources(
${PROJECT_NAME}
PRIVATE
share_strategy.cpp
strategies.cpp
)
+201
Visa fil
@@ -0,0 +1,201 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_ARRAY_HPP_
#define LIBRARY_SRC_CONTAINERS_ARRAY_HPP_
#include <hip/hip_runtime.h>
#include "index_strategy.hpp"
#include "strategies.hpp"
#include "memory_allocator.hpp"
namespace rocshmem {
/**
* @brief Fixed-size array of TYPE whose storage is obtained from a
* MemoryAllocator and which can be indexed from both host and device code.
*
* Device-side bulk operations (fill/copy) walk the array using the
* IndexStrategy captured at construction time.
*/
template <typename TYPE>
class Array {
public:
/**
* @brief Mutable element access.
*
* Asserts that index is smaller than size().
*
* @param[in] index Position of the element.
*
* @return Reference to the element stored at index.
*/
__host__ __device__ TYPE& operator[](size_t index);
/**
* @brief Read-only element access.
*
* Asserts that index is smaller than size().
*
* @param[in] index Position of the element.
*
* @return Const reference to the element stored at index.
*/
__host__ __device__ const TYPE& operator[](size_t index) const;
/**
* @brief Number of elements the array was created with.
*
* @return The element count.
*/
__host__ __device__ size_t size() const;
/**
* @brief Host overload: assigns v to every element in
* [start_index, start_index + length), one after another.
*
* @param[in] v Value to store.
* @param[in] start_index First index to write.
* @param[in] length Number of elements to write.
*
* @return void
*/
__host__ void fill(TYPE v, size_t start_index, size_t length);
/**
* @brief Device overload: assigns v to elements of
* [start_index, start_index + length) selected by the index strategy.
*
* The calling thread starts at start_index + _dev_idx.start() and advances
* with _dev_idx.next(); iteration also stops once the index reaches size().
*
* @param[in] v Value to store.
* @param[in] start_index Base index of the range.
* @param[in] length Number of elements in the range.
*
* @return void
*/
__device__ void fill(TYPE v, size_t start_index, size_t length);
/**
* @brief Fills the whole array with the value {0, 0}.
*
* Implemented as fill({0, 0}, 0, size()), so TYPE must be initializable
* from the braced list {0, 0}.
*
* @return void
*/
__host__ __device__ void zero();
/**
* @brief Host overload: copies the elements at indices
* [start_index, start_index + length) from other into the same indices of
* this array.
*
* @param[in] other Source array; asserted to be non-null.
* @param[in] start_index First index to copy.
* @param[in] length Number of elements to copy.
*
* @return void
*/
__host__ void copy(const Array* other, size_t start_index,
size_t length); // NOLINT
/**
* @brief Device overload: copies elements of
* [start_index, start_index + length) from other into the same indices of
* this array, visiting the indices selected by the index strategy
* (_dev_idx.start() / _dev_idx.next()).
*
* @param[in] other Source array; asserted to be non-null.
* @param[in] start_index Base index of the range.
* @param[in] length Number of elements in the range.
*
* @return void
*/
__device__ void copy(const Array* other, size_t start_index, // NOLINT
size_t length);
/**
* @brief Creates an empty array: no storage and a size of zero.
*/
Array() = default;
/**
* @brief Creates an array of array_size elements.
*
* Requests array_size * sizeof(TYPE) bytes from allocator. The elements
* are not initialized by this constructor.
*
* @param[in] array_size Number of elements.
* @param[in] allocator Allocator used to obtain and later release storage.
* @param[in] strategy Source of the device index strategy; its
* index_strategy_two member is copied. Must not be null.
*/
Array(size_t array_size, MemoryAllocator allocator,
const ObjectStrategy* strategy);
/**
* @brief Returns the storage to the allocator if any was allocated.
*/
~Array();
/**
* @brief Deep copy: allocates storage for other's element count and
* memcpy's other's elements into it.
*
* @param[in] other Array to duplicate.
*/
Array(const Array& other);
/**
* @brief Deep-copy assignment.
*
* Releases the current storage, adopts rhs's size and allocator, allocates
* new storage and memcpy's rhs's elements into it. Self-assignment is a
* no-op.
*
* @param[in] rhs Array to duplicate.
*
* @return Reference to this array.
*/
Array& operator=(const Array& rhs);
/**
* @brief Prints the array from the single thread whose
* Identity::global_thread_id() is 0; all other threads do nothing.
*/
__device__ void zero_thread_dump();
/**
* @brief Prints the calling thread's id followed by the array contents,
* serialized across threads with GLOBAL_DEVICE_PRINT_LOCK.
*/
__device__ void any_thread_dump();
private:
/**
* @brief Prints the elements with printf/dump(), starting a new line
* (prefixed with the element index) every dump_num_data_per_line elements.
*/
__device__ void _dump();
/**
* @brief Allocator that owns the storage pointed to by _array.
*
* @note _allocator is required to be declared before '_array' in this
* class because _allocator deallocates _array in class destructor.
*/
MemoryAllocator _allocator{};
protected:
/**
* @brief Pointer to the first element; nullptr when nothing is allocated.
*/
TYPE* _array{nullptr};
/**
* @brief Number of elements in _array.
*/
size_t _size{0};
private:
/**
* @brief Index strategy used by the device overloads of fill() and copy()
* to pick the indices a thread visits.
*/
IndexStrategy _dev_idx{};
/**
* @brief Number of elements _dump() prints per output line.
*/
const size_t dump_num_data_per_line{8};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_ARRAY_HPP_
@@ -0,0 +1,190 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_ARRAY_IMPL_HPP_
#define LIBRARY_SRC_CONTAINERS_ARRAY_IMPL_HPP_
#include "array.hpp"
#include "constants.hpp"
#include <hip/hip_runtime.h>
#include <cassert>
namespace rocshmem {
// Defined in another translation unit. Array::any_thread_dump() uses the
// pointed-to int as a spin lock around device printf output:
// 0 = free, 1 = held.
extern __constant__ int* GLOBAL_DEVICE_PRINT_LOCK;
/**
 * Sizing constructor: remembers the allocator, the element count and the
 * second index strategy of `strategy`, then asks the allocator for enough
 * bytes to hold `array_size` elements. Elements are left uninitialized.
 */
template <typename TYPE>
Array<TYPE>::Array(size_t array_size, MemoryAllocator allocator,
                   const ObjectStrategy* strategy)
    : _allocator(allocator),
      _size(array_size),
      _dev_idx(strategy->index_strategy_two) {
  const size_t num_bytes = array_size * sizeof(TYPE);
  void** storage_slot = reinterpret_cast<void**>(&_array);
  _allocator.allocate(storage_slot, num_bytes);
}
/**
 * Hands the element storage back to the allocator. An array that never
 * obtained storage has nothing to release.
 */
template <typename TYPE>
Array<TYPE>::~Array() {
  const bool owns_storage = (_array != nullptr);
  if (!owns_storage) {
    return;
  }
  _allocator.deallocate(_array);
}
/**
 * @brief Deep-copy constructor.
 *
 * Adopts `other`'s allocator (matching what operator= does) so the copy is
 * allocated and later released through the same allocator as the source,
 * instead of through a default-constructed one. An `other` without storage
 * yields a copy without storage; nothing is allocated or copied in that case.
 *
 * NOTE(review): `_dev_idx` keeps its default value here, exactly as in
 * operator=; confirm whether the index strategy should be propagated too.
 *
 * @param[in] other Array to duplicate.
 */
template <typename TYPE>
Array<TYPE>::Array(const Array<TYPE>& other)
    : _allocator(other._allocator), _size(other._size) {
  // memcpy from a null source is undefined, even for a length of zero.
  if (other._array == nullptr) {
    return;
  }
  _allocator.allocate(reinterpret_cast<void**>(&_array), _size * sizeof(TYPE));
  assert(_array);
  memcpy(_array, other._array, _size * sizeof(TYPE));
}
/**
 * @brief Deep-copy assignment.
 *
 * The old storage is released with the allocator that produced it, after
 * which `rhs`'s size and allocator are adopted and `rhs`'s elements are
 * copied into freshly allocated storage.
 *
 * @param[in] rhs Array to duplicate.
 *
 * @return Reference to this array.
 */
template <typename TYPE>
Array<TYPE>& Array<TYPE>::operator=(const Array<TYPE>& rhs) {
  if (this == &rhs) {
    return *this;
  }
  if (_array) {
    // Must happen before _allocator is replaced: the old block belongs to
    // the old allocator.
    _allocator.deallocate(_array);
    // Never leave a dangling pointer behind; if the allocation below does
    // not produce a block, the destructor must not free the old one again.
    _array = nullptr;
  }
  _size = rhs._size;
  _allocator = rhs._allocator;
  // An rhs without storage leaves this array without storage as well;
  // memcpy from a null source would be undefined.
  if (rhs._array == nullptr) {
    return *this;
  }
  _allocator.allocate(reinterpret_cast<void**>(&_array), _size * sizeof(TYPE));
  assert(_array);
  memcpy(_array, rhs._array, _size * sizeof(TYPE));
  return *this;
}
/**
 * Mutable element access; the index is checked against the element count
 * with an assertion.
 */
template <typename TYPE>
__host__ __device__ TYPE& Array<TYPE>::operator[](size_t index) {
  assert(index < _size);
  TYPE* const slot = _array + index;
  return *slot;
}
/**
 * Read-only element access; the index is checked against the element count
 * with an assertion.
 */
template <typename TYPE>
__host__ __device__ const TYPE& Array<TYPE>::operator[](size_t index) const {
  assert(index < _size);
  const TYPE* const slot = _array + index;
  return *slot;
}
/**
 * Reports how many elements the array holds.
 */
template <typename TYPE>
__host__ __device__ size_t Array<TYPE>::size() const {
  const size_t element_count = _size;
  return element_count;
}
/**
 * Host-side fill: walks [start_index, start_index + length) sequentially and
 * stores `v` in every slot. Bounds are enforced only by operator[]'s
 * assertion.
 */
template <typename TYPE>
__host__ void Array<TYPE>::fill(TYPE v, size_t start_index, size_t length) {
  const size_t stop = start_index + length;
  size_t pos = start_index;
  while (pos < stop) {
    (*this)[pos] = v;
    ++pos;
  }
}
/**
 * Device-side fill: the calling thread begins at the offset handed out by
 * the index strategy and keeps asking the strategy for its next index until
 * it leaves either the requested range or the array itself.
 */
template <typename TYPE>
__device__ void Array<TYPE>::fill(TYPE v, size_t start_index, size_t length) {
  const size_t stop = start_index + length;
  for (size_t pos = start_index + _dev_idx.start();
       (pos < stop) && (pos < _size); pos = _dev_idx.next(pos)) {
    (*this)[pos] = v;
  }
}
/**
 * Overwrites every element with the value built from the braced list {0, 0}.
 * Dispatches to the host or device fill() depending on where it is compiled.
 */
template <typename TYPE>
__host__ __device__ void Array<TYPE>::zero() {
  const size_t first = 0;
  const size_t count = size();
  fill({0, 0}, first, count);
}
/**
 * Host-side copy: element i of `other` is assigned to element i of this
 * array for every i in [start_index, start_index + length).
 */
template <typename TYPE>
__host__ void Array<TYPE>::copy(const Array<TYPE>* other,
                                size_t start_index,  // NOLINT
                                size_t length) {
  assert(other);
  const Array<TYPE>& source = *other;
  const size_t stop = start_index + length;
  size_t pos = start_index;
  while (pos < stop) {
    (*this)[pos] = source[pos];
    ++pos;
  }
}
/**
 * Device-side copy: the calling thread visits the indices the index strategy
 * assigns to it inside [start_index, start_index + length) and copies each
 * visited element from `other` into the same position of this array.
 */
template <typename TYPE>
__device__ void Array<TYPE>::copy(const Array<TYPE>* other,  // NOLINT
                                  size_t start_index, size_t length) {
  assert(other);
  const Array<TYPE>& source = *other;
  const size_t stop = start_index + length;
  size_t pos = start_index + _dev_idx.start();
  while (pos < stop) {
    (*this)[pos] = source[pos];
    pos = _dev_idx.next(pos);
  }
}
/**
 * Dumps the array from exactly one thread: the one whose global thread id
 * is zero. Every other caller returns without printing.
 */
template <typename TYPE>
__device__ void Array<TYPE>::zero_thread_dump() {
  Identity id{};
  const bool is_thread_zero = (id.global_thread_id() == 0);
  if (!is_thread_zero) {
    return;
  }
  _dump();
}
/**
 * @brief Prints the caller's global thread id followed by the array
 * contents, one thread at a time.
 *
 * The loop steps through the lane slots 0 .. WF_SIZE-1; a thread only acts
 * in the iteration that matches its own (local_thread_id() % WF_SIZE), so
 * the lock is contended by one slot per iteration.
 */
template <typename TYPE>
__device__ void Array<TYPE>::any_thread_dump() {
Identity id{};
for (int i = 0; i < WF_SIZE; i++) {
if ((id.local_thread_id() % WF_SIZE) == i) {
// Spin until the lock word is swapped from 0 (free) to 1 (held).
while (atomicCAS(GLOBAL_DEVICE_PRINT_LOCK, 0, 1) == 1) {
}
printf("(thread %lu)\n", id.global_thread_id());
_dump();
// Release the lock with a plain store.
*GLOBAL_DEVICE_PRINT_LOCK = 0;
}
}
}
/**
 * Prints the array through device printf. A new output line, prefixed with
 * the element index, is started every `dump_num_data_per_line` elements; the
 * element itself is printed by the external dump() helper.
 */
template <typename TYPE>
__device__ void Array<TYPE>::_dump() {
  Thread_Contiguous_Block_Agnostic idx(_size);
  size_t pos = idx.start();
  while (pos < idx.end()) {
    // Row break plus row label at the start of each group of elements.
    const bool starts_new_row = (pos % dump_num_data_per_line) == 0;
    if (starts_new_row) {
      printf("\n");
      printf("%10lu ", pos);
    }
    // Element payload.
    dump((*this)[pos]);
    pos = idx.next(pos);
  }
  printf("\n");
}
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_ARRAY_IMPL_HPP_
@@ -0,0 +1,360 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_ATOMIC_WF_QUEUE_HPP_
#define LIBRARY_SRC_CONTAINERS_ATOMIC_WF_QUEUE_HPP_
#include <hip/hip_runtime.h>
#include "memory/hip_allocator.hpp"
#include "sync/abql_block_mutex.hpp"
#include "util.hpp"
namespace rocshmem {
/*****************************************************************************
******************************* WAVE FREE LIST ******************************
*****************************************************************************/
/**
* @brief Bounded circular FIFO queue that is filled on the host with push()
* and consumed/produced on the device with dequeue()/enqueue().
*
* On the device only the lowest active lane of a wavefront touches the
* queue; enqueue and dequeue are each serialized by their own mutex, and the
* element count is shared between the two sides through atomic operations.
*
* @tparam TYPE Element type stored in the queue.
* @tparam ALLOCATOR Allocator used for the queue storage and the mutexes.
*/
template <typename TYPE, typename ALLOCATOR = HIPDefaultFinegrainedAllocator>
class AtomicWFQueue {
using MutexProxyType = ABQLBlockMutexProxy<ALLOCATOR>;
using MutexType = ABQLBlockMutex;
/**
* @brief A lock guard for ticket-based locks that follows the design of
* `std::lock_guard`.
*
* @tparam MUTEX The type of the ticket-based mutex to lock.
*/
template <typename MUTEX>
struct TicketLockGuard {
/**
* @brief Constructs the `TicketLockGuard` and locks the mutex.
*
* The ticket returned by `lock()` is kept for the matching `unlock()`. A
* `__threadfence()` is issued after the lock is taken.
*
* @param m Mutex to take ownership of.
*/
__device__ explicit TicketLockGuard(MUTEX& m) : mutex_{m} {
ticket_ = mutex_.lock();
__threadfence();
}
/**
* @brief Lock guards are not copyable
*/
__device__ TicketLockGuard(const TicketLockGuard&) = delete;
/**
* @brief Lock guards are not moveable
*/
__device__ TicketLockGuard(TicketLockGuard&&) = delete;
/**
* @brief Destructor that unlocks the mutex.
*
* A `__threadfence()` is issued before the mutex is released with the
* ticket obtained in the constructor.
*/
__device__ ~TicketLockGuard() {
__threadfence();
mutex_.unlock(ticket_);
}
private:
using TicketT = uint64_t;
MUTEX& mutex_;
TicketT ticket_;
};
public:
/**
* @brief Construct a new AtomicWFQueue object
*
* The queue starts without storage; call allocate_queue() before use.
*
* @param allocator Allocator to use for allocating internal structures of the
* AtomicWFQueue.
*/
explicit AtomicWFQueue(const ALLOCATOR& allocator = ALLOCATOR());
/**
* @brief Destroy the AtomicWFQueue object
*
* Does not release the queue storage; see deallocate_queue().
*/
~AtomicWFQueue();
/**
* @brief Enqueues an element into the AtomicWFQueue.
*
* This function inserts the specified value at the position indicated by
* the `tail_` of the AtomicWFQueue and increases the AtomicWFQueue size
* by one. The enqueue operation follows a first-come, first-serve
* execution order. Only the lowest active lane of the calling wavefront
* performs the insertion.
*
* @param val The value to be inserted into the AtomicWFQueue.
*/
__device__ void enqueue(const TYPE& val);
/**
* @brief Dequeues an element from the AtomicWFQueue.
*
* This function dequeues the element pointed to by the `head_` of the
* AtomicWFQueue and decreases the AtomicWFQueue size by one. If the
* AtomicWFQueue is empty, the function waits until an element becomes
* available. The dequeue operation follows a first-come, first-serve
* execution order.
*
* The lowest active lane removes the element and broadcasts it to the
* other active lanes; every lane then adds its own logical lane id to the
* value, so TYPE must support `+=` with an unsigned integer.
*
* @return The dequeued element plus the caller's logical lane id.
*/
__device__ TYPE dequeue();
/**
* @brief Inserts a new element at the end of the AtomicWFQueue.
*
* This function adds the specified value to the end of the AtomicWFQueue,
* updating the `tail_` and `curr_size_` accordingly. It is intended for
* initializing the AtomicWFQueue with initial values. If the queue is
* already full, the value is dropped and a message is written to std::cerr.
*
* @note This function is not thread-safe and should only be used during
* the AtomicWFQueue initialization phase or in scenarios where thread
* safety is not a concern.
*
* @param val The value to be inserted into the AtomicWFQueue.
*/
__host__ void push(const TYPE& val);
/**
* @brief Allocates and initializes the AtomicWFQueue.
*
* This function allocates memory for the AtomicWFQueue with the specified
* size and initializes the AtomicWFQueue's head, tail, current size, and
* maximum size variables to their appropriate starting values.
*
* @param size The maximum number of elements the AtomicWFQueue can hold.
*/
__host__ void allocate_queue(unsigned int size);
/**
* @brief Deallocates the AtomicWFQueue and resets its internal variables.
*
* This function frees the memory allocated for the AtomicWFQueue and resets
* the AtomicWFQueue's internal variables such as head, tail, current size,
* and maximum size to their default or zero-initialized values.
*/
__host__ void deallocate_queue();
/**
* @brief Retrieves the logical lane ID of the calling thread.
*
* This function returns the active logical lane ID of the current thread
* within the wavefront: the number of active lanes whose physical lane id
* is lower than the caller's. The lowest active lane therefore gets 0.
*
* @return The logical lane ID of the active thread within the wavefront.
*/
__device__ unsigned int active_logical_lane_id();
/**
* @brief Broadcasts a value to other threads in the wavefront.
*
* The lane that passes `lowest_active == true` stores `val` in a
* per-wavefront shared-memory slot; every calling lane then returns the
* content of that slot.
*
* @param lowest_active True on the lane that provides the value (the
* lowest active lane of the wavefront).
* @param val The value to be broadcasted.
*
* @return The broadcasted value received by each thread in the wavefront.
*/
__device__ TYPE broadcast_lds(bool lowest_active, TYPE val);
/**
* @brief Retrieves the maximum capacity of the AtomicWFQueue.
*
* This function returns the total size of the AtomicWFQueue, representing
* the maximum number of elements it can hold.
*
* @return The maximum capacity of the AtomicWFQueue.
*/
__host__ __device__ int get_queue_size() {
return size_;
}
/**
* @brief Retrieves the current number of elements in the AtomicWFQueue.
*
* This function returns the current size of the AtomicWFQueue, representing
* the total number of elements currently stored. The value is read with a
* plain (non-atomic) load.
*
* @return The current number of elements in the AtomicWFQueue.
*/
__host__ __device__ int get_curr_size() {
return curr_size_;
}
/**
* @brief Retrieves the tail index of the AtomicWFQueue.
*
* This function returns the current index of the tail in the
* AtomicWFQueue, which represents the position where the next
* element will be enqueued.
*
* @return The index of the tail in the AtomicWFQueue.
*/
__host__ __device__ int get_tail() {
return tail_;
}
/**
* @brief Retrieves the head index of the AtomicWFQueue.
*
* This function returns the current index of the head in the
* AtomicWFQueue, which represents the position of the next element
* to be dequeued.
*
* @return The index of the head in the AtomicWFQueue.
*/
__host__ __device__ int get_head() {
return head_;
}
private:
/**
* @brief Sequentially consistent, agent-scope atomic load of `*address`.
*/
__device__ int atomic_load(const int* address) {
return __hip_atomic_load(address, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_AGENT);
}
/**
* @brief Sequentially consistent, agent-scope atomic store of `val` to
* `*address`.
*/
__device__ void atomic_store(int* address, const int val) {
__hip_atomic_store(address, val, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_AGENT);
}
/**
* @brief Sequentially consistent, agent-scope atomic `*address += val`;
* the previous value is discarded.
*/
__device__ void atomic_add(int* address, const int val) {
__hip_atomic_fetch_add(address, val, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_AGENT);
}
/**
* @brief Sequentially consistent, agent-scope atomic `*address -= val`;
* the previous value is discarded.
*/
__device__ void atomic_sub(int* address, const int val) {
__hip_atomic_fetch_sub(address, val, __ATOMIC_SEQ_CST,
__HIP_MEMORY_SCOPE_AGENT);
}
/**
* @brief Checks if the AtomicWFQueue is full.
*
* This function determines whether the AtomicWFQueue has reached its
* maximum capacity. It is used to prevent overflow conditions during
* enqueue operations.
*
* @return true if the AtomicWFQueue is full, false otherwise.
*/
__device__ bool is_full() {
return atomic_load(&curr_size_) == size_;
}
/**
* @brief Checks if the AtomicWFQueue is empty.
*
* This function determines whether the AtomicWFQueue has no elements
* available for dequeue operations. It is used to prevent underflow
* conditions.
*
* @return true if the AtomicWFQueue is empty, false otherwise.
*/
__device__ bool is_empty() {
return atomic_load(&curr_size_) == 0;
}
/**
* @brief Internal memory allocator used to create internal structures of
* the AtomicWFQueue.
*/
ALLOCATOR allocator_{};
/**
* @brief Index of the first (oldest) element in the AtomicWFQueue.
*/
int head_{};
/**
* @brief Index of the next empty slot in the AtomicWFQueue.
*/
int tail_{};
/**
* @brief Capacity of the AtomicWFQueue (maximum number of elements).
*/
int size_{};
/**
* @brief Number of elements currently stored in the AtomicWFQueue.
*/
int curr_size_{};
/**
* @brief Pointer to AtomicWFQueue memory
*/
TYPE *queue_{nullptr};
/**
* @brief Mutex protecting the AtomicWFQueue mutations during dequeue.
*/
MutexProxyType dequeue_mutex_;
/**
* @brief Mutex protecting the AtomicWFQueue mutations during enqueue.
*/
MutexProxyType enqueue_mutex_;
};
/**
 * @brief Owns an AtomicWFQueue that lives in memory obtained through a
 * DeviceProxy, so the same queue object can be reached from host and device.
 *
 * The queue is constructed in place inside the proxy storage and torn down
 * again (storage release followed by an explicit destructor call) when the
 * proxy object is destroyed.
 *
 * @tparam ALLOCATOR Allocator handed to DeviceProxy and to the queue.
 * @tparam TYPE Element type of the queue.
 */
template <typename ALLOCATOR, typename TYPE>
class AtomicWFQueueProxy {
  using AtomicWFQueueT = AtomicWFQueue<TYPE, ALLOCATOR>;
  using ProxyT = DeviceProxy<ALLOCATOR, AtomicWFQueueT>;

 public:
  /**
   * @brief Pointer to the proxied queue.
   */
  __host__ __device__ AtomicWFQueueT* get() { return proxy_.get(); }

  /**
   * @brief Allocates proxy storage and constructs a queue in it.
   *
   * NOTE(review): only the object at proxy_.get() is constructed (and later
   * destroyed), whatever num_elems is — confirm callers never rely on more
   * than one element.
   *
   * @param num_elems Element count forwarded to the DeviceProxy.
   */
  AtomicWFQueueProxy(size_t num_elems = 1) : proxy_{num_elems} {
    new (proxy_.get()) AtomicWFQueueT();
  }

  AtomicWFQueueProxy(const AtomicWFQueueProxy& other) = delete;

  AtomicWFQueueProxy& operator=(const AtomicWFQueueProxy& other) = delete;

  AtomicWFQueueProxy(AtomicWFQueueProxy&& other) = default;

  AtomicWFQueueProxy& operator=(AtomicWFQueueProxy&& other) = default;

  /**
   * @brief Releases the queue storage and runs the queue's destructor.
   */
  ~AtomicWFQueueProxy() {
    auto atomic_wf_queue = proxy_.get();
    // The class is movable, so this object may no longer refer to a queue.
    // NOTE(review): assumes a moved-from DeviceProxy reports nullptr from
    // get() — confirm against DeviceProxy.
    if (atomic_wf_queue == nullptr) {
      return;
    }
    atomic_wf_queue->deallocate_queue();
    atomic_wf_queue->~AtomicWFQueue();
  }

 private:
  ProxyT proxy_{};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_ATOMIC_WF_QUEUE_HPP_
@@ -0,0 +1,168 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "atomic_wf_queue.hpp"
#include <iostream>
#include <hip/hip_runtime.h>
#include <cassert>
namespace rocshmem {
/*****************************************************************************
******************************* WAVE FREE LIST ******************************
*****************************************************************************/
/**
 * @brief Intentionally empty. The queue storage is released by
 * deallocate_queue(), which AtomicWFQueueProxy's destructor calls before it
 * invokes this destructor.
 */
template <typename TYPE, typename ALLOCATOR>
AtomicWFQueue<TYPE, ALLOCATOR>::~AtomicWFQueue() {}
/**
 * Returns the queue buffer (if there is one) to the allocator and puts all
 * bookkeeping fields back to zero, leaving an empty queue without capacity.
 */
template <typename TYPE, typename ALLOCATOR>
__host__ void AtomicWFQueue<TYPE, ALLOCATOR>::deallocate_queue() {
  const bool has_buffer = (queue_ != nullptr);
  if (has_buffer) {
    void* raw_buffer = static_cast<void*>(queue_);
    allocator_.deallocate(raw_buffer);
    queue_ = nullptr;
  }
  head_ = 0;
  tail_ = 0;
  curr_size_ = 0;
  size_ = 0;
}
/**
 * Stores the allocator and starts with an empty queue of zero capacity.
 * Initializers are listed in member declaration order, which is the order
 * in which they are executed.
 */
template <typename TYPE, typename ALLOCATOR>
AtomicWFQueue<TYPE, ALLOCATOR>::AtomicWFQueue(const ALLOCATOR& allocator)
    : allocator_{allocator},
      head_{0},
      tail_{0},
      size_{0},
      curr_size_{0} {}
/**
 * @brief Allocates storage for `size` elements and resets the queue to the
 * empty state.
 *
 * A buffer left over from an earlier allocate_queue() call is released
 * first, so re-initializing the queue does not leak it. The previous
 * contents are discarded in that case.
 *
 * @param size Capacity of the queue in elements.
 */
template <typename TYPE, typename ALLOCATOR>
__host__ void AtomicWFQueue<TYPE, ALLOCATOR>::allocate_queue(
    unsigned int size) {
  if (queue_ != nullptr) {
    allocator_.deallocate((void*)queue_);
    queue_ = nullptr;
  }
  size_ = size;
  head_ = 0;
  tail_ = 0;
  curr_size_ = 0;
  allocator_.allocate(reinterpret_cast<void**>(&queue_),
                      sizeof(TYPE) * size_);
}
/**
 * Host-side, unsynchronized append used to seed the queue. When no slot is
 * free the value is dropped and a diagnostic goes to std::cerr.
 */
template <typename TYPE, typename ALLOCATOR>
__host__ void AtomicWFQueue<TYPE, ALLOCATOR>::push(const TYPE& val) {
  const bool has_room = curr_size_ < size_;
  if (!has_room) {
    std::cerr << "AtomicWfQueue is full: " << curr_size_
              << " elements" << std::endl;
    return;
  }
  const int slot = tail_;
  queue_[slot] = val;
  // Advance the tail with wrap-around.
  tail_ = (slot + 1) % size_;
  curr_size_++;
}
/**
 * Rank of the calling lane among the currently active lanes of its
 * wavefront: the count of active lanes whose physical lane id is smaller
 * than the caller's. The lowest active lane gets rank 0.
 */
template <typename TYPE, typename ALLOCATOR>
__device__ unsigned int
AtomicWFQueue<TYPE, ALLOCATOR>::active_logical_lane_id() {
  // One bit per lane that is executing this call.
  const uint64_t active_lanes = __ballot(1);
  const uint64_t my_lane = __lane_id();
  // Bits [0, my_lane) set, everything from my_lane upwards cleared.
  const uint64_t every_bit = ~uint64_t{0};
  const uint64_t lanes_below_me = ~(every_bit << my_lane);
  const uint64_t active_below_me = active_lanes & lanes_below_me;
  return static_cast<unsigned int>(__popcll(active_below_me));
}
/**
 * @brief Hands `value` from the providing lane to every calling lane of the
 * same wavefront through a shared-memory slot.
 *
 * @param lowest_active True on the lane whose `value` is to be published.
 * @param value Value to publish (ignored on lanes passing false).
 *
 * @return The content of the wavefront's slot after the write.
 */
template <typename TYPE, typename ALLOCATOR>
__device__ TYPE AtomicWFQueue<TYPE, ALLOCATOR>::broadcast_lds(
bool lowest_active, TYPE value) {
/**
* Shared array to broadcast data within each wavefront, one slot per
* wavefront of the block.
* Max threads per block = 1024, wavefront size = 64 (in most GPUs)
* Maximum array size required = 1024/64 = 16
*/
constexpr size_t SIZE = 1024 / WF_SIZE;
__shared__ TYPE value_per_warp[SIZE];
// Slot index of the caller's wavefront.
// NOTE(review): presumes get_flat_block_id() is the thread's flat index
// within its block — confirm in util.hpp.
auto wavefront_id {get_flat_block_id() / WF_SIZE};
if (lowest_active) {
value_per_warp[wavefront_id] = value;
// Fence the slot write at block scope before the slot is read below.
__threadfence_block();
}
return value_per_warp[wavefront_id];
}
/**
 * @brief Appends `val` at the tail of the queue.
 *
 * Only the lowest active lane of the calling wavefront performs the
 * insertion; the other lanes return without touching the queue.
 *
 * @param val Value to append.
 */
template <typename TYPE, typename ALLOCATOR>
__device__ void AtomicWFQueue<TYPE, ALLOCATOR>::enqueue(const TYPE& val) {
unsigned int my_active_lane_id {active_logical_lane_id()};
bool is_lowest_active_lane {my_active_lane_id == 0};
if (is_lowest_active_lane) {
/**
* Prevents multiple wavefronts from simultaneously entering the enqueue
* operation. Ensures a first-come, first-serve execution order.
* The guard releases the mutex when this scope is left.
*/
TicketLockGuard<MutexType> guard(*enqueue_mutex_.get());
/**
* There should always be space available.
* If the queue is full, it indicates an unexpected issue.
*/
assert(!is_full());
// Tail advances with wrap-around at the queue capacity.
int next_tail = (tail_ + 1) % size_;
queue_[tail_] = val;
tail_ = next_tail;
// The element count is shared with dequeue(), which holds a different
// mutex, hence the atomic update; it is bumped only after the slot has
// been written.
atomic_add(&curr_size_, 1);
}
}
/**
 * @brief Removes the element at the head of the queue and distributes it to
 * the active lanes of the calling wavefront.
 *
 * The lowest active lane removes the element (spinning while the queue is
 * empty) and broadcasts it; each lane then adds its logical lane id to the
 * broadcast value before returning it.
 *
 * @return Dequeued value plus the caller's logical lane id.
 */
template <typename TYPE, typename ALLOCATOR>
__device__ TYPE AtomicWFQueue<TYPE, ALLOCATOR>::dequeue() {
TYPE ret_val {TYPE()};
unsigned int my_active_lane_id {active_logical_lane_id()};
bool is_lowest_active_lane {my_active_lane_id == 0};
if (is_lowest_active_lane) {
/**
* Prevents multiple wavefronts from simultaneously entering the dequeue
* operation. Ensures a first-come, first-serve execution order.
* The guard releases the mutex when this scope is left.
*/
TicketLockGuard<MutexType> guard(*dequeue_mutex_.get());
// queue is empty, wait until data is available
// (busy-wait while holding the dequeue mutex; enqueue() uses a separate
// mutex, so producers are not blocked by this wait)
while (is_empty()) {}
// Head advances with wrap-around at the queue capacity.
int next_head = (head_ + 1) % size_;
ret_val = queue_[head_];
head_ = next_head;
// The element count is shared with enqueue(), hence the atomic update.
atomic_sub(&curr_size_, 1);
}
// Every lane picks up the value removed by the lowest active lane.
ret_val = broadcast_lds(is_lowest_active_lane, ret_val);
// TYPE should support + operation
ret_val += my_active_lane_id;
return ret_val;
}
} // namespace rocshmem
@@ -0,0 +1,290 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_FREE_LIST_HPP_
#define LIBRARY_SRC_CONTAINERS_FREE_LIST_HPP_
#include <hip/hip_runtime.h>
#include "memory/hip_allocator.hpp"
#include "sync/abql_block_mutex.hpp"
namespace rocshmem {
// Forward declaration of the proxy.
template <typename ALLOCATOR, typename TYPE>
class FreeListProxy;
/*****************************************************************************
******************************* FREE LIST ***********************************
*****************************************************************************/
/**
 * @brief Singly linked FIFO list whose nodes are allocated on the host and
 * recycled on the device.
 *
 * Host code creates nodes through the allocator. Device code never calls the
 * allocator: device-side insertion reuses nodes from an internal list of
 * previously deallocated nodes, and device-side mutations are serialized by
 * a mutex.
 *
 * @tparam TYPE Type of the stored values.
 * @tparam ALLOC Allocator used for the nodes and the mutex.
 */
template <typename TYPE, typename ALLOC = HIPDefaultFinegrainedAllocator>
class FreeList {
  friend class FreeListProxy<ALLOC, TYPE>;

  using MutexProxyType = ABQLBlockMutexProxy<ALLOC>;
  using MutexType = ABQLBlockMutex;

  /**
   * @brief List node: the stored value and the link to the following node.
   */
  struct Node {
    TYPE data;
    Node* next{nullptr};
  };

  /**
   * @brief Result of pop_front(): `value` is meaningful only when `success`
   * is true.
   */
  struct PopBackResult {
    TYPE value;
    bool success;
  };

  /**
   * @brief A lock guard for ticket-based locks that follows the design of
   * `std::lock_guard`.
   *
   * @tparam MUTEX The type of the ticket-based mutex to lock.
   */
  template <typename MUTEX>
  struct TicketLockGuard {
    /**
     * @brief Constructs the `TicketLockGuard` and locks the mutex.
     *
     * The ticket returned by `lock()` is kept for the matching `unlock()`.
     *
     * @param m Mutex to take ownership of.
     */
    __device__ explicit TicketLockGuard(MUTEX& m) : mutex_{m} {
      ticket_ = mutex_.lock();
      __threadfence();
    }

    /**
     * @brief Lock guards are not copyable
     */
    __device__ TicketLockGuard(const TicketLockGuard&) = delete;

    /**
     * @brief Lock guards are not moveable
     */
    __device__ TicketLockGuard(TicketLockGuard&&) = delete;

    /**
     * @brief Destructor that unlocks the mutex.
     */
    __device__ ~TicketLockGuard() {
      __threadfence();
      mutex_.unlock(ticket_);
    }

   private:
    using TicketT = uint64_t;
    MUTEX& mutex_;
    TicketT ticket_;
  };

 public:
  /**
   * @brief Construct a new Free List object
   *
   * @param alloc Allocator to use for free list nodes allocations.
   */
  explicit FreeList(const ALLOC& alloc = ALLOC());

  /**
   * @brief Constructs a FreeList object with contents from a range of elements
   * defined by [`first`, `last`).
   *
   * @tparam InputIt Iterator type of the elements to store in the free-list.
   * @param first First element in the range defining the input elements.
   * @param last Element after last that defines the input elements range.
   * @param alloc Allocator to use for allocating internal structures of the
   * free list.
   */
  template <class InputIt>
  FreeList(InputIt first, InputIt last, const ALLOC& alloc = ALLOC());

  /**
   * @brief Pushes a range of elements defined by [`first`, `last`).
   *
   * Stops at the first element that cannot be inserted.
   *
   * @tparam InputIt Iterator type of the elements to store in the free-list.
   * @param first First element in the range defining the input elements.
   * @param last Element after last that defines the input elements range.
   *
   * @return @c true if success, or @c false otherwise.
   */
  template <class InputIt>
  __host__ bool push_back_range(InputIt first, InputIt last);

  /**
   * @brief Destroy the Free List object
   *
   * Does not release the nodes; see deallocate_all_nodes().
   */
  ~FreeList();

  /**
   * @brief Inserts new element at the end of the FreeList.
   *
   * The element goes into the container right after its last
   * element. The content of val is copied (or moved) to the inserted
   * element.
   *
   * @note Host-side API is not thread safe.
   *
   * @param val The value to insert in the FreeList.
   * @return @c true if the operation succeeded, and @c false otherwise.
   */
  __device__ bool push_back(const TYPE& val);

  /// @copydoc bool FreeList<TYPE, ALLOC>::push_back(const TYPE&)
  __device__ bool push_back(TYPE&& val);

  /// @copydoc bool FreeList<TYPE, ALLOC>::push_back(const TYPE&)
  __host__ bool push_back(const TYPE& val);

  /// @copydoc bool FreeList<TYPE, ALLOC>::push_back(const TYPE&)
  __host__ bool push_back(TYPE&& val);

  /**
   * @brief Removes the first element in FreeList, reducing its size by one.
   *
   * @return An object with two fields `value` and `success`. `success` is a
   * boolean indicating if the operation succeeded, and if the operation
   * succeeded, the `value` field contains the popped value.
   */
  __device__ PopBackResult pop_front();

 private:
  /**
   * @brief Deallocates all memory that was dynamically allocated using the free
   * list.
   */
  void deallocate_all_nodes();

  /**
   * @brief Host-side node allocation through the allocator.
   *
   * @return A pointer to the allocated node, or nullptr when the allocator
   * did not provide one.
   */
  __host__ Node* allocate_node() {
    // Start from nullptr so a failed allocation is reported as nullptr
    // instead of an indeterminate pointer.
    Node* node{nullptr};
    allocator_.allocate((void**)(&node), sizeof(Node));
    return node;
  }

  /**
   * @brief Device-side node allocation: takes the first node off the list of
   * recycled nodes.
   *
   * @note Assumes the data structure is protected by a lock.
   *
   * @return A pointer to the reused node, or nullptr when no recycled node is
   * available.
   */
  __device__ Node* allocate_node() {
    Node* node = deallocated_nodes_;
    if (node != nullptr) {
      deallocated_nodes_ = node->next;
    }
    return node;
  }

  /**
   * @brief Appends a node to the tail of the free list.
   *
   * @param node Node to append to the list.
   */
  __host__ __device__ void insert_node_at_tail(Node* node) {
    if (tail_ != nullptr) {
      tail_->next = node;
    }
    tail_ = node;
    // if the list is empty, set the head_ to the first node
    if (head_ == nullptr) {
      head_ = node;
    }
  }

  /**
   * @brief Device-side node deallocation that inserts the deallocated node into
   * a linked-list of pointers to reuse on the device.
   *
   * @note Assumes structure is protected by a lock.
   */
  __device__ void deallocate_node(Node* node) {
    // push the node onto the head of the recycled-node list
    node->next = deallocated_nodes_;
    deallocated_nodes_ = node;
  }

  /**
   * @brief Internal memory allocator used to create list nodes.
   *
   * Declared with the ALLOC template parameter so it matches the type the
   * constructors receive.
   */
  ALLOC allocator_{};

  /**
   * @brief First element in the list.
   */
  Node* head_{};

  /**
   * @brief Last element in the list.
   */
  Node* tail_{};

  /**
   * @brief A linked-list of deallocated nodes.
   */
  Node* deallocated_nodes_{};

  /**
   * @brief Mutex protecting the free-list mutations.
   */
  MutexProxyType mutex_;
};
/**
 * @brief Owns a FreeList that lives in memory obtained through a
 * DeviceProxy, so the same list object can be reached from host and device.
 *
 * The list is constructed in place inside the proxy storage; on destruction
 * its nodes are released and its destructor is run explicitly.
 *
 * @tparam ALLOCATOR Allocator handed to DeviceProxy and to the list.
 * @tparam TYPE Type of the values stored in the list.
 */
template <typename ALLOCATOR, typename TYPE>
class FreeListProxy {
  using FreeListT = FreeList<TYPE, ALLOCATOR>;
  using ProxyT = DeviceProxy<ALLOCATOR, FreeListT>;

 public:
  /**
   * @brief Pointer to the proxied list.
   */
  __host__ __device__ FreeListT* get() { return proxy_.get(); }

  /**
   * @brief Allocates proxy storage and constructs a list in it.
   *
   * NOTE(review): only the object at proxy_.get() is constructed (and later
   * destroyed), whatever num_elems is — confirm callers never rely on more
   * than one element.
   *
   * @param num_elems Element count forwarded to the DeviceProxy.
   */
  FreeListProxy(size_t num_elems = 1) : proxy_{num_elems} {
    new (proxy_.get()) FreeListT();
  }

  FreeListProxy(const FreeListProxy& other) = delete;

  FreeListProxy& operator=(const FreeListProxy& other) = delete;

  FreeListProxy(FreeListProxy&& other) = default;

  FreeListProxy& operator=(FreeListProxy&& other) = default;

  /**
   * @brief Releases every node of the list and runs the list's destructor.
   */
  ~FreeListProxy() {
    auto free_list = proxy_.get();
    // The class is movable, so this object may no longer refer to a list.
    // NOTE(review): assumes a moved-from DeviceProxy reports nullptr from
    // get() — confirm against DeviceProxy.
    if (free_list == nullptr) {
      return;
    }
    free_list->deallocate_all_nodes();
    free_list->~FreeList();
  }

 private:
  ProxyT proxy_{};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_FREE_LIST_HPP_
@@ -0,0 +1,143 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_FREE_LIST_IMPL_HPP_
#define LIBRARY_SRC_CONTAINERS_FREE_LIST_IMPL_HPP_
#include "free_list.hpp"
namespace rocshmem {
/*****************************************************************************
******************************* FREE LIST ***********************************
*****************************************************************************/
/**
 * @brief Destroys the free list object.
 *
 * The body is intentionally empty: no list nodes are released here.
 * FreeListProxy's destructor calls deallocate_all_nodes() before it invokes
 * this destructor.
 */
template <typename TYPE, typename ALLOC>
FreeList<TYPE, ALLOC>::~FreeList() {}
/**
 * @brief Hands every node owned by the list back to the allocator.
 *
 * Both chains are drained: first the live list reachable from head_, then
 * the chain of recycled nodes reachable from deallocated_nodes_. On return
 * head_, tail_ and deallocated_nodes_ are all null.
 */
template <typename TYPE, typename ALLOC>
void FreeList<TYPE, ALLOC>::deallocate_all_nodes() {
  // Drain the live list; head_ is advanced before the node is released.
  for (auto doomed = head_; doomed != nullptr; doomed = head_) {
    head_ = doomed->next;
    allocator_.deallocate(doomed);
  }

  // Nothing is left for the tail to refer to.
  tail_ = nullptr;

  // Drain the chain of recycled nodes in the same manner.
  for (auto doomed = deallocated_nodes_; doomed != nullptr;
       doomed = deallocated_nodes_) {
    deallocated_nodes_ = doomed->next;
    allocator_.deallocate(doomed);
  }
}
/**
 * @brief Builds an empty list whose allocator_ is initialized from @p alloc.
 *
 * The head, tail and recycled-node (deallocated_nodes_) pointers all start
 * out null, so the list holds no nodes until push_back() is called.
 *
 * @param[in] alloc Allocator used to initialize allocator_.
 */
template <typename TYPE, typename ALLOC>
FreeList<TYPE, ALLOC>::FreeList(const ALLOC& alloc)
: allocator_{alloc},
head_{nullptr},
tail_{nullptr},
deallocated_nodes_{nullptr} {}
/**
 * @brief Appends every element of [first, last) to the list, in order.
 *
 * Each element is copied out of the iterator and passed to push_back().
 * The walk stops at the first push_back() that fails; elements appended
 * before the failure stay in the list.
 *
 * @param[in] first Iterator to the first element to append.
 * @param[in] last  Iterator one past the final element to append.
 *
 * @return true if every element was appended, false as soon as one
 *         push_back() call reports failure.
 */
template <typename TYPE, typename ALLOC>
template <class InputIt>
bool FreeList<TYPE, ALLOC>::push_back_range(InputIt first, InputIt last) {
  InputIt cursor = first;
  while (cursor != last) {
    auto element = *cursor;
    if (!push_back(element)) {
      return false;
    }
    cursor++;
  }
  return true;
}
/**
 * @brief Device-side append of a copy of @p val at the tail of the list.
 *
 * A TicketLockGuard on the list mutex is held for the whole call.
 *
 * @param[in] val Value copied into the new node.
 *
 * @return true when a node was obtained and linked in, false when
 *         allocate_node() returned nullptr (the list is left untouched).
 */
template <typename TYPE, typename ALLOC>
__device__ bool FreeList<TYPE, ALLOC>::push_back(const TYPE& val) {
  TicketLockGuard<MutexType> lock(*mutex_.get());

  auto fresh = allocate_node();
  const bool allocated = (fresh != nullptr);
  if (allocated) {
    fresh->data = val;
    fresh->next = nullptr;
    insert_node_at_tail(fresh);
  }
  return allocated;
}
/**
 * @brief Device-side append taking an rvalue.
 *
 * The argument is viewed as a const lvalue and passed to the
 * push_back(const TYPE&) overload, so the value is copied, not moved.
 *
 * @param[in] val Value to append.
 *
 * @return Result of the push_back(const TYPE&) overload.
 */
template <typename TYPE, typename ALLOC>
__device__ bool FreeList<TYPE, ALLOC>::push_back(TYPE&& val) {
  const TYPE& as_const = val;
  return push_back(as_const);
}
/**
 * @brief Host-side append of a copy of @p val at the tail of the list.
 *
 * Unlike the device overload, this path takes no lock.
 *
 * @param[in] val Value copied into the new node.
 *
 * @return false when allocate_node() returned nullptr, true once the new
 *         node has been linked in at the tail.
 */
template <typename TYPE, typename ALLOC>
__host__ bool FreeList<TYPE, ALLOC>::push_back(const TYPE& val) {
  auto fresh = allocate_node();
  if (fresh != nullptr) {
    fresh->data = val;
    fresh->next = nullptr;
    insert_node_at_tail(fresh);
    return true;
  }
  return false;
}
/**
 * @brief Host-side append taking an rvalue.
 *
 * The argument is viewed as a const lvalue and passed to the
 * push_back(const TYPE&) overload, so the value is copied, not moved.
 *
 * @param[in] val Value to append.
 *
 * @return Result of the push_back(const TYPE&) overload.
 */
template <typename TYPE, typename ALLOC>
__host__ bool FreeList<TYPE, ALLOC>::push_back(TYPE&& val) {
  const TYPE& as_const = val;
  return push_back(as_const);
}
/**
 * @brief Device-side removal of the node at the head of the list.
 *
 * A TicketLockGuard on the list mutex is held for the whole call.
 *
 * @return A PopBackResult holding a copy of the removed node's data and
 *         true, or a value-initialized payload and false when the list is
 *         empty.
 */
template <typename TYPE, typename ALLOC>
__device__ typename FreeList<TYPE, ALLOC>::PopBackResult
FreeList<TYPE, ALLOC>::pop_front() {
  TicketLockGuard<MutexType> lock(*mutex_.get());

  auto front = head_;
  if (front == nullptr) {
    return {{}, false};
  }

  // Unlink the front node; an emptied list must not keep a stale tail.
  head_ = front->next;
  if (head_ == nullptr) {
    tail_ = nullptr;
  }

  // Copy the payload out before the node is handed back for reuse.
  TYPE payload{front->data};
  deallocate_node(front);
  return {payload, true};
}
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_FREE_LIST_IMPL_HPP_
@@ -0,0 +1,54 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_HELPER_MACROS_HPP_
#define LIBRARY_SRC_CONTAINERS_HELPER_MACROS_HPP_
#include <hip/hip_runtime.h>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include "rocshmem/rocshmem.hpp"
/* Barrier helper; forwards to rocshmem::rocshmem_wg_barrier_all(). */
#define BARRIER() rocshmem::rocshmem_wg_barrier_all()
/* Rank of the calling PE and total number of PEs. */
#define RANK rocshmem::rocshmem_my_pe()
#define NPES rocshmem::rocshmem_n_pes()
/* ceil(log2(NPES)); evaluates to 0 when NPES is 1. */
#define PE_BITS ((uint64_t)ceil(log(NPES) / log(2)))
/*
 * Top PE_BITS bits of X (X is presumably a 64-bit value, since the shift is
 * measured from bit 64 - confirm with callers). With a single PE, PE_BITS
 * is 0 and the shift count would be 64, which is undefined behaviour for a
 * 64-bit operand; that case yields 0 instead.
 */
#define PE_OF(X) ((PE_BITS) == 0 ? 0 : ((X) >> (64 - PE_BITS)))
/*
 * Output helpers that only emit on rank 0. They are written as
 * "if (not rank 0) {} else <stream>" so that an `else` following the macro
 * in caller code cannot bind to the macro's hidden `if`.
 */
#define _printf    \
  if (RANK != 0) { \
  } else           \
    printf
#define _cout      \
  if (RANK != 0) { \
  } else           \
    std::cout
#define _cerr      \
  if (RANK != 0) { \
  } else           \
    std::cerr
/* Binary size units: 2^30 and 2^20. */
#define GIBI 1073741824L
#define MEBI 1048576
#endif // LIBRARY_SRC_CONTAINERS_HELPER_MACROS_HPP_
@@ -0,0 +1,642 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_INDEX_STRATEGY_HPP_
#define LIBRARY_SRC_CONTAINERS_INDEX_STRATEGY_HPP_
/**
* @file index_strategy.hpp
*
* @section
* Q: What is an indexing strategy?
*
* A: The indexing strategy is a scheme used by __device__ code to access
* a container's raw memory region.
* The indexing strategy behaves like a simple forward iterator (in normal
* STL code).
*
* Q: What are the inputs and outputs of an indexing strategy?
*
* A: [INPUT] an indexing strategy needs block and grid information
* [INPUT] an indexing strategy needs to know the current index
* [INPUT] an indexing strategy needs to know number of container elements
* [OUTPUT] an indexing strategy returns an index
*
* Q: What are the names of the strategies?
*
* A: Thread_Contiguous_Block_Agnostic
* Thread_Discontiguous_Block_Discontiguous
* Thread_Discontiguous_Block_Contiguous
*
* Q: What is the Thread_Contiguous_Block_Agnostic strategy?
*
* A: Thread_Contiguous_Block_Agnostic keeps a thread's memory accesses
* contiguous. Only one thread may access the container; the container
* is private to the thread (thread-private).
*
* Q: What is the Thread_Discontiguous_Block_Discontiguous strategy?
*
* A: Thread_Discontiguous_Block_Discontiguous does not keep a thread
* block's memory accesses contiguous. The container memory is accessed
* by multiple thread blocks and the accesses by different thread blocks
* are interleaved.
*
* For example:
* assume grid_dim {8, 1, 1} block_dim {4, 1, 1}
* assume container._size = 72
*
* In table below, '#' denotes thread block #'s accesses.
* 0 1 2 3 4 5 6 7 8 9 10 11
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 1 | 3 | 3 | 3 | 3 | 4 | 4 | 4 | 4 | 5 | 5 | 5 | 5 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 2 | 6 | 6 | 6 | 6 | 7 | 7 | 7 | 7 | 0 | 0 | 0 | 0 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 3 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 4 | 4 | 4 | 4 | 4 | 5 | 5 | 5 | 5 | 6 | 6 | 6 | 6 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 5 | 7 | 7 | 7 | 7 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
*
* thread_id_00: grid{0, 0, 0} block{0, 0, 0}
* thread_id_00.2d_accesses := { {0,0},
* {2,8},
* {5,4} }
* thread_id_00.1d_accesses := { 0, 32, 64 }
*
* thread_id_14: grid{3, 0, 0} block{2, 0, 0}
* thread_id_14.2d_accesses := { {1,2},
* {3,10} }
* thread_id_14.1d_accesses := { 14, 46 }
*
* Q: What is the Thread_Discontiguous_Block_Contiguous strategy?
*
* A: Thread_Discontiguous_Block_Contiguous does keep a thread block's memory
* accesses contiguous. The container memory is accessed by multiple
* thread blocks and the accesses by different thread blocks are
* restricted by artificial boundaries called 'tiles'.
*
* For example:
* assume grid_dim {8, 1, 1} block_dim {4, 1, 1}
* assume container._size = 72
*
* In table below, '#' denotes thread block #'s accesses.
* 0 1 2 3 4 5 6 7 8 9 10 11
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 2 | 2 | 2 | 2 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 2 | 2 | 2 | 2 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 3 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4 | 5 | 5 | 5 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 4 | 5 | 5 | 5 | 5 | 5 | 5 | 6 | 6 | 6 | 6 | 6 | 6 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
* 5 | 6 | 6 | 6 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 | 7 |
* +---+---+---+---+---+---+---+---+---+---+---+---+
*
* Q: How do I use the strategies?
*
* A: You may use a strategy like this:
*
* for (size_t i = _dev_idx.start();
* i < _dev_idx.end();
* i = _dev_idx.next(i)) {
* container[i] = ...
* }
*/
#include <hip/hip_runtime.h>
#include <cassert>
#include <algorithm>
namespace rocshmem {
/**
 * @brief Derives flattened block and thread identifiers from the HIP launch
 * coordinates (hipBlockDim_*, hipGridDim_*, hipBlockIdx_*, hipThreadIdx_*).
 *
 * Every coordinate is widened to size_t before it takes part in a
 * multiplication, so the products cannot wrap around in 32-bit arithmetic.
 */
class Identity {
 public:
  /**
   * @brief Number of threads in one block.
   *
   * @return hipBlockDim_x * hipBlockDim_y * hipBlockDim_z
   */
  __device__ size_t block_size() const {
    const size_t dim_x = hipBlockDim_x;
    const size_t dim_y = hipBlockDim_y;
    const size_t dim_z = hipBlockDim_z;
    return dim_x * dim_y * dim_z;
  }

  /**
   * @brief Number of blocks in the grid.
   *
   * @return hipGridDim_x * hipGridDim_y * hipGridDim_z
   */
  __device__ size_t grid_size() const {
    const size_t dim_x = hipGridDim_x;
    const size_t dim_y = hipGridDim_y;
    const size_t dim_z = hipGridDim_z;
    return dim_x * dim_y * dim_z;
  }

  /**
   * @brief Flattened index of the calling thread's block within the grid
   * (x varies fastest, then y, then z).
   *
   * @return Block index in the range [0, grid_size()).
   */
  __device__ size_t block_id() const {
    const size_t idx_x = hipBlockIdx_x;
    const size_t idx_y = hipBlockIdx_y;
    const size_t idx_z = hipBlockIdx_z;
    const size_t dim_x = hipGridDim_x;
    const size_t dim_y = hipGridDim_y;
    return idx_x + idx_y * dim_x + idx_z * dim_x * dim_y;
  }

  /**
   * @brief Flattened index of the calling thread within its block
   * (x varies fastest, then y, then z).
   *
   * @return Thread index in the range [0, block_size()).
   */
  __device__ size_t local_thread_id() const {
    const size_t idx_x = hipThreadIdx_x;
    const size_t idx_y = hipThreadIdx_y;
    const size_t idx_z = hipThreadIdx_z;
    const size_t dim_x = hipBlockDim_x;
    const size_t dim_y = hipBlockDim_y;
    return idx_x + idx_y * dim_x + idx_z * dim_x * dim_y;
  }

  /**
   * @brief Flattened index of the calling thread within the whole grid.
   *
   * @return local_thread_id() + block_id() * block_size()
   */
  __device__ size_t global_thread_id() const {
    return local_thread_id() + block_id() * block_size();
  }
};
class Thread_Contiguous_Block_Agnostic {
public:
/**
* @brief
*/
__host__ __device__ Thread_Contiguous_Block_Agnostic() = default;
/**
* @brief
*
* @param container_elems
*/
__host__ __device__ Thread_Contiguous_Block_Agnostic(size_t container_elems)
: _container_elems(container_elems) {}
/**
* @brief
*
* @return
*/
__device__ size_t start() { return 0; }
/**
* @brief
*
* @return
*/
__device__ size_t end() {
assert(_container_elems);
return _container_elems;
}
/**
* @brief
*
* @param current_index
*
* @return
*/
__device__ size_t next(size_t current_index) { return current_index + 1; }
private:
/**
* @brief
*/
size_t _container_elems{0};
};
class Thread_Discontiguous_Block_Discontiguous {
public:
/**
* @brief
*/
__host__ __device__ Thread_Discontiguous_Block_Discontiguous() = default;
/**
* @brief
*
* @param container_elems
*/
__host__ __device__ explicit Thread_Discontiguous_Block_Discontiguous(
size_t container_elems)
: _container_elems(container_elems) {}
/**
* @brief
*
* @return
*/
__device__ size_t start() { return _id.global_thread_id(); }
/**
* @brief
*
* @return
*/
__device__ size_t end() {
assert(_container_elems);
return _container_elems;
}
/**
* @brief
*
* @param current_index
*
* @return
*/
__device__ size_t next(size_t current_index) {
return current_index + _id.grid_size() * _id.block_size();
}
private:
size_t _container_elems{0};
Identity _id{};
};
/**
 * @brief Index strategy that gives every block its own contiguous tile of
 * the container; the threads of a block stride through that tile.
 *
 * The container is split into grid_size() tiles. Each tile holds
 * container_elems / grid_size() elements, and the first
 * (container_elems % grid_size()) blocks receive one extra element.
 */
class Thread_Discontiguous_Block_Contiguous {
 public:
  /**
   * @brief Creates a strategy over zero elements with an empty tile.
   */
  __host__ __device__ Thread_Discontiguous_Block_Contiguous() = default;

  /**
   * @brief Computes the calling block's tile for a container of
   * @p container_elems elements.
   *
   * NOTE(review): this constructor is marked __host__ __device__ but reads
   * the grid size and block id through Identity, whose members are
   * __device__ only - confirm it is only ever executed on the device.
   *
   * @param container_elems Number of elements in the container.
   */
  __host__ __device__ explicit Thread_Discontiguous_Block_Contiguous(
      size_t container_elems)
      : _container_elems(container_elems) {
    const size_t blocks = _id.grid_size();
    const size_t this_block = _id.block_id();
    const size_t base_tile_elems = _container_elems / blocks;
    const size_t left_over = _container_elems % blocks;

    // Blocks ahead of this one that were given an extra element shift the
    // start of this block's tile by one element each.
    const size_t extra_before =
        (this_block < left_over) ? this_block : left_over;
    _tile_offset = this_block * base_tile_elems + extra_before;

    _tile_num_elems = base_tile_elems;
    if (this_block < left_over) {
      _tile_num_elems++;
    }
  }

  /**
   * @brief First index visited by the calling thread.
   *
   * Asserts that the element count is non-zero.
   *
   * @return Tile offset plus the thread's index within its block.
   */
  __device__ size_t start() {
    assert(_container_elems);
    const size_t lane_in_block = _id.local_thread_id();
    return _tile_offset + lane_in_block;
  }

  /**
   * @brief One-past-the-last index of the calling block's tile.
   *
   * Asserts that the element count is non-zero.
   *
   * @return Tile offset plus the number of elements in the tile.
   */
  __device__ size_t end() {
    assert(_container_elems);
    const size_t tile_end = _tile_offset + _tile_num_elems;
    return tile_end;
  }

  /**
   * @brief Index visited after @p current_index.
   *
   * Asserts that the element count is non-zero.
   *
   * @param current_index Index that was just visited.
   *
   * @return current_index + block_size()
   */
  __device__ size_t next(size_t current_index) {
    assert(_container_elems);
    const size_t stride = _id.block_size();
    return current_index + stride;
  }

 private:
  /** Number of elements in the container. */
  size_t _container_elems{0};
  /** Index of the first element of the calling block's tile. */
  size_t _tile_offset{0};
  /** Number of elements in the calling block's tile. */
  size_t _tile_num_elems{0};
  /** Source of block/thread coordinates. */
  Identity _id{};
};
class Thread_Discontiguous_Block_Private {
public:
/**
* @brief
*/
__host__ __device__ Thread_Discontiguous_Block_Private() = default;
/**
* @brief
*
* @param container_elems
*/
__host__ __device__ Thread_Discontiguous_Block_Private(size_t container_elems)
: _container_elems(container_elems) {}
/**
* @brief
*
* @return
*/
__device__ size_t start() { return _id.local_thread_id(); }
/**
* @brief
*
* @return
*/
__device__ size_t end() {
assert(_container_elems);
return _container_elems;
}
/**
* @brief
*
* @param current_index
*
* @return
*/
__device__ size_t next(size_t current_index) {
return current_index + _id.block_size();
}
private:
size_t _container_elems{0};
Identity _id{};
};
/**
 * @brief Matrix index strategy whose start() is the calling thread's global
 * thread id. Iteration is not supported: end() and next() assert.
 */
class Matrix_Private {
 public:
  /**
   * @brief Default constructor.
   */
  __host__ __device__ Matrix_Private() = default;

  /**
   * @brief Index selected for the calling thread.
   *
   * @return The calling thread's global thread id.
   */
  __device__ size_t start() {
    const size_t thread_index = _id.global_thread_id();
    return thread_index;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t end() {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @param current_index Ignored.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t next(size_t current_index) {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }

 private:
  /** Source of block/thread coordinates. */
  Identity _id{};
};
/**
 * @brief Matrix index strategy whose start() is the calling thread's block
 * id. Iteration is not supported: end() and next() assert.
 */
class Matrix_Block {
 public:
  /**
   * @brief Default constructor.
   */
  __host__ __device__ Matrix_Block() = default;

  /**
   * @brief Index selected for the calling thread's block.
   *
   * @return The flattened block id.
   */
  __device__ size_t start() {
    const size_t block_index = _id.block_id();
    return block_index;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t end() {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @param current_index Ignored.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t next(size_t current_index) {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }

 private:
  /** Source of block/thread coordinates. */
  Identity _id{};
};
/**
 * @brief Matrix index strategy whose start() is 0 for every thread.
 * Iteration is not supported: end() and next() assert.
 */
class Matrix_Device {
 public:
  /**
   * @brief Default constructor.
   */
  __host__ __device__ Matrix_Device() = default;

  /**
   * @brief Index selected for every thread.
   *
   * @return Always 0.
   */
  __device__ size_t start() {
    constexpr size_t shared_index = 0;
    return shared_index;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t end() {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }

  /**
   * @brief Unsupported for this strategy; always asserts.
   *
   * @param current_index Ignored.
   *
   * @return 0 when assertions are compiled out.
   */
  __device__ size_t next(size_t current_index) {
    constexpr size_t unsupported = 0;
    assert(false);
    return unsupported;
  }
};
/**
 * @brief Selects which concrete strategy an IndexStrategy object forwards
 * its start()/end()/next() calls to.
 */
enum class IndexStrategyEnum {
// Thread_Contiguous_Block_Agnostic
TCBA = 0,
// Thread_Discontiguous_Block_Discontiguous
TDBD = 1,
// Thread_Discontiguous_Block_Contiguous
TDBC = 2,
// Thread_Discontiguous_Block_Private
TDBP = 3,
// Matrix_Private
MP = 4,
// Matrix_Block
MB = 5,
// Matrix_Device
MD = 6,
// No strategy chosen; IndexStrategy asserts when asked to use it.
UNSET = 7
};
/**
 * @brief Run-time selectable index strategy.
 *
 * Holds one instance of every concrete strategy and forwards start(),
 * end() and next() to the one chosen by the IndexStrategyEnum supplied at
 * construction. Using an UNSET (or otherwise unrecognized) selector trips
 * an assertion and yields 0 when assertions are compiled out.
 */
class IndexStrategy {
 public:
  /**
   * @brief Creates a strategy with an UNSET selector.
   */
  __host__ __device__ IndexStrategy() = default;

  /**
   * @brief Selects a strategy; the element-count based strategies are left
   * default constructed.
   *
   * @param ise Strategy selector.
   */
  __host__ __device__ IndexStrategy(IndexStrategyEnum ise) : _ise(ise) {}

  /**
   * @brief Selects a strategy and passes the element count on to the
   * TCBA, TDBD, TDBC and TDBP strategies.
   *
   * @param ise             Strategy selector.
   * @param container_elems Number of elements in the container.
   */
  __host__ __device__ IndexStrategy(IndexStrategyEnum ise,
                                    size_t container_elems)
      : _ise(ise),
        _tcba(container_elems),
        _tdbd(container_elems),
        _tdbc(container_elems),
        _tdbp(container_elems) {}

  /**
   * @brief First index for the calling thread under the selected strategy.
   *
   * @return start() of the selected strategy.
   */
  __device__ size_t start() {
    switch (_ise) {
      case IndexStrategyEnum::TCBA:
        return _tcba.start();
      case IndexStrategyEnum::TDBD:
        return _tdbd.start();
      case IndexStrategyEnum::TDBC:
        return _tdbc.start();
      case IndexStrategyEnum::TDBP:
        return _tdbp.start();
      case IndexStrategyEnum::MP:
        return _mp.start();
      case IndexStrategyEnum::MB:
        return _mb.start();
      case IndexStrategyEnum::MD:
        return _md.start();
      case IndexStrategyEnum::UNSET:
        break;
    }
    // UNSET or an out-of-range selector: never fall off the end of the
    // function without returning a value.
    assert(false);
    return 0;
  }

  /**
   * @brief One-past-the-last index under the selected strategy.
   *
   * @return end() of the selected strategy.
   */
  __device__ size_t end() {
    switch (_ise) {
      case IndexStrategyEnum::TCBA:
        return _tcba.end();
      case IndexStrategyEnum::TDBD:
        return _tdbd.end();
      case IndexStrategyEnum::TDBC:
        return _tdbc.end();
      case IndexStrategyEnum::TDBP:
        return _tdbp.end();
      case IndexStrategyEnum::MP:
        return _mp.end();
      case IndexStrategyEnum::MB:
        return _mb.end();
      case IndexStrategyEnum::MD:
        return _md.end();
      case IndexStrategyEnum::UNSET:
        break;
    }
    // UNSET or an out-of-range selector: never fall off the end of the
    // function without returning a value.
    assert(false);
    return 0;
  }

  /**
   * @brief Index that follows @p current_index under the selected strategy.
   *
   * @param current_index Index that was just visited.
   *
   * @return next(current_index) of the selected strategy.
   */
  __device__ size_t next(size_t current_index) {
    switch (_ise) {
      case IndexStrategyEnum::TCBA:
        return _tcba.next(current_index);
      case IndexStrategyEnum::TDBD:
        return _tdbd.next(current_index);
      case IndexStrategyEnum::TDBC:
        return _tdbc.next(current_index);
      case IndexStrategyEnum::TDBP:
        return _tdbp.next(current_index);
      case IndexStrategyEnum::MP:
        return _mp.next(current_index);
      case IndexStrategyEnum::MB:
        return _mb.next(current_index);
      case IndexStrategyEnum::MD:
        return _md.next(current_index);
      case IndexStrategyEnum::UNSET:
        break;
    }
    // UNSET or an out-of-range selector: never fall off the end of the
    // function without returning a value.
    assert(false);
    return 0;
  }

 private:
  /** Selector naming the strategy that calls are forwarded to. */
  IndexStrategyEnum _ise{IndexStrategyEnum::UNSET};
  Thread_Contiguous_Block_Agnostic _tcba{};
  Thread_Discontiguous_Block_Discontiguous _tdbd{};
  Thread_Discontiguous_Block_Contiguous _tdbc{};
  Thread_Discontiguous_Block_Private _tdbp{};
  Matrix_Private _mp{};
  Matrix_Block _mb{};
  Matrix_Device _md{};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_INDEX_STRATEGY_HPP_
+169
Visa fil
@@ -0,0 +1,169 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_MATRIX_HPP_
#define LIBRARY_SRC_CONTAINERS_MATRIX_HPP_
#include "index_strategy.hpp"
#include "memory_allocator.hpp"
#include <vector>
#include <hip/hip_runtime.h>
namespace rocshmem {
/**
 * @brief Trait consulted by Matrix when it constructs its elements.
 *
 * The primary template reports false. When the value is true for a TYPE,
 * the Matrix constructors pass their allocator on to TYPE's constructor.
 */
template <typename T>
struct requires_internal_allocator {
  static constexpr bool value = false;
};
/**
 * @brief Trait consulted by Matrix when it constructs its elements.
 *
 * The primary template reports false. When the value is true for a TYPE,
 * the Matrix constructors that take a TYPE_constructor_param pass that
 * size_t value on to TYPE's constructor.
 */
template <typename T>
struct one_generic_constructor_parameter {
  static constexpr bool value = false;
};
/**
 * @brief Two-dimensional table of pointers to individually allocated TYPE
 * objects, stored as one flat row-major array.
 *
 * Construction and destruction are protected, so the class is meant to be
 * used through a derived type.
 */
template <typename TYPE>
class Matrix {
public:
/**
 * @brief Returns the element stored at (row_index, col_index).
 *
 * Both indices are checked with assert against the matrix dimensions.
 *
 * @param[in] row_index Row of the element.
 * @param[in] col_index Column of the element.
 *
 * @return Pointer to the element.
 */
__host__ __device__ TYPE* access(size_t row_index, size_t col_index);
/**
 * @brief Returns the element in column @p col_index of the row chosen by
 * the index strategy's start().
 *
 * @param[in] col_index Column of the element.
 *
 * @return Pointer to the element.
 */
__device__ TYPE* access(size_t col_index);
/**
 * @brief Returns the element in row 0 whose column is chosen by the index
 * strategy's start().
 *
 * @return Pointer to the element.
 */
__device__ TYPE* access();
/**
 * @brief Number of rows.
 *
 * @return Row count given at construction.
 */
__host__ __device__ size_t rows() const;
/**
 * @brief Number of columns.
 *
 * @return Column count given at construction.
 */
__host__ __device__ size_t columns() const;
protected:
/**
 * @brief Creates an empty matrix (zero rows and columns, no storage).
 */
Matrix() = default;
/**
 * @brief Allocates number_rows * number_columns elements.
 *
 * An element is constructed with (allocator, index_strat) only when
 * requires_internal_allocator is true and
 * one_generic_constructor_parameter is false for TYPE; otherwise its
 * storage is allocated but no constructor is run.
 *
 * @param[in] number_rows    Number of rows.
 * @param[in] number_columns Number of columns.
 * @param[in] allocator      Allocator for the pointer table and elements.
 * @param[in] index_strat    Index strategy used by the device accessors.
 */
Matrix(size_t number_rows, size_t number_columns, MemoryAllocator allocator,
const IndexStrategy index_strat);
/**
 * @brief Allocates number_rows * number_columns elements and constructs
 * each one with the same TYPE_constructor_param.
 *
 * Elements are only constructed when one_generic_constructor_parameter is
 * true for TYPE; the allocator is passed on as well when
 * requires_internal_allocator is also true.
 *
 * @param[in] number_rows            Number of rows.
 * @param[in] number_columns         Number of columns.
 * @param[in] TYPE_constructor_param Value passed to every TYPE constructor.
 * @param[in] allocator              Allocator for the table and elements.
 * @param[in] index_strat            Index strategy for device accessors.
 */
Matrix(size_t number_rows, size_t number_columns,
size_t TYPE_constructor_param, MemoryAllocator allocator,
const IndexStrategy index_strat);
/**
 * @brief Allocates number_rows * number_columns elements and constructs
 * element i with TYPE_constructor_param[i].
 *
 * The vector size is checked with assert against
 * number_rows * number_columns.
 *
 * @param[in] number_rows            Number of rows.
 * @param[in] number_columns         Number of columns.
 * @param[in] TYPE_constructor_param One constructor value per element.
 * @param[in] allocator              Allocator for the table and elements.
 * @param[in] index_strat            Index strategy for device accessors.
 */
Matrix(size_t number_rows, size_t number_columns,
std::vector<size_t> TYPE_constructor_param, MemoryAllocator allocator,
const IndexStrategy index_strat);
/**
 * @brief Runs ~TYPE() on every non-null element, then deallocates each
 * element and the pointer table.
 */
~Matrix();
private:
/**
 * @brief Number of rows.
 */
size_t _number_rows{0};
/**
 * @brief Number of columns.
 */
size_t _number_columns{0};
/**
 * @brief Allocator used for the pointer table and for every element.
 */
MemoryAllocator _allocator{};
/**
 * @brief Row-major table of _number_rows * _number_columns element
 * pointers; element (r, c) is at offset r * _number_columns + c.
 */
TYPE** _flat_c_array{nullptr};
/**
 * @brief Index strategy whose start() picks the row or column in the
 * device-only access() overloads.
 */
IndexStrategy _dev_idx{};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_MATRIX_HPP_
@@ -0,0 +1,213 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_MATRIX_IMPL_HPP_
#define LIBRARY_SRC_CONTAINERS_MATRIX_IMPL_HPP_
#include "matrix.hpp"
#include <vector>
#include <hip/hip_runtime.h>
#include <cassert>
namespace rocshmem {
/**
 * @brief Builds a number_rows x number_columns matrix without a
 * per-element constructor parameter.
 *
 * Storage is obtained for the pointer table and then for every element.
 * An element is constructed in place with (allocator, strategy) only when
 * TYPE requires an internal allocator and does not take a generic
 * constructor parameter; for every other TYPE the element storage is left
 * unconstructed.
 *
 * @param[in] number_rows    Number of rows.
 * @param[in] number_columns Number of columns.
 * @param[in] allocator      Allocator for the table and the elements.
 * @param[in] strategy       Index strategy used by the device accessors.
 */
template <typename TYPE>
Matrix<TYPE>::Matrix(size_t number_rows, size_t number_columns,
                     MemoryAllocator allocator, const IndexStrategy strategy)
    : _number_rows(number_rows),
      _number_columns(number_columns),
      _allocator(allocator),
      _dev_idx(strategy) {
  const size_t element_count = _number_rows * _number_columns;

  // Top-level table holding one TYPE pointer per matrix element.
  _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array),
                      element_count * sizeof(TYPE*));

  // Placement construction applies only to allocator-aware types that take
  // no additional generic constructor parameter.
  constexpr bool construct_with_allocator =
      requires_internal_allocator<TYPE>::value &&
      !one_generic_constructor_parameter<TYPE>::value;

  for (size_t slot = 0; slot != element_count; ++slot) {
    _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array[slot]),
                        sizeof(TYPE));
    if constexpr (construct_with_allocator) {
      new (_flat_c_array[slot]) TYPE(allocator, strategy);
    }
  }
}
/**
 * @brief Builds a number_rows x number_columns matrix whose elements all
 * receive the same constructor parameter.
 *
 * Storage is obtained for the pointer table and then for every element.
 * Elements are constructed in place only when TYPE takes a generic
 * constructor parameter; the allocator is passed along as well when TYPE
 * also requires an internal allocator.
 *
 * @param[in] number_rows            Number of rows.
 * @param[in] number_columns         Number of columns.
 * @param[in] TYPE_constructor_param Value passed to every TYPE constructor.
 * @param[in] allocator              Allocator for the table and elements.
 * @param[in] strategy               Index strategy for device accessors.
 */
template <typename TYPE>
Matrix<TYPE>::Matrix(size_t number_rows, size_t number_columns,
                     size_t TYPE_constructor_param, MemoryAllocator allocator,
                     const IndexStrategy strategy)
    : _number_rows(number_rows),
      _number_columns(number_columns),
      _allocator(allocator),
      _dev_idx(strategy) {
  const size_t element_count = _number_rows * _number_columns;

  // Top-level table holding one TYPE pointer per matrix element.
  _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array),
                      element_count * sizeof(TYPE*));

  for (size_t slot = 0; slot != element_count; ++slot) {
    _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array[slot]),
                        sizeof(TYPE));

    // Only types that accept the generic parameter are constructed here.
    if constexpr (one_generic_constructor_parameter<TYPE>::value) {
      if constexpr (requires_internal_allocator<TYPE>::value) {
        new (_flat_c_array[slot])
            TYPE(TYPE_constructor_param, allocator, strategy);
      } else {
        new (_flat_c_array[slot]) TYPE(TYPE_constructor_param, strategy);
      }
    }
  }
}
/**
 * @brief Builds a number_rows x number_columns matrix in which element i
 * is constructed with TYPE_constructor_param[i].
 *
 * Storage is obtained for the pointer table and then for every element.
 * Elements are constructed in place only when TYPE takes a generic
 * constructor parameter; the allocator is passed along as well when TYPE
 * also requires an internal allocator.
 *
 * @param[in] number_rows            Number of rows.
 * @param[in] number_columns         Number of columns.
 * @param[in] TYPE_constructor_param One constructor value per element; its
 *                                   size must equal
 *                                   number_rows * number_columns (asserted).
 * @param[in] allocator              Allocator for the table and elements.
 * @param[in] strategy               Index strategy for device accessors.
 */
template <typename TYPE>
Matrix<TYPE>::Matrix(size_t number_rows, size_t number_columns,
                     std::vector<size_t> TYPE_constructor_param,
                     MemoryAllocator allocator, const IndexStrategy strategy)
    : _number_rows(number_rows),
      _number_columns(number_columns),
      _allocator(allocator),
      // strategy is an IndexStrategy passed by value, so it is stored
      // directly, exactly as the other two constructors do.
      _dev_idx(strategy) {
  /*
   * Allocate the flattened c-array which contains the top-level
   * TYPE pointers.
   */
  size_t flat_c_array_dimensions{_number_rows * _number_columns};
  size_t size_bytes{flat_c_array_dimensions * sizeof(TYPE*)};
  _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array), size_bytes);

  /*
   * The parameter vector must hold exactly one entry per matrix element,
   * otherwise the loop below would index past its end.
   */
  assert(TYPE_constructor_param.size() == flat_c_array_dimensions);

  /*
   * Iterate through the flattened c-array and initialize each pointer
   * with a valid TYPE.
   */
  for (size_t i = 0; i < flat_c_array_dimensions; i++) {
    _allocator.allocate(reinterpret_cast<void**>(&_flat_c_array[i]),
                        sizeof(TYPE));

    /*
     * If type traits match, construct the TYPE with placement new.
     */
    if constexpr (one_generic_constructor_parameter<TYPE>::value &&
                  requires_internal_allocator<TYPE>::value) {
      new (_flat_c_array[i])
          TYPE(TYPE_constructor_param[i], allocator, strategy);
    } else if constexpr (one_generic_constructor_parameter<TYPE>::value) {
      new (_flat_c_array[i]) TYPE(TYPE_constructor_param[i], strategy);
    }
  }
}
/**
 * @brief Tears the matrix down.
 *
 * Does nothing when no pointer table was allocated. Otherwise every
 * non-null element has its destructor run and its storage returned to the
 * allocator, after which the pointer table itself is returned.
 */
template <typename TYPE>
Matrix<TYPE>::~Matrix() {
  if (!_flat_c_array) {
    return;
  }

  // Destroy and release each element that was allocated.
  const size_t element_count = _number_rows * _number_columns;
  for (size_t slot = 0; slot != element_count; ++slot) {
    if (!_flat_c_array[slot]) {
      continue;
    }
    _flat_c_array[slot]->~TYPE();
    _allocator.deallocate(_flat_c_array[slot]);
  }

  // Release the top-level pointer table.
  _allocator.deallocate(_flat_c_array);
}
/**
 * @brief Looks up the element at (row_index, col_index).
 *
 * Both indices are checked with assert against the matrix dimensions.
 *
 * @param[in] row_index Row of the element.
 * @param[in] col_index Column of the element.
 *
 * @return Pointer stored at row-major offset
 *         row_index * columns + col_index.
 */
template <typename TYPE>
__host__ __device__ TYPE* Matrix<TYPE>::access(size_t row_index,
                                               size_t col_index) {
  assert(row_index < _number_rows);
  assert(col_index < _number_columns);

  TYPE** row_begin = _flat_c_array + row_index * _number_columns;
  return row_begin[col_index];
}
/**
 * @brief Looks up the element in column @p col_index of the row selected
 * by the index strategy's start().
 *
 * The row and column are checked with assert against the matrix
 * dimensions.
 *
 * @param[in] col_index Column of the element.
 *
 * @return Pointer stored at row-major offset row * columns + col_index.
 */
template <typename TYPE>
__device__ TYPE* Matrix<TYPE>::access(size_t col_index) {
  const auto row_index = _dev_idx.start();
  assert(row_index < _number_rows);
  assert(col_index < _number_columns);

  TYPE** row_begin = _flat_c_array + row_index * _number_columns;
  return row_begin[col_index];
}
/**
 * @brief Looks up the element in row 0 whose column is selected by the
 * index strategy's start().
 *
 * The row and column are checked with assert against the matrix
 * dimensions.
 *
 * @return Pointer stored at row-major offset 0 * columns + column.
 */
template <typename TYPE>
__device__ TYPE* Matrix<TYPE>::access() {
  const size_t row_index = 0;
  const auto col_index = _dev_idx.start();
  assert(row_index < _number_rows);
  assert(col_index < _number_columns);

  TYPE** row_begin = _flat_c_array + row_index * _number_columns;
  return row_begin[col_index];
}
/**
 * @brief Reports the number of rows.
 *
 * @return Row count stored at construction.
 */
template <typename TYPE>
__host__ __device__ size_t Matrix<TYPE>::rows() const {
  const size_t row_count = this->_number_rows;
  return row_count;
}
/**
 * @brief Reports the number of columns.
 *
 * @return Column count stored at construction.
 */
template <typename TYPE>
__host__ __device__ size_t Matrix<TYPE>::columns() const {
  const size_t column_count = this->_number_columns;
  return column_count;
}
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_MATRIX_IMPL_HPP_
@@ -0,0 +1,230 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#include "share_strategy.hpp"
#include "constants.hpp"
#include <hip/hip_runtime.h>
namespace rocshmem {
/**
 * Return the hardware lane index of the calling thread.
 *
 * Same computation as amd_detail/device_functions.h::__lane_id(): the
 * mbcnt builtins are given an all-ones mask, the low-half count is used
 * as the starting value of the high-half count, and the combined count is
 * returned.
 */
__device__ uint64_t Block::lane_id() {
  const auto low_half_count = __builtin_amdgcn_mbcnt_lo(-1, 0);
  return __builtin_amdgcn_mbcnt_hi(-1, low_half_count);
}
/**
 * Count the lanes that are currently active.
 *
 * Step 1: __ballot(1) performs a roll call. Every active lane sets the
 *         bit at its own lane index; inactive lanes leave their bit at 0.
 *
 *         Example with lanes 2, 5 and 62 active out of 64 (0...63):
 *           index: 0 0 0 0 0 0 0 ... 6 6
 *                  0 1 2 3 4 5 6 ... 2 3
 *           value: [0 0 1 0 0 1 0 ... 1 0]
 *
 * Step 2: __popcll counts the '1' bits of that vector.
 *         For the example above the answer is 3.
 */
__device__ uint64_t Block::number_active_lanes() {
  const auto active_lanes = __ballot(1);
  return __popcll(active_lanes);
}
/**
 * Tell whether the calling lane is the first of the active lanes, i.e.
 * whether its rank among the active lanes is zero.
 */
__device__ bool Block::is_lowest_active_lane() {
  const uint64_t rank_among_active = active_logical_lane_id();
  return rank_among_active == 0;
}
/**
 * Return the hardware lane index of the lowest active lane.
 *
 * Step 1: __ballot(1) performs a roll call. Every active lane sets the
 *         bit at its own lane index; inactive lanes leave their bit at 0.
 *
 *         Example with lanes 2, 5 and 62 active out of 64 (0...63):
 *           index: 0 0 0 0 0 0 0 ... 6 6
 *                  0 1 2 3 4 5 6 ... 2 3
 *           value: [0 0 1 0 0 1 0 ... 1 0]
 *
 * Step 2: __ffsll reports the position of the least significant '1' bit,
 *         numbered from 1 ([1...64]).
 *
 * Step 3: subtracting 1 converts that position to the 0-based lane
 *         numbering ([0...63]). For the example above the answer is 2.
 */
__device__ uint64_t Block::lowest_active_lane() {
  const auto active_lanes = __ballot(1);
  const auto first_set_one_based = __ffsll(active_lanes);
  return first_set_one_based - 1;
}
/**
 * Return the rank of the calling lane among the active lanes, i.e. the
 * number of active lanes whose hardware lane index is lower than ours.
 *
 * Worked example: lanes 2, 5 and 62 are active out of 64 (0...63) and the
 * caller is hardware lane 5.
 *
 *                        index: 0 0 0 0 0 0 0 ... 6 6
 *                               0 1 2 3 4 5 6 ... 2 3
 *   ballot                    : [0 0 1 0 0 1 0 ... 1 0]
 *   all ones << lane          : [0 0 0 0 0 1 1 ... 1 1]
 *   ~(all ones << lane)       : [1 1 1 1 1 0 0 ... 0 0]
 *   ballot & the line above   : [0 0 1 0 0 0 0 ... 0 0]
 *   population count          : 1
 *
 * Resulting ranks for the three active lanes:
 *                               [- - 0 - - 1 - ... 2 -]
 */
__device__ uint64_t Block::active_logical_lane_id() {
  // Roll call: one bit per active lane, placed at that lane's index.
  const uint64_t active_lanes = __ballot(1);

  // 'Physical' means the real hardware lane index of this thread.
  const uint64_t physical_lane = lane_id();

  // Start from a full bitset, clear every bit below our lane by shifting
  // left, then invert so that only the bits strictly below our lane remain.
  const uint64_t every_lane = -1;
  const uint64_t lanes_below_me = ~(every_lane << physical_lane);

  // Keep only the active lanes that precede us; their count is our rank.
  return __popcll(active_lanes & lanes_below_me);
}
/**
 * Pull a non-zero value from lower lanes into the calling lane.
 *
 * For every distance in [0, WF_SIZE) the running value is read through
 * __shfl_up at that distance; whenever the value read is non-zero it
 * replaces the running value.
 *
 * NOTE(review): presumably used so that the one lane holding a non-zero
 * value hands it to every lane above it - confirm against callers. A
 * genuine value of 0 cannot be told apart from "no value" by this loop.
 *
 * @param fetch_value Value contributed by the calling lane.
 *
 * @return The running value after all WF_SIZE shuffle steps.
 */
__device__ uint64_t Block::broadcast_up(uint64_t fetch_value) {
  for (unsigned distance = 0; distance < WF_SIZE; distance++) {
    const uint64_t received = __shfl_up(fetch_value, distance);
    if (received != 0) {
      fetch_value = received;
    }
  }
  return fetch_value;
}
/**
 * Synchronize the threads that share state under the selected strategy.
 *
 *  - PRIVATE: nothing is shared, so this does nothing.
 *  - BLOCK:   issues __syncthreads().
 *  - DEVICE and UNUSED: calls abort().
 */
__device__ void ShareStrategy::syncthreads() const {
  switch (_sse) {
    case ShareStrategyEnum::PRIVATE:
      break;
    case ShareStrategyEnum::BLOCK:
      __syncthreads();
      break;
    case ShareStrategyEnum::DEVICE:
    case ShareStrategyEnum::UNUSED:
      abort();
      break;
  }
}
} // namespace rocshmem
@@ -0,0 +1,218 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef LIBRARY_SRC_CONTAINERS_SHARE_STRATEGY_HPP_
#define LIBRARY_SRC_CONTAINERS_SHARE_STRATEGY_HPP_
#include <hip/hip_runtime.h>

#include <cstdlib>

#include "index_strategy.hpp"
namespace rocshmem {
/**
 * @brief Placeholder for a host-side sharing scope.
 *
 * No fetch-and-increment is implemented for this scope.
 */
class Global {
 public:
  /**
   * @brief Unimplemented fetch-and-increment.
   *
   * The previous empty body let control run off the end of a function
   * that returns T, which is undefined behaviour as soon as the function
   * is called. The call now terminates the program explicitly, the same
   * way the unimplemented strategies in ShareStrategy do.
   *
   * @param out_ptr Counter that would be incremented (never accessed).
   *
   * @return Never returns normally; the trailing return only satisfies
   * the signature.
   */
  template <typename T>
  __host__ T fetch_incr(T *out_ptr) {
    (void)out_ptr;  // intentionally unused
    abort();
    return T{};
  }
};
/**
 * @brief Placeholder for a grid-wide sharing scope.
 *
 * No fetch-and-increment is implemented for this scope.
 */
class Grid {
 public:
  /**
   * @brief Unimplemented fetch-and-increment.
   *
   * The previous empty body let control run off the end of a function
   * that returns T, which is undefined behaviour as soon as the function
   * is called. The call now aborts explicitly, the same way the
   * unimplemented strategies in ShareStrategy do.
   *
   * @param out_ptr Counter that would be incremented (never accessed).
   *
   * @return Never returns normally; the trailing return only satisfies
   * the signature.
   */
  template <typename T>
  __device__ T fetch_incr(T *out_ptr) {
    (void)out_ptr;  // intentionally unused
    abort();
    return T{};
  }
};
/**
 * @brief Fetch-and-increment in which the active lanes cooperate so that
 * a single atomicAdd serves all of them.
 */
class Block {
 private:
  friend class BitwiseDeviceMethods;

 public:
  /**
   * @brief Reserve one slot per active lane with a single atomic add.
   *
   * The lowest active lane adds the number of active lanes to the
   * counter and keeps the value atomicAdd returned. That value is passed
   * through broadcast_up(), and each lane then adds its own rank among
   * the active lanes to it.
   *
   * @param out_ptr Counter to advance.
   *
   * @return The value after broadcast_up() plus the caller's rank among
   * the active lanes.
   */
  template <typename T>
  __device__ T fetch_incr(T *out_ptr) {
    const auto active_count = number_active_lanes();

    T base = 0;
    if (is_lowest_active_lane()) {
      // One atomic on behalf of every active lane.
      base = atomicAdd(out_ptr, active_count);
    }

    base = broadcast_up(base);
    return base + active_logical_lane_id();
  }

 private:
  /**
   * @brief Hardware lane index of the calling thread.
   *
   * @return The lane index computed from the mbcnt builtins.
   */
  __device__ uint64_t lane_id();

  /**
   * @brief Number of lanes that are currently active.
   *
   * @return Population count of the active-lane ballot.
   */
  __device__ uint64_t number_active_lanes();

  /**
   * @brief Whether the caller is the first of the active lanes.
   *
   * @return true when active_logical_lane_id() is 0.
   */
  __device__ bool is_lowest_active_lane();

  /**
   * @brief Hardware lane index of the lowest active lane.
   *
   * @return 0-based index of the least significant set bit of the
   * active-lane ballot.
   */
  __device__ uint64_t lowest_active_lane();

  /**
   * @brief Rank of the caller among the active lanes.
   *
   * @return Number of active lanes with a lower hardware lane index.
   */
  __device__ uint64_t active_logical_lane_id();

  /**
   * @brief Pull a non-zero value from lower lanes into the caller.
   *
   * @param fetch_value Value contributed by the calling lane.
   *
   * @return The value held after shuffling up over every distance in
   * [0, WF_SIZE).
   */
  __device__ uint64_t broadcast_up(uint64_t fetch_value);
};
/**
 * @brief Fetch-and-increment for a counter that the caller does not share;
 * a plain read followed by a plain write, with no atomics.
 */
class Private {
 public:
  /**
   * @brief Read the counter, store the value plus one, hand back the old
   * value.
   *
   * @param out_ptr Counter to advance.
   *
   * @return The value the counter held before the increment.
   */
  template <typename T>
  __device__ T fetch_incr(T *out_ptr) {
    auto previous = *out_ptr;
    *out_ptr = previous + 1;
    return previous;
  }
};
/**
 * @brief Selects how ShareStrategy shares its counter among threads.
 */
enum class ShareStrategyEnum {
  PRIVATE = 0,  // counter is not shared; no barrier is issued
  BLOCK = 1,    // lanes cooperate on the counter; barrier is __syncthreads()
  DEVICE = 2,   // ShareStrategy aborts when asked to use this value
  UNUSED = 3    // value of a default-constructed ShareStrategy; aborts on use
};
/**
 * @brief Runtime-selected policy that forwards to the Private or Block
 * implementation according to a ShareStrategyEnum value.
 *
 * A default-constructed object holds ShareStrategyEnum::UNUSED, for which
 * every device operation aborts.
 */
class ShareStrategy {
 public:
  __host__ __device__ ShareStrategy() = default;

  /**
   * @brief Build a strategy object for the given mode.
   *
   * @param sse Mode to dispatch on.
   */
  __host__ __device__ ShareStrategy(ShareStrategyEnum sse) : _sse(sse) {}

  /**
   * @brief Barrier matching the selected mode.
   *
   * Does nothing for PRIVATE, issues __syncthreads() for BLOCK and
   * aborts for DEVICE and UNUSED.
   */
  __device__ void syncthreads() const;

  /**
   * @brief Fetch-and-increment dispatched on the selected mode.
   *
   * @param out_ptr Counter to advance.
   * @param my_pe   PE index this call is made for.
   * @param num_pes Upper bound (exclusive) of the PE indices visited.
   *
   * @return PRIVATE: the value from Private::fetch_incr. BLOCK: the value
   * from Block::fetch_incr, or 0 when my_pe is not below num_pes. Every
   * other mode aborts.
   */
  template <typename T>
  __device__ T fetch_incr(T *out_ptr, size_t my_pe, size_t num_pes) {
    T value = 0;
    switch (_sse) {
      case ShareStrategyEnum::PRIVATE:
        return _private.fetch_incr(out_ptr);
      case ShareStrategyEnum::BLOCK:
        // NOTE(review): the loop looks redundant for a single thread, but
        // it presumably ensures that only lanes with the same my_pe are
        // active together inside Block::fetch_incr, which issues one
        // atomicAdd through the lowest active lane's out_ptr. Keep the
        // loop shape unless that assumption is confirmed to be wrong.
        for (size_t i = 0; i < num_pes; i++) {
          if (i == my_pe) {
            value = _block.fetch_incr(out_ptr);
          }
        }
        return value;
      case ShareStrategyEnum::DEVICE:
        abort();
        return 0;
      case ShareStrategyEnum::UNUSED:
        abort();
        return 0;
    }
    // _sse holds a value that is not an enumerator. Fail explicitly rather
    // than run off the end of a function that must return a value.
    abort();
    return 0;
  }

  /**
   * @brief Whether the selected mode uses a designated send thread.
   *
   * @return false for PRIVATE, true for BLOCK. Every other mode aborts.
   */
  __device__ bool uses_designated_send_thread() {
    switch (_sse) {
      case ShareStrategyEnum::PRIVATE:
        return false;
      case ShareStrategyEnum::BLOCK:
        return true;
      case ShareStrategyEnum::DEVICE:
        abort();
        return false;
      case ShareStrategyEnum::UNUSED:
        abort();
        return false;
    }
    // _sse holds a value that is not an enumerator. Fail explicitly rather
    // than run off the end of a function that must return a value.
    abort();
    return false;
  }

 private:
  Private _private{};
  Block _block{};
  ShareStrategyEnum _sse{ShareStrategyEnum::UNUSED};
};
} // namespace rocshmem
#endif // LIBRARY_SRC_CONTAINERS_SHARE_STRATEGY_HPP_

Some files were not shown because too many files have changed in this diff Show More