From 2b75fe7bf9bb5bcacbfc9c0c100250fcd948d468 Mon Sep 17 00:00:00 2001 From: Yiltan Date: Thu, 25 Sep 2025 10:24:59 -0400 Subject: [PATCH] Improve qp mapping (#259) Co-authored-by: Aurelien Bouteiller [ROCm/rocshmem commit: 7ebf03fe2f471f2babe9a4d69c799fc62b0fde3f] --- projects/rocshmem/README.md | 8 +++- projects/rocshmem/docs/compile_and_run.rst | 11 +++-- projects/rocshmem/src/gda/backend_gda.cpp | 55 ++++++++++++++++++++++ projects/rocshmem/src/gda/backend_gda.hpp | 6 +++ 4 files changed, 74 insertions(+), 6 deletions(-) diff --git a/projects/rocshmem/README.md b/projects/rocshmem/README.md index a104b21b63..4b8126c725 100644 --- a/projects/rocshmem/README.md +++ b/projects/rocshmem/README.md @@ -125,10 +125,14 @@ rocSHMEM has the following enviroment variables: Disables IPC support for the reverse offload backend. ROCSHMEM_MAX_NUM_CONTEXTS (default : 1024) - Maximum number of contexts used in library + Maximum number of contexts used in library. ROCSHMEM_MAX_NUM_TEAMS (default : 40) - Maximum number of teams supported by the library + Maximum number of teams supported by the library. + + ROCSHMEM_GDA_ALTERNATE_QP_PORTS (default : 1) + Enables/Disables having QPs alternate their mappings + across rocSHMEM contexts. ``` ## Examples diff --git a/projects/rocshmem/docs/compile_and_run.rst b/projects/rocshmem/docs/compile_and_run.rst index c4f98e6220..3d78c3f556 100644 --- a/projects/rocshmem/docs/compile_and_run.rst +++ b/projects/rocshmem/docs/compile_and_run.rst @@ -15,10 +15,10 @@ Compiling and linking with rocSHMEM rocSHMEM is a library that can be statically linked to your application during compilation with ``hipcc``. For more information, see :doc:`HIPCC `. -When compiling your application with ``hipcc``, you must include the rocSHMEM header files and the rocSHMEM library. +When compiling your application with ``hipcc``, you must include the rocSHMEM header files and the rocSHMEM library. 
Because rocSHMEM depends on MPI (Message Passing Interface), you must manually add the arguments for MPI linkage instead of using ``mpicc``. -When using ``hipcc`` directly without a build system, it's recommended to perform the compilation and linking steps separately. +When using ``hipcc`` directly without a build system, it's recommended to perform the compilation and linking steps separately. Example compile and link commands are provided at the top of the example files in the ``examples`` directory: @@ -36,13 +36,13 @@ Example compile and link commands are provided at the top of the example files i $OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \ -L/opt/rocm/lib -lamdhip64 -lhsa-runtime64 -If your project uses CMake, see +If your project uses CMake, see `Using CMake with AMD ROCm `_. Running a rocSHMEM application -------------------------- -Applications using rocSHMEM typically deploy multiple processes, usually one per GPU. +Applications using rocSHMEM typically deploy multiple processes, usually one per GPU. The MPI launcher, for example, ``mpiexec`` with Open MPI, is used to start the required number of processes. For example, to launch two ``getmem`` example processes (available when compiled from source): @@ -87,3 +87,6 @@ You can control the behavior of rocSHMEM by using the following environment vari * - ROCSHMEM_RO_DISABLE_IPC - 0 - Defines whether to force using the RO conduit even when IPC is available. + * - ROCSHMEM_GDA_ALTERNATE_QP_PORTS + - 1 + - Enables/Disables having QPs alternate their mappings across rocSHMEM contexts. This helps saturate bandwidth on multiport bonded interfaces. 
diff --git a/projects/rocshmem/src/gda/backend_gda.cpp b/projects/rocshmem/src/gda/backend_gda.cpp index 17c152c1fb..afd44127b6 100644 --- a/projects/rocshmem/src/gda/backend_gda.cpp +++ b/projects/rocshmem/src/gda/backend_gda.cpp @@ -144,6 +144,10 @@ void GDABackend::read_env() { if ((value = getenv("ROCSHMEM_SQ_SIZE"))) { sq_size = atoi(value); } + + if ((value = getenv("ROCSHMEM_GDA_ALTERNATE_QP_PORTS"))) { + alternate_qp_ports_enabled = atoi(value); + } } void GDABackend::setup_ipc() { @@ -938,6 +942,57 @@ void GDABackend::create_queues() { create_cqs(ncqes); create_qps(sq_size); } + + alternate_qp_ports(); +} + +void GDABackend::alternate_qp_ports() { + int cur_qp_idx; + int new_qp_idx; + + /* We can't remap anything */ + if (maximum_num_contexts_ == 1) { + return; + } + + if (alternate_qp_ports_enabled) { + /* If we assume two PEs and a default context and two user contexts, + * initially QPs are in the following port order: + * + * Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 | + * QPs :| QP0 | QP1 | QP2 | QP3 | QP4 | QP5 | + * Port :| 0 | 1 | 0 | 1 | 0 | 1 | + * + * This creates the pattern where PE1 is always mapped to port 1 but we want it + * to use both ports to maximize throughput/bandwidth. + * + * So we reorder our QPs + * + * Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 | + * QPs :| QP0 | QP1 | QP3 | QP2 | QP4 | QP5 | + * Port :| 0 | 1 | 1 | 0 | 0 | 1 | + * + * We alternate the ports [0,1] and [1,0] for each context. 
+ * Therefore, when we use two contexts we use both ports + * + */ + + /* Re-map every other context, starting at index 1 (i advances by 2) */ + for (int i = 1; i < (maximum_num_contexts_ + 1); i+=2) { + for (int p = 0; p < num_pes; p+=2) { + cur_qp_idx = (i * num_pes) + p; + new_qp_idx = cur_qp_idx + 1; + + if (new_qp_idx < qps.size()) { + // Swap adjacent QP/CQ entries. NOTE(review): with an odd num_pes the last pair reaches into the first slot of the next context; confirm this is intended. + std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]); + std::swap(qps[cur_qp_idx], qps[new_qp_idx]); + std::swap(bnxt_cqs[cur_qp_idx], bnxt_cqs[new_qp_idx]); + std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]); + } + } + } + } } void* GDABackend::pd_alloc_device_uncached(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) { diff --git a/projects/rocshmem/src/gda/backend_gda.hpp b/projects/rocshmem/src/gda/backend_gda.hpp index aa8b42eccb..0401b4df47 100644 --- a/projects/rocshmem/src/gda/backend_gda.hpp +++ b/projects/rocshmem/src/gda/backend_gda.hpp @@ -127,6 +127,7 @@ class GDABackend : public Backend { std::vector qps; std::vector cqs; std::vector dest_info; + int alternate_qp_ports_enabled = 1;; /* GDA_BNXT START */ std::vector bnxt_qps; @@ -376,6 +377,11 @@ class GDABackend : public Backend { void create_qps(int sq_length); void bnxt_create_qps(int sq_length); + /** + * @brief Reorders QPs so that we map rocSHMEM contexts to the correct QPs + */ + void alternate_qp_ports(); + /** * @brief Exchange QP information for connection */