Improve qp mapping (#259)

Co-authored-by: Aurelien Bouteiller <aurelien.bouteiller@amd.com>

[ROCm/rocshmem commit: 7ebf03fe2f]
This commit is contained in:
Yiltan
2025-09-25 10:24:59 -04:00
committed by GitHub
parent f326e466c9
commit 2b75fe7bf9
4 changed files with 74 additions and 6 deletions
+6 -2
View File
@@ -125,10 +125,14 @@ rocSHMEM has the following environment variables:
Disables IPC support for the reverse offload backend.
ROCSHMEM_MAX_NUM_CONTEXTS (default : 1024)
Maximum number of contexts used in library
Maximum number of contexts used in library.
ROCSHMEM_MAX_NUM_TEAMS (default : 40)
Maximum number of teams supported by the library
Maximum number of teams supported by the library.
ROCSHMEM_GDA_ALTERNATE_QP_PORTS (default : 1)
Enables/Disables having QPs alternate their mappings
across rocSHMEM contexts.
```
## Examples
+7 -4
View File
@@ -15,10 +15,10 @@ Compiling and linking with rocSHMEM
rocSHMEM is a library that can be statically linked to your application during compilation with ``hipcc``. For more information, see :doc:`HIPCC <hipcc:index>`.
When compiling your application with ``hipcc``, you must include the rocSHMEM header files and the rocSHMEM library.
When compiling your application with ``hipcc``, you must include the rocSHMEM header files and the rocSHMEM library.
Because rocSHMEM depends on MPI (Message Passing Interface), you must manually add the arguments for MPI linkage instead of using ``mpicc``.
When using ``hipcc`` directly without a build system, it's recommended to perform the compilation and linking steps separately.
When using ``hipcc`` directly without a build system, it's recommended to perform the compilation and linking steps separately.
Example compile and link commands are provided at the top of the example files in the ``examples`` directory:
@@ -36,13 +36,13 @@ Example compile and link commands are provided at the top of the example files i
$OPENMPI_UCX_INSTALL_DIR/lib/libmpi.so \
-L/opt/rocm/lib -lamdhip64 -lhsa-runtime64
If your project uses CMake, see
If your project uses CMake, see
`Using CMake with AMD ROCm <https://rocmdocs.amd.com/en/latest/conceptual/cmake-packages.html>`_.
Running a rocSHMEM application
------------------------------
Applications using rocSHMEM typically deploy multiple processes, usually one per GPU.
Applications using rocSHMEM typically deploy multiple processes, usually one per GPU.
The MPI launcher, for example, ``mpiexec`` with Open MPI, is used to start the required number
of processes. For example, to launch two ``getmem`` example processes (available when compiled from source):
@@ -87,3 +87,6 @@ You can control the behavior of rocSHMEM by using the following environment vari
* - ROCSHMEM_RO_DISABLE_IPC
- 0
- Defines whether to force using the RO conduit even when IPC is available.
* - ROCSHMEM_GDA_ALTERNATE_QP_PORTS
- 1
- Enables/Disables having QPs alternate their mappings across rocSHMEM contexts. This helps saturate bandwidth on multiport bonded interfaces.
+55
View File
@@ -144,6 +144,10 @@ void GDABackend::read_env() {
if ((value = getenv("ROCSHMEM_SQ_SIZE"))) {
sq_size = atoi(value);
}
if ((value = getenv("ROCSHMEM_GDA_ALTERNATE_QP_PORTS"))) {
alternate_qp_ports_enabled = atoi(value);
}
}
void GDABackend::setup_ipc() {
@@ -938,6 +942,57 @@ void GDABackend::create_queues() {
create_cqs(ncqes);
create_qps(sq_size);
}
alternate_qp_ports();
}
void GDABackend::alternate_qp_ports() {
int cur_qp_idx;
int new_qp_idx;
/* We can't remap anything */
if (maximum_num_contexts_ == 1) {
return;
}
if (alternate_qp_ports_enabled) {
/* If we assume two PEs and a default context and two user context,
* initially QPs are in the following port order:
*
* Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 |
* QPs :| QP0 | QP1 | QP2 | QP3 | QP4 | QP5 |
* Port :| 0 | 1 | 0 | 1 | 0 | 1 |
*
* This creates the pattern where PE1 is always mapped to port 0 but we want it
* to use both ports to maximize throughput/bandwidth.
*
* So we reorder our QPs
*
* Labels :| DCTX PE0 | DCTX PE1 | CTX0 PE0 | CTX0 PE1 | CTX1 PE0 | CTX1 PE1 |
* QPs :| QP0 | QP1 | QP2 | QP4 | QP3 | QP5 |
* Port :| 0 | 1 | 1 | 0 | 0 | 1 |
*
* We alternate the ports [0,1] and [1,0] for each context.
* Therefore, when we use two contexts we use both ports
*
*/
/* Re-Map each context */
for (int i = 1; i < (maximum_num_contexts_ + 1); i+=2) {
for (int p = 0; p < num_pes; p+=2) {
cur_qp_idx = (i * num_pes) + p;
new_qp_idx = cur_qp_idx + 1;
if (new_qp_idx < qps.size()) {
// Swap QPs
std::swap(cqs[cur_qp_idx], cqs[new_qp_idx]);
std::swap(qps[cur_qp_idx], qps[new_qp_idx]);
std::swap(bnxt_cqs[cur_qp_idx], bnxt_cqs[new_qp_idx]);
std::swap(bnxt_qps[cur_qp_idx], bnxt_qps[new_qp_idx]);
}
}
}
}
}
void* GDABackend::pd_alloc_device_uncached(struct ibv_pd* pd, void* pd_context, size_t size, size_t alignment, uint64_t resource_type) {
@@ -127,6 +127,7 @@ class GDABackend : public Backend {
std::vector<ibv_qp*> qps;
std::vector<ibv_cq*> cqs;
std::vector<dest_info_t> dest_info;
/* Non-zero lets alternate_qp_ports() swap QP slots; read_env() overwrites it
 * from ROCSHMEM_GDA_ALTERNATE_QP_PORTS when that variable is set. */
int alternate_qp_ports_enabled = 1;
/* GDA_BNXT START */
std::vector<struct bnxt_host_qp> bnxt_qps;
@@ -376,6 +377,11 @@ class GDABackend : public Backend {
void create_qps(int sq_length);
void bnxt_create_qps(int sq_length);
/**
 * @brief Reorders QPs so that we map rocSHMEM contexts to the correct QPs
 *
 * Swaps neighbouring entries of the QP/CQ containers (qps, cqs, bnxt_qps,
 * bnxt_cqs) for every other context slot. It is a no-op when
 * maximum_num_contexts_ is 1 or when alternate_qp_ports_enabled is 0.
 */
void alternate_qp_ports();
/**
* @brief Exchange QP information for connection
*/