add support for GPUs using wavefront size of 32 (#285)

* add gfx1100 support Add support for Radeon 7900 GPUs (RX and PRO), and 7800 PRO. I was contemplating to add gfx1101 and gfx1102 GPUs as well, but those are the lower end models that are more unlikely to be used for compute intensive jobs. In addition, I do not have access to them to test the support. * update WF_SIZe for different options Radeon systems use a WarpSize of 32, unlike current Instinct systems, which use a warp size of 64. For the device side, a gfx specific ifdef is sufficient. For the host side, we need to query the device properties. * adjust functional tests to wf_size of 32 * update unit tests to handle wf_size of 32 * address reviewer comments
2025-10-22 16:04:58 -05:00
commit d0c2845031
@@ -108,6 +108,7 @@ include(cmake/rocm_local_targets.cmake)
 set(DEFAULT_GPUS
    gfx90a:xnack-;
    gfx90a:xnack+;
+    gfx1100;
    gfx942)

 if(${ROCM_MAJOR_VERSION} GREATER 6)
@@ -118,6 +118,15 @@ ExecTest() {
  NUM_THREADS=$4
  MAX_MSG_SIZE=$5

+  if command -v amd-smi >/dev/null
+  then
+      NUM_GPUS=$(amd-smi list | grep GPU | wc -l)
+  elif command -v rocm-smi >/dev/null
+  then
+      NUM_GPUS=$(rocm-smi --showserial | grep GPU | wc -l)
+  else
+      NUM_GPUS=64
+  fi
  TIMEOUT=$((5 * 60)) # Timeout in seconds

  TEST_NUM=${TEST_NUMBERS[$TEST_NAME]}
@@ -159,9 +168,13 @@ ExecTest() {
  CMD+=" >> $LOG_DIR/$TEST_LOG_NAME.log 2>&1"

  # Run Test
-  echo $TEST_LOG_NAME
-  echo "# $CMD" >"$LOG_DIR/$TEST_LOG_NAME.log"
-  eval $CMD
+  if [ $NUM_RANKS -le $NUM_GPUS ] && [[ "" == "$HOSTFILE" ]]; then
+    echo $TEST_LOG_NAME
+    echo "# $CMD" >"$LOG_DIR/$TEST_LOG_NAME.log"
+    eval $CMD
+  else
+    echo "Skipping test $TEST_LOG_NAME"
+  fi

  # Validate Test
  if [ $? -ne 0 ]
@@ -46,7 +46,7 @@ __device__ __forceinline__ int uncached_load_ubyte(uint8_t* src) {
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
  asm volatile(
      "global_load_ubyte %0 %1 off glc slc \n"
      "s_waitcnt vmcnt(0)"
@@ -69,7 +69,7 @@ __device__ __forceinline__ void refresh_volatile_sbyte(volatile int *assigned_va
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
  asm volatile(
    "global_load_sbyte %0 %1 off glc slc\n "
    "s_waitcnt vmcnt(0)"
@@ -91,7 +91,7 @@ __device__ __forceinline__ void refresh_volatile_dwordx2(volatile uint64_t *assi
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
  asm volatile(
    "global_load_dwordx2 %0 %1 off glc slc\n "
    "s_waitcnt vmcnt(0)"
@@ -122,7 +122,7 @@ NOWARN(-Wdeprecated-volatile,
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
        asm volatile(
            "global_load_dword %0 %1 off glc slc \n"
            "s_waitcnt vmcnt(0)"
@@ -142,7 +142,7 @@ NOWARN(-Wdeprecated-volatile,
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
        asm volatile(
            "global_load_dwordx2 %0 %1 off glc slc \n"
            "s_waitcnt vmcnt(0)"
@@ -191,7 +191,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
      asm volatile("flat_store_short %0 %1 glc slc" : : "v"(dst), "v"(val16));
 #endif
 #if defined(__gfx942__) || defined(__gfx950__)
@@ -205,7 +205,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
      asm volatile("flat_store_dword %0 %1 glc slc" : : "v"(dst), "v"(val32));
 #endif
 #if defined(__gfx942__) || defined(__gfx950__)
@@ -219,7 +219,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
 #endif
 #if defined(__gfx908__)
 #endif
-#if defined(__gfx90a__)
+#if defined(__gfx90a__) || defined (__gfx1100__)
      asm volatile("flat_store_dwordx2 %0 %1 glc slc" : : "v"(dst), "v"(val64));
 #endif
 #if defined(__gfx942__) || defined(__gfx950__)
@@ -57,7 +57,11 @@ inline const unsigned MAX_WG_SIZE{1024};
 *
 * @note Wavefront size on most systems is either 32 or 64.
 */
+#if defined(__gfx90a__) || defined(__gfx942__) || defined (__gfx950__)
 inline const int WF_SIZE{64};
+#else
+inline const int WF_SIZE{32};
+#endif

 }  // namespace rocshmem

@@ -28,6 +28,7 @@

 #include "backend_gda.hpp"
 #include "constants.hpp"
+#include "util.hpp"

 namespace rocshmem {

@@ -59,7 +60,10 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_provider) {
    fetching_atomic_lkey = mr_fetching_atomic->lkey;
  }

-  for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=WF_SIZE) {
+  int deviceId;
+  CHECK_HIP(hipGetDevice(&deviceId));
+  int wf_size = get_wf_size(deviceId);
+  for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=wf_size) {
    fetching_atomic_freelist->push_back(fetching_atomic + i);
  }

@@ -56,14 +56,11 @@ ROBackend::ROBackend(MPI_Comm comm)
  profiler_proxy_ = ProfilerProxyT(envvar::max_num_contexts);

  int device_id;
-  hipDeviceProp_t device_props;
-
  CHECK_HIP(hipGetDevice(&device_id));
-  CHECK_HIP(hipGetDeviceProperties(&device_props, device_id));

-  max_wg_size_ = device_props.maxThreadsPerBlock;
+  max_wg_size_ = get_threads_per_block(device_id);

-  wf_size_ = device_props.warpSize;
+  wf_size_ = get_wf_size(device_id);

  setup_default_ctx_buffers();

@@ -44,6 +44,21 @@ typedef struct device_agent {
 std::vector<device_agent_t> gpu_agents;
 std::vector<device_agent_t> cpu_agents;

+std::vector<device_prop_t> device_properties;
+
+static void device_properties_init(void) {
+  int numDevices;
+  CHECK_HIP(hipGetDeviceCount(&numDevices));
+
+  device_prop_t prop;
+  hipDeviceProp_t hipprop;
+  for (int i=0; i<numDevices; i++) {
+    CHECK_HIP(hipGetDeviceProperties(&hipprop, i));
+    prop.warpSize = hipprop.warpSize;
+    prop.maxThreadsPerBlock = hipprop.maxThreadsPerBlock;
+    device_properties.push_back(prop);
+  }
+}
 hsa_status_t rocm_hsa_amd_memory_pool_callback(
    hsa_amd_memory_pool_t memory_pool, void* data) {
  hsa_amd_memory_pool_global_flag_t pool_flag{};
@@ -108,6 +123,8 @@ int rocm_init() {
    return 1;
  }

+  device_properties_init();
+
  return 0;
 }

@@ -30,6 +30,8 @@
 #include <hsa/hsa_ext_amd.h>

 #include <cstdio>
+#include <cassert>
+#include <vector>

 #include "rocshmem/rocshmem_config.h"  // NOLINT(build/include_subdir)
 #include "constants.hpp"
@@ -146,6 +148,24 @@ do {                                                                        \

 extern const int gpu_clock_freq_mhz;

+
+typedef struct device_prop {
+  int warpSize;
+  int maxThreadsPerBlock;
+} device_prop_t;
+
+extern std::vector<device_prop_t> device_properties;
+
+static int get_threads_per_block(int device_id) {
+  assert(device_properties.size() > device_id);
+  return device_properties[device_id].maxThreadsPerBlock;
+}
+
+static int get_wf_size(int device_id) {
+  assert(device_properties.size() > device_id);
+  return device_properties[device_id].warpSize;
+}
+
 /* Device-side internal functions */
 __device__ __forceinline__ uint32_t lowerID() {
  return __ffsll(__ballot(1)) - 1;
@@ -42,10 +42,12 @@
 
   /**
    * Shared array to capture the start time for each wavefront
-    * Max threads per block = 1024, wavefront size = 64 (in most GPUs)
-    * Maximum array size required = 1024/64 = 16
+   * Max threads per block = 1024, wavefront size = 64 or 32 depending
+   * on the GPUs. Using 32 since its safer for the dimensioning of the array,
+   * the last 16 elements will not be used on GPUs with a wf size of 64.
+   * Maximum array size required = 1024/32 = 32
    */
-   __shared__ long long int wf_start_time[16];
+   __shared__ long long int wf_start_time[32];
 
   /**
    * Calculate start index for each thread within the grid
@@ -190,4 +192,3 @@
     }
   }
 }
- 
@@ -43,10 +43,12 @@ __global__ void PrimitiveTest(int loop, int skip, long long int *start_time,

  /**
   * Shared array to capture the start time for each wavefront
-   * Max threads per block = 1024, wavefront size = 64 (in most GPUs)
-   * Maximum array size required = 1024/64 = 16
+   * Max threads per block = 1024, wavefront size = 64 or 32 depending
+   * on the GPUs. Using 32 since its safer for the dimensioning of the array,
+   * the last 16 elements will not be used on GPUs with a wf size of 64.
+   * Maximum array size required = 1024/32 = 32
   */
-  __shared__ long long int wf_start_time[16];
+  __shared__ long long int wf_start_time[32];

  /**
   * Calculate start index for each thread within the grid
@@ -43,10 +43,12 @@ __global__ void ShmemPtrTest(int loop, int skip, long long int *start_time,

  /**
   * Shared array to capture the start time for each wavefront
-   * Max threads per block = 1024, wavefront size = 64 (in most GPUs)
-   * Maximum array size required = 1024/64 = 16
+   * Max threads per block = 1024, wavefront size = 64 or 32 depending
+   * on the GPUs. Using 32 since its safer for the dimensioning of the array,
+   * the last 16 elements will not be used on GPUs with a wf size of 64.
+   * Maximum array size required = 1024/32 = 32
   */
-  __shared__ long long int wf_start_time[16];
+  __shared__ long long int wf_start_time[32];


  /**
@@ -47,10 +47,12 @@ __global__ void TeamCtxPrimitiveTest(int loop, int skip, long long int *start_ti

  /**
   * Shared array to capture the start time for each wavefront
-   * Max threads per block = 1024, wavefront size = 64 (in most GPUs)
-   * Maximum array size required = 1024/64 = 16
+   * Max threads per block = 1024, wavefront size = 64 or 32 depending
+   * on the GPUs. Using 32 since its safer for the dimensioning of the array,
+   * the last 16 elements will not be used on GPUs with a wf size of 64.
+   * Maximum array size required = 1024/32 = 32
   */
-  __shared__ long long int wf_start_time[16];
+  __shared__ long long int wf_start_time[32];

  /**
   * Calculate start index for each thread within the grid
@@ -144,12 +144,6 @@ TEST_F(BitwiseTestFixture, verify_host_warp_matrix_init_1024_8) {
  verify_zeroed_warp_matrix();
 }

-TEST_F(BitwiseTestFixture, verify_warp_size_64) {
-  setup_fixture({1, 1, 1}, {1, 1, 1});
-
-  ASSERT_EQ(WF_SIZE, 64);
-}
-
 /*****************************************************************************
 ************************** Activate Lane Helper******************************
 *****************************************************************************/
@@ -1402,7 +1396,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_4_1) {
  for (size_t i = 0; i < _warp_matrix->rows(); i++) {
    for (size_t j = 0; j < _warp_matrix->columns(); j++) {
      auto *elem = _warp_matrix->access(i, j);
-      ASSERT_EQ(*elem % WF_SIZE, 0);
+      ASSERT_EQ(*elem % this->_wf_size, 0);
    }
  }
 }
@@ -1426,7 +1420,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_64_1) {
  for (size_t i = 0; i < _warp_matrix->rows(); i++) {
    for (size_t j = 0; j < _warp_matrix->columns(); j++) {
      auto *elem = _warp_matrix->access(i, j);
-      ASSERT_EQ(*elem % WF_SIZE, 0);
+      ASSERT_EQ(*elem % this->_wf_size, 0);
    }
  }
 }
@@ -1498,7 +1492,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_1024_1024) {
  for (size_t i = 0; i < _warp_matrix->rows(); i++) {
    for (size_t j = 0; j < _warp_matrix->columns(); j++) {
      auto *elem = _warp_matrix->access(i, j);
-      ASSERT_EQ(*elem % WF_SIZE, 0);
+      ASSERT_EQ(*elem % this->_wf_size, 0);
    }
  }
 }
@@ -1522,7 +1516,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_logical_1_kernel_1024_1024) {
  for (size_t i = 0; i < _warp_matrix->rows(); i++) {
    for (size_t j = 0; j < _warp_matrix->columns(); j++) {
      auto *elem = _warp_matrix->access(i, j);
-      ASSERT_EQ(*elem % WF_SIZE, 1);
+      ASSERT_EQ(*elem % this->_wf_size, 1);
    }
  }
 }
@@ -28,6 +28,7 @@
 #define HIP_ENABLE_PRINTF

 #include "gtest/gtest.h"
+#include "wf_size.hpp"

 #include "../src/memory/hip_allocator.hpp"
 #include "containers/matrix.hpp"
@@ -259,6 +260,7 @@ class BitwiseTestFixture : public ::testing::Test
    {
        _hip_block_dim = block_dim;
        _hip_grid_dim = grid_dim;
+        _wf_size = get_wf_size();

        assert(_device_methods == nullptr);
        _hip_allocator.allocate(reinterpret_cast<void**>(&_device_methods),
@@ -341,6 +343,7 @@ class BitwiseTestFixture : public ::testing::Test
    HIPAllocator _hip_allocator {};
    WarpMatrix *_warp_matrix = nullptr;
    BitwiseDeviceMethods *_device_methods = nullptr;
+    int _wf_size;
 };

 } // namespace rocshmem
@@ -108,7 +108,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_device) {

  CHECK_HIP(hipMemset(results, 0, size_bytes));
  is_empty = reinterpret_cast<bool*>(results + h_input.size());
-  const auto block_size = WF_SIZE;
+  const auto block_size = this->wf_size;
  rocshmem::pop_all<<<1, block_size>>>(free_list, results, h_input.size());
  CHECK_HIP(hipDeviceSynchronize());

@@ -140,7 +140,7 @@ TYPED_TEST(FreeListTestFixture, push_host_concurrent_pop_device) {
  CHECK_HIP(hipMemset(results, 0, size_bytes));
  is_empty = reinterpret_cast<bool*>(results + h_input.size());
  const auto num_blocks = h_input.size();
-  const auto block_size = WF_SIZE;
+  const auto block_size = this->wf_size;
  rocshmem::pop_all<<<num_blocks, block_size>>>(
            free_list, results, h_input.size());
  CHECK_HIP(hipDeviceSynchronize());
@@ -184,7 +184,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_push_device) {
  CHECK_HIP(hipMemset(results, 0, size_bytes));
  d_input = reinterpret_cast<T*>(results + h_input.size());
  is_empty = reinterpret_cast<bool*>(d_input + h_input.size());
-  const auto block_size = WF_SIZE;
+  const auto block_size = this->wf_size;

  CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
                      hipMemcpyHostToDevice));
@@ -223,7 +223,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_concurrent_push_device) {

  CHECK_HIP(hipMemset(results, 0, size_bytes));
  d_input = reinterpret_cast<T*>(results + h_input.size());
-  const auto block_size = WF_SIZE;
+  const auto block_size = this->wf_size;

  CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
                      hipMemcpyHostToDevice));
@@ -277,7 +277,7 @@ TYPED_TEST(FreeListTestFixture, push_host_concurrent_pop_push_device) {
  CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
                      hipMemcpyHostToDevice));

-  const auto block_size = WF_SIZE;
+  const auto block_size = this->wf_size;
  rocshmem::pop_all<FreeListType, T><<<1, block_size>>>(
            free_list, nullptr, h_input.size());
  CHECK_HIP(hipDeviceSynchronize());
@@ -31,6 +31,7 @@
 #include "../src/containers/free_list_impl.hpp"
 #include "gtest/gtest.h"
 #include "../src/memory/hip_allocator.hpp"
+#include "wf_size.hpp"

 namespace rocshmem {

@@ -45,6 +46,7 @@ class FreeListTestFixture : public ::testing::Test {
 protected:
  void SetUp() override {
    free_list->push_back_range(h_input.begin(), h_input.end());
+    wf_size = get_wf_size();
  }

  using T = ValueType;
@@ -52,6 +54,7 @@ class FreeListTestFixture : public ::testing::Test {
  Allocator hip_allocator_ {};
  const std::size_t num_elements{32};
  std::vector<T> h_input{};
+  int wf_size;

  FreeListProxy<Allocator, T> list_proxy{};
  FreeList<T, Allocator>* free_list{};
@@ -28,16 +28,34 @@

 using namespace rocshmem;

-TEST_F(WavefrontSizeTestFixture, constant_matches_runtime) {
-  int device_count = 0;
-  hipDeviceProp_t prop;
-
-  CHECK_HIP(hipGetDeviceCount(&device_count));
-  ASSERT_GT(device_count, 0);
-
-  for (int i = 0; i < device_count; i++) {
-    CHECK_HIP(hipGetDeviceProperties(&prop, i));
-    ASSERT_EQ(WF_SIZE, prop.warpSize);
+__global__ void check_wf_size(int wf_size_prop, int *ret) {
+  if (wf_size_prop == WF_SIZE) {
+    *ret = 0;
+  } else {
+    *ret = 1;
  }
 }

+
+TEST_F(WavefrontSizeTestFixture, constant_matches_runtime) {
+  int device_count = 0;
+  hipDeviceProp_t prop;
+  int *ret;
+
+  CHECK_HIP(hipGetDeviceCount(&device_count));
+  ASSERT_GT(device_count, 0);
+  CHECK_HIP(hipHostMalloc(&ret, sizeof(int), 0));
+
+  for (int i = 0; i < device_count; i++) {
+    *ret = -1;
+    CHECK_HIP(hipSetDevice(i));
+    CHECK_HIP(hipGetDeviceProperties(&prop, i));
+
+    check_wf_size<<<1, 1>>>(prop.warpSize, ret);
+    CHECK_HIP(hipDeviceSynchronize());
+
+    ASSERT_EQ(*ret, 0);
+  }
+  CHECK_HIP(hipHostFree(ret));
+}
+
@@ -26,10 +26,19 @@
 #define ROCSHMEM_WAVEFRONT_SIZE_GTEST_HPP

 #include "gtest/gtest.h"
+#include "wf_size.hpp"

 namespace rocshmem {

-class WavefrontSizeTestFixture : public ::testing::Test { };
+class WavefrontSizeTestFixture : public ::testing::Test {
+public:
+  void SetUp() override {
+      wf_size = get_wf_size();
+  }
+
+protected:
+  int wf_size;
+};

 } // namespace rocshmem

@@ -0,0 +1,46 @@
+/******************************************************************************
+ * Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *****************************************************************************/
+
+#ifndef ROCSHMEM_WF_SIZE_HPP
+#define ROCSHMEM_WF_SIZE_HPP
+
+#include <hip/hip_runtime.h>
+#include "mpi.h"
+
+#define CHECK_HIP_MPI(cond) {                                      \
+  if(cond != hipSuccess){                                          \
+     fprintf(stderr,"HIP error: %d line: %d\n", cond,  __LINE__);  \
+     MPI_Abort(MPI_COMM_WORLD, 1);  	     	  	 	   \
+  }                                                                \
+}
+
+static int get_wf_size() {
+  int deviceId;
+  hipDeviceProp_t prop;
+  CHECK_HIP_MPI(hipGetDevice(&deviceId));
+  CHECK_HIP_MPI(hipGetDeviceProperties(&prop, deviceId));
+  return prop.warpSize;
+}
+
+#endif