add support for GPUs using wavefront size of 32 (#285)

* add gfx1100 support

Add support for Radeon 7900 GPUs (RX and PRO), and 7800 PRO.

I was contemplating to add gfx1101 and gfx1102 GPUs as well, but those are the lower end models that are more unlikely to be used for compute intensive jobs. In addition, I do not have access to them to test the support.

* update WF_SIZe for different options

Radeon systems use a WarpSize of 32, unlike current Instinct systems,
which use a warp size of 64. For the device side, a gfx specific ifdef
is sufficient. For the host side, we need to query the device
properties.

* adjust functional tests to wf_size of 32

* update unit tests to handle wf_size of 32

* address reviewer comments
This commit is contained in:
Edgar Gabriel
2025-10-22 16:04:58 -05:00
committed by GitHub
vanhempi 955c22aeed
commit d0c2845031
19 muutettua tiedostoa jossa 192 lisäystä ja 56 poistoa
+1
Näytä tiedosto
@@ -108,6 +108,7 @@ include(cmake/rocm_local_targets.cmake)
set(DEFAULT_GPUS
gfx90a:xnack-;
gfx90a:xnack+;
gfx1100;
gfx942)
if(${ROCM_MAJOR_VERSION} GREATER 6)
+16 -3
Näytä tiedosto
@@ -118,6 +118,15 @@ ExecTest() {
NUM_THREADS=$4
MAX_MSG_SIZE=$5
if command -v amd-smi >/dev/null
then
NUM_GPUS=$(amd-smi list | grep GPU | wc -l)
elif command -v rocm-smi >/dev/null
then
NUM_GPUS=$(rocm-smi --showserial | grep GPU | wc -l)
else
NUM_GPUS=64
fi
TIMEOUT=$((5 * 60)) # Timeout in seconds
TEST_NUM=${TEST_NUMBERS[$TEST_NAME]}
@@ -159,9 +168,13 @@ ExecTest() {
CMD+=" >> $LOG_DIR/$TEST_LOG_NAME.log 2>&1"
# Run Test
echo $TEST_LOG_NAME
echo "# $CMD" >"$LOG_DIR/$TEST_LOG_NAME.log"
eval $CMD
if [ $NUM_RANKS -le $NUM_GPUS ] && [[ "" == "$HOSTFILE" ]]; then
echo $TEST_LOG_NAME
echo "# $CMD" >"$LOG_DIR/$TEST_LOG_NAME.log"
eval $CMD
else
echo "Skipping test $TEST_LOG_NAME"
fi
# Validate Test
if [ $? -ne 0 ]
+8 -8
Näytä tiedosto
@@ -46,7 +46,7 @@ __device__ __forceinline__ int uncached_load_ubyte(uint8_t* src) {
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile(
"global_load_ubyte %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
@@ -69,7 +69,7 @@ __device__ __forceinline__ void refresh_volatile_sbyte(volatile int *assigned_va
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile(
"global_load_sbyte %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
@@ -91,7 +91,7 @@ __device__ __forceinline__ void refresh_volatile_dwordx2(volatile uint64_t *assi
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc\n "
"s_waitcnt vmcnt(0)"
@@ -122,7 +122,7 @@ NOWARN(-Wdeprecated-volatile,
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile(
"global_load_dword %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
@@ -142,7 +142,7 @@ NOWARN(-Wdeprecated-volatile,
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile(
"global_load_dwordx2 %0 %1 off glc slc \n"
"s_waitcnt vmcnt(0)"
@@ -191,7 +191,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile("flat_store_short %0 %1 glc slc" : : "v"(dst), "v"(val16));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
@@ -205,7 +205,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile("flat_store_dword %0 %1 glc slc" : : "v"(dst), "v"(val32));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
@@ -219,7 +219,7 @@ __device__ __forceinline__ void store_asm(uint8_t* val, uint8_t* dst,
#endif
#if defined(__gfx908__)
#endif
#if defined(__gfx90a__)
#if defined(__gfx90a__) || defined (__gfx1100__)
asm volatile("flat_store_dwordx2 %0 %1 glc slc" : : "v"(dst), "v"(val64));
#endif
#if defined(__gfx942__) || defined(__gfx950__)
+4
Näytä tiedosto
@@ -57,7 +57,11 @@ inline const unsigned MAX_WG_SIZE{1024};
*
* @note Wavefront size on most systems is either 32 or 64.
*/
#if defined(__gfx90a__) || defined(__gfx942__) || defined (__gfx950__)
inline const int WF_SIZE{64};
#else
inline const int WF_SIZE{32};
#endif
} // namespace rocshmem
+5 -1
Näytä tiedosto
@@ -28,6 +28,7 @@
#include "backend_gda.hpp"
#include "constants.hpp"
#include "util.hpp"
namespace rocshmem {
@@ -59,7 +60,10 @@ QueuePair::QueuePair(struct ibv_pd* pd, int gda_provider) {
fetching_atomic_lkey = mr_fetching_atomic->lkey;
}
for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=WF_SIZE) {
int deviceId;
CHECK_HIP(hipGetDevice(&deviceId));
int wf_size = get_wf_size(deviceId);
for(int i{0}; i < FETCHING_ATOMIC_CNT; i+=wf_size) {
fetching_atomic_freelist->push_back(fetching_atomic + i);
}
+2 -5
Näytä tiedosto
@@ -56,14 +56,11 @@ ROBackend::ROBackend(MPI_Comm comm)
profiler_proxy_ = ProfilerProxyT(envvar::max_num_contexts);
int device_id;
hipDeviceProp_t device_props;
CHECK_HIP(hipGetDevice(&device_id));
CHECK_HIP(hipGetDeviceProperties(&device_props, device_id));
max_wg_size_ = device_props.maxThreadsPerBlock;
max_wg_size_ = get_threads_per_block(device_id);
wf_size_ = device_props.warpSize;
wf_size_ = get_wf_size(device_id);
setup_default_ctx_buffers();
+17
Näytä tiedosto
@@ -44,6 +44,21 @@ typedef struct device_agent {
std::vector<device_agent_t> gpu_agents;
std::vector<device_agent_t> cpu_agents;
std::vector<device_prop_t> device_properties;
static void device_properties_init(void) {
int numDevices;
CHECK_HIP(hipGetDeviceCount(&numDevices));
device_prop_t prop;
hipDeviceProp_t hipprop;
for (int i=0; i<numDevices; i++) {
CHECK_HIP(hipGetDeviceProperties(&hipprop, i));
prop.warpSize = hipprop.warpSize;
prop.maxThreadsPerBlock = hipprop.maxThreadsPerBlock;
device_properties.push_back(prop);
}
}
hsa_status_t rocm_hsa_amd_memory_pool_callback(
hsa_amd_memory_pool_t memory_pool, void* data) {
hsa_amd_memory_pool_global_flag_t pool_flag{};
@@ -108,6 +123,8 @@ int rocm_init() {
return 1;
}
device_properties_init();
return 0;
}
+20
Näytä tiedosto
@@ -30,6 +30,8 @@
#include <hsa/hsa_ext_amd.h>
#include <cstdio>
#include <cassert>
#include <vector>
#include "rocshmem/rocshmem_config.h" // NOLINT(build/include_subdir)
#include "constants.hpp"
@@ -146,6 +148,24 @@ do { \
extern const int gpu_clock_freq_mhz;
typedef struct device_prop {
int warpSize;
int maxThreadsPerBlock;
} device_prop_t;
extern std::vector<device_prop_t> device_properties;
static int get_threads_per_block(int device_id) {
assert(device_properties.size() > device_id);
return device_properties[device_id].maxThreadsPerBlock;
}
static int get_wf_size(int device_id) {
assert(device_properties.size() > device_id);
return device_properties[device_id].warpSize;
}
/* Device-side internal functions */
__device__ __forceinline__ uint32_t lowerID() {
return __ffsll(__ballot(1)) - 1;
@@ -42,10 +42,12 @@
/**
* Shared array to capture the start time for each wavefront
* Max threads per block = 1024, wavefront size = 64 (in most GPUs)
* Maximum array size required = 1024/64 = 16
* Max threads per block = 1024, wavefront size = 64 or 32 depending
* on the GPUs. Using 32 since its safer for the dimensioning of the array,
* the last 16 elements will not be used on GPUs with a wf size of 64.
* Maximum array size required = 1024/32 = 32
*/
__shared__ long long int wf_start_time[16];
__shared__ long long int wf_start_time[32];
/**
* Calculate start index for each thread within the grid
@@ -190,4 +192,3 @@
}
}
}
@@ -43,10 +43,12 @@ __global__ void PrimitiveTest(int loop, int skip, long long int *start_time,
/**
* Shared array to capture the start time for each wavefront
* Max threads per block = 1024, wavefront size = 64 (in most GPUs)
* Maximum array size required = 1024/64 = 16
* Max threads per block = 1024, wavefront size = 64 or 32 depending
* on the GPUs. Using 32 since its safer for the dimensioning of the array,
* the last 16 elements will not be used on GPUs with a wf size of 64.
* Maximum array size required = 1024/32 = 32
*/
__shared__ long long int wf_start_time[16];
__shared__ long long int wf_start_time[32];
/**
* Calculate start index for each thread within the grid
@@ -43,10 +43,12 @@ __global__ void ShmemPtrTest(int loop, int skip, long long int *start_time,
/**
* Shared array to capture the start time for each wavefront
* Max threads per block = 1024, wavefront size = 64 (in most GPUs)
* Maximum array size required = 1024/64 = 16
* Max threads per block = 1024, wavefront size = 64 or 32 depending
* on the GPUs. Using 32 since its safer for the dimensioning of the array,
* the last 16 elements will not be used on GPUs with a wf size of 64.
* Maximum array size required = 1024/32 = 32
*/
__shared__ long long int wf_start_time[16];
__shared__ long long int wf_start_time[32];
/**
@@ -47,10 +47,12 @@ __global__ void TeamCtxPrimitiveTest(int loop, int skip, long long int *start_ti
/**
* Shared array to capture the start time for each wavefront
* Max threads per block = 1024, wavefront size = 64 (in most GPUs)
* Maximum array size required = 1024/64 = 16
* Max threads per block = 1024, wavefront size = 64 or 32 depending
* on the GPUs. Using 32 since its safer for the dimensioning of the array,
* the last 16 elements will not be used on GPUs with a wf size of 64.
* Maximum array size required = 1024/32 = 32
*/
__shared__ long long int wf_start_time[16];
__shared__ long long int wf_start_time[32];
/**
* Calculate start index for each thread within the grid
+4 -10
Näytä tiedosto
@@ -144,12 +144,6 @@ TEST_F(BitwiseTestFixture, verify_host_warp_matrix_init_1024_8) {
verify_zeroed_warp_matrix();
}
TEST_F(BitwiseTestFixture, verify_warp_size_64) {
setup_fixture({1, 1, 1}, {1, 1, 1});
ASSERT_EQ(WF_SIZE, 64);
}
/*****************************************************************************
************************** Activate Lane Helper******************************
*****************************************************************************/
@@ -1402,7 +1396,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_4_1) {
for (size_t i = 0; i < _warp_matrix->rows(); i++) {
for (size_t j = 0; j < _warp_matrix->columns(); j++) {
auto *elem = _warp_matrix->access(i, j);
ASSERT_EQ(*elem % WF_SIZE, 0);
ASSERT_EQ(*elem % this->_wf_size, 0);
}
}
}
@@ -1426,7 +1420,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_64_1) {
for (size_t i = 0; i < _warp_matrix->rows(); i++) {
for (size_t j = 0; j < _warp_matrix->columns(); j++) {
auto *elem = _warp_matrix->access(i, j);
ASSERT_EQ(*elem % WF_SIZE, 0);
ASSERT_EQ(*elem % this->_wf_size, 0);
}
}
}
@@ -1498,7 +1492,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_kernel_1024_1024) {
for (size_t i = 0; i < _warp_matrix->rows(); i++) {
for (size_t j = 0; j < _warp_matrix->columns(); j++) {
auto *elem = _warp_matrix->access(i, j);
ASSERT_EQ(*elem % WF_SIZE, 0);
ASSERT_EQ(*elem % this->_wf_size, 0);
}
}
}
@@ -1522,7 +1516,7 @@ TEST_F(BitwiseTestFixture, fetch_incr_logical_1_kernel_1024_1024) {
for (size_t i = 0; i < _warp_matrix->rows(); i++) {
for (size_t j = 0; j < _warp_matrix->columns(); j++) {
auto *elem = _warp_matrix->access(i, j);
ASSERT_EQ(*elem % WF_SIZE, 1);
ASSERT_EQ(*elem % this->_wf_size, 1);
}
}
}
@@ -28,6 +28,7 @@
#define HIP_ENABLE_PRINTF
#include "gtest/gtest.h"
#include "wf_size.hpp"
#include "../src/memory/hip_allocator.hpp"
#include "containers/matrix.hpp"
@@ -259,6 +260,7 @@ class BitwiseTestFixture : public ::testing::Test
{
_hip_block_dim = block_dim;
_hip_grid_dim = grid_dim;
_wf_size = get_wf_size();
assert(_device_methods == nullptr);
_hip_allocator.allocate(reinterpret_cast<void**>(&_device_methods),
@@ -341,6 +343,7 @@ class BitwiseTestFixture : public ::testing::Test
HIPAllocator _hip_allocator {};
WarpMatrix *_warp_matrix = nullptr;
BitwiseDeviceMethods *_device_methods = nullptr;
int _wf_size;
};
} // namespace rocshmem
+5 -5
Näytä tiedosto
@@ -108,7 +108,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_device) {
CHECK_HIP(hipMemset(results, 0, size_bytes));
is_empty = reinterpret_cast<bool*>(results + h_input.size());
const auto block_size = WF_SIZE;
const auto block_size = this->wf_size;
rocshmem::pop_all<<<1, block_size>>>(free_list, results, h_input.size());
CHECK_HIP(hipDeviceSynchronize());
@@ -140,7 +140,7 @@ TYPED_TEST(FreeListTestFixture, push_host_concurrent_pop_device) {
CHECK_HIP(hipMemset(results, 0, size_bytes));
is_empty = reinterpret_cast<bool*>(results + h_input.size());
const auto num_blocks = h_input.size();
const auto block_size = WF_SIZE;
const auto block_size = this->wf_size;
rocshmem::pop_all<<<num_blocks, block_size>>>(
free_list, results, h_input.size());
CHECK_HIP(hipDeviceSynchronize());
@@ -184,7 +184,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_push_device) {
CHECK_HIP(hipMemset(results, 0, size_bytes));
d_input = reinterpret_cast<T*>(results + h_input.size());
is_empty = reinterpret_cast<bool*>(d_input + h_input.size());
const auto block_size = WF_SIZE;
const auto block_size = this->wf_size;
CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
hipMemcpyHostToDevice));
@@ -223,7 +223,7 @@ TYPED_TEST(FreeListTestFixture, push_host_pop_concurrent_push_device) {
CHECK_HIP(hipMemset(results, 0, size_bytes));
d_input = reinterpret_cast<T*>(results + h_input.size());
const auto block_size = WF_SIZE;
const auto block_size = this->wf_size;
CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
hipMemcpyHostToDevice));
@@ -277,7 +277,7 @@ TYPED_TEST(FreeListTestFixture, push_host_concurrent_pop_push_device) {
CHECK_HIP(hipMemcpy(d_input, h_input.data(), sizeof(T) * h_input.size(),
hipMemcpyHostToDevice));
const auto block_size = WF_SIZE;
const auto block_size = this->wf_size;
rocshmem::pop_all<FreeListType, T><<<1, block_size>>>(
free_list, nullptr, h_input.size());
CHECK_HIP(hipDeviceSynchronize());
@@ -31,6 +31,7 @@
#include "../src/containers/free_list_impl.hpp"
#include "gtest/gtest.h"
#include "../src/memory/hip_allocator.hpp"
#include "wf_size.hpp"
namespace rocshmem {
@@ -45,6 +46,7 @@ class FreeListTestFixture : public ::testing::Test {
protected:
void SetUp() override {
free_list->push_back_range(h_input.begin(), h_input.end());
wf_size = get_wf_size();
}
using T = ValueType;
@@ -52,6 +54,7 @@ class FreeListTestFixture : public ::testing::Test {
Allocator hip_allocator_ {};
const std::size_t num_elements{32};
std::vector<T> h_input{};
int wf_size;
FreeListProxy<Allocator, T> list_proxy{};
FreeList<T, Allocator>* free_list{};
@@ -28,16 +28,34 @@
using namespace rocshmem;
TEST_F(WavefrontSizeTestFixture, constant_matches_runtime) {
int device_count = 0;
hipDeviceProp_t prop;
CHECK_HIP(hipGetDeviceCount(&device_count));
ASSERT_GT(device_count, 0);
for (int i = 0; i < device_count; i++) {
CHECK_HIP(hipGetDeviceProperties(&prop, i));
ASSERT_EQ(WF_SIZE, prop.warpSize);
__global__ void check_wf_size(int wf_size_prop, int *ret) {
if (wf_size_prop == WF_SIZE) {
*ret = 0;
} else {
*ret = 1;
}
}
TEST_F(WavefrontSizeTestFixture, constant_matches_runtime) {
int device_count = 0;
hipDeviceProp_t prop;
int *ret;
CHECK_HIP(hipGetDeviceCount(&device_count));
ASSERT_GT(device_count, 0);
CHECK_HIP(hipHostMalloc(&ret, sizeof(int), 0));
for (int i = 0; i < device_count; i++) {
*ret = -1;
CHECK_HIP(hipSetDevice(i));
CHECK_HIP(hipGetDeviceProperties(&prop, i));
check_wf_size<<<1, 1>>>(prop.warpSize, ret);
CHECK_HIP(hipDeviceSynchronize());
ASSERT_EQ(*ret, 0);
}
CHECK_HIP(hipHostFree(ret));
}
@@ -26,10 +26,19 @@
#define ROCSHMEM_WAVEFRONT_SIZE_GTEST_HPP
#include "gtest/gtest.h"
#include "wf_size.hpp"
namespace rocshmem {
class WavefrontSizeTestFixture : public ::testing::Test { };
class WavefrontSizeTestFixture : public ::testing::Test {
public:
void SetUp() override {
wf_size = get_wf_size();
}
protected:
int wf_size;
};
} // namespace rocshmem
+46
Näytä tiedosto
@@ -0,0 +1,46 @@
/******************************************************************************
* Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*****************************************************************************/
#ifndef ROCSHMEM_WF_SIZE_HPP
#define ROCSHMEM_WF_SIZE_HPP
#include <hip/hip_runtime.h>
#include "mpi.h"
#define CHECK_HIP_MPI(cond) { \
if(cond != hipSuccess){ \
fprintf(stderr,"HIP error: %d line: %d\n", cond, __LINE__); \
MPI_Abort(MPI_COMM_WORLD, 1); \
} \
}
static int get_wf_size() {
int deviceId;
hipDeviceProp_t prop;
CHECK_HIP_MPI(hipGetDevice(&deviceId));
CHECK_HIP_MPI(hipGetDeviceProperties(&prop, deviceId));
return prop.warpSize;
}
#endif