add additional runtime checks and gfx1201 fix (#2806)

* add additional runtime checks and gfx1201 fix

This commit contains three fixes:
 - increase the max. number of files at the beginning of the run to the
   max. allowed by the system
 - check for large BAR support. WE don not abort if its not available,
   but print a warning.
 - for gfx1201, do not use uncached memory at the moment.

* Change get_arch_name to return const char*

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Fix C++ new syntax

not sure how it compiled before

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* use snprintf instead of strncpy

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* destructor cleanip

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* add const keyword

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Cette révision appartient à :
Edgar Gabriel
2026-01-24 13:31:10 -06:00
révisé par GitHub
Parent 5a7a32bc12
révision 72286016c6
6 fichiers modifiés avec 53 ajouts et 20 suppressions
-14
Voir le fichier
@@ -37,19 +37,6 @@
namespace rocshmem {
static void setFilesLimit() {
rlimit filesLimit;
if (getrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
DPRINTF("getrlimit failed\n");
return;
}
filesLimit.rlim_cur = filesLimit.rlim_max;
if (setrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
DPRINTF("setrlimit failed\n");
return;
}
}
/* Socket Interface Selection type */
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
@@ -391,7 +378,6 @@ void TcpBootstrap::Impl::bootstrapRoot() {
std::memset(rankAddresses.data(), 0, sizeof(SocketAddress) * nRanks_);
std::memset(rankAddressesRoot.data(), 0, sizeof(SocketAddress) * nRanks_);
setFilesLimit();
DPRINTF("BEGIN bootstrapRoot\n");
/* Receive addresses from all ranks */
+20 -5
Voir le fichier
@@ -112,6 +112,13 @@ IPCBackend::IPCBackend(TcpBootstrap *bootstrap): Backend(bootstrap) {
void IPCBackend::init() {
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
const char *arch_name = get_arch_name(hip_dev_id);
if (strncmp(arch_name, "gfx1201", strlen("gfx1201")) == 0) {
fine_grained_allocator_ = new HIPAllocatorFinegrained();
} else {
fine_grained_allocator_ = new HIPDefaultFinegrainedAllocator();
}
setup_team_world();
setup_wrk_sync_buffers();
@@ -141,6 +148,14 @@ IPCBackend::~IPCBackend() {
CHECK_HIP(hipFree(team_world));
CHECK_HIP(hipFree(ctx_array));
if (fine_grained_allocator_) {
const char *arch_name = get_arch_name(hip_dev_id);
if (strncmp(arch_name, "gfx1201", strlen("gfx1201")) == 0) {
delete static_cast<HIPAllocatorFinegrained *>(fine_grained_allocator_);
} else {
delete static_cast<HIPDefaultFinegrainedAllocator *>(fine_grained_allocator_);
}
}
}
void IPCBackend::setup_ctxs() {
@@ -368,8 +383,8 @@ void IPCBackend::setup_wrk_sync_buffers() {
* Allocate a buffer of size wrk_sync_pool_size_, using fine-grained
* memory allocator
*/
fine_grained_allocator_.allocate((void**)&wrk_sync_pool_,
wrk_sync_pool_size_);
fine_grained_allocator_->allocate((void**)&wrk_sync_pool_,
wrk_sync_pool_size_);
assert(wrk_sync_pool_);
wrk_sync_pool_top_ = wrk_sync_pool_;
@@ -400,7 +415,7 @@ void IPCBackend::setup_wrk_sync_buffers() {
* Allocate device-side fine grained memory to hold IPC addresses of
* work/sync buffers
*/
fine_grained_allocator_.allocate(
fine_grained_allocator_->allocate(
reinterpret_cast<void**>(&wrk_sync_pool_bases_),
num_pes * sizeof(char*));
assert(wrk_sync_pool_bases_);
@@ -427,8 +442,8 @@ void IPCBackend::cleanup_wrk_sync_buffer() {
CHECK_HIP(hipIpcCloseMemHandle(wrk_sync_pool_bases_[i]));
}
}
fine_grained_allocator_.deallocate(wrk_sync_pool_bases_);
fine_grained_allocator_.deallocate(wrk_sync_pool_);
fine_grained_allocator_->deallocate(wrk_sync_pool_bases_);
fine_grained_allocator_->deallocate(wrk_sync_pool_);
}
void IPCBackend::setup_fence_buffer() {
+1 -1
Voir le fichier
@@ -258,7 +258,7 @@ class IPCBackend : public Backend {
/**
* Fine grained memory allocator for buffers used in collectives Routines
*/
HIPDefaultFinegrainedAllocator fine_grained_allocator_ {};
MemoryAllocator *fine_grained_allocator_{nullptr};
/**
* @brief Collective routines work/sync buffer size
+16
Voir le fichier
@@ -60,6 +60,8 @@
#include <random>
#include <cassert>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
namespace rocshmem {
@@ -127,6 +129,18 @@ static BackendType select_backend_type() {
return BackendType::IPC_BACKEND;
}
#endif
static void setFilesLimit() {
rlimit filesLimit;
if (getrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
DPRINTF("getrlimit failed\n");
return;
}
filesLimit.rlim_cur = filesLimit.rlim_max;
if (setrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
DPRINTF("setrlimit failed\n");
return;
}
}
[[maybe_unused]] __host__ void inline library_init(MPI_Comm comm) {
assert(!backend);
@@ -138,6 +152,7 @@ static BackendType select_backend_type() {
abort();
}
setFilesLimit();
rocm_init();
int ret;
@@ -255,6 +270,7 @@ static BackendType select_backend_type() {
abort();
}
setFilesLimit();
rocm_init();
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
+10
Voir le fichier
@@ -52,11 +52,21 @@ static void device_properties_init(void) {
device_prop_t prop;
hipDeviceProp_t hipprop;
int has_large_bar = 0;
for (int i=0; i<numDevices; i++) {
CHECK_HIP(hipGetDeviceProperties(&hipprop, i));
prop.warpSize = hipprop.warpSize;
prop.maxThreadsPerBlock = hipprop.maxThreadsPerBlock;
std::snprintf(prop.gcnArchName, sizeof(prop.gcnArchName), "%s",
hipprop.gcnArchName);
device_properties.push_back(prop);
CHECK_HIP(hipDeviceGetAttribute (&has_large_bar, hipDeviceAttributeIsLargeBar, i));
if (has_large_bar == 0) {
// Large BAR required for IPC operations
printf("Warning: Large BAR support is not enabled on device %d. "
"This will impact IPC functionality on some systems.\n", i);
}
}
}
hsa_status_t rocm_hsa_amd_memory_pool_callback(
+6
Voir le fichier
@@ -157,6 +157,7 @@ extern const int gpu_clock_freq_mhz;
typedef struct device_prop {
int warpSize;
int maxThreadsPerBlock;
char gcnArchName[256];
} device_prop_t;
extern std::vector<device_prop_t> device_properties;
@@ -171,6 +172,11 @@ static int get_wf_size(int device_id) {
return device_properties[device_id].warpSize;
}
static const char* get_arch_name(int device_id) {
assert(device_properties.size() > device_id);
return device_properties[device_id].gcnArchName;
}
/* Device-side internal functions */
__device__ __forceinline__ uint32_t lowerID() {
return __ffsll(__ballot(1)) - 1;