add additional runtime checks and gfx1201 fix (#2806)
* add additional runtime checks and gfx1201 fix This commit contains three fixes: - increase the max. number of files at the beginning of the run to the max. allowed by the system - check for large BAR support. WE don not abort if its not available, but print a warning. - for gfx1201, do not use uncached memory at the moment. * Change get_arch_name to return const char* Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Fix C++ new syntax not sure how it compiled before Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * use snprintf instead of strncpy Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * destructor cleanip Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * add const keyword --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Cette révision appartient à :
@@ -37,19 +37,6 @@
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
static void setFilesLimit() {
|
||||
rlimit filesLimit;
|
||||
if (getrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
|
||||
DPRINTF("getrlimit failed\n");
|
||||
return;
|
||||
}
|
||||
filesLimit.rlim_cur = filesLimit.rlim_max;
|
||||
if (setrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
|
||||
DPRINTF("setrlimit failed\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Socket Interface Selection type */
|
||||
enum bootstrapInterface_t { findSubnetIf = -1, dontCareIf = -2 };
|
||||
|
||||
@@ -391,7 +378,6 @@ void TcpBootstrap::Impl::bootstrapRoot() {
|
||||
|
||||
std::memset(rankAddresses.data(), 0, sizeof(SocketAddress) * nRanks_);
|
||||
std::memset(rankAddressesRoot.data(), 0, sizeof(SocketAddress) * nRanks_);
|
||||
setFilesLimit();
|
||||
|
||||
DPRINTF("BEGIN bootstrapRoot\n");
|
||||
/* Receive addresses from all ranks */
|
||||
|
||||
@@ -112,6 +112,13 @@ IPCBackend::IPCBackend(TcpBootstrap *bootstrap): Backend(bootstrap) {
|
||||
void IPCBackend::init() {
|
||||
ROCSHMEM_HOST_CTX_DEFAULT.ctx_opaque = default_host_ctx.get();
|
||||
|
||||
const char *arch_name = get_arch_name(hip_dev_id);
|
||||
if (strncmp(arch_name, "gfx1201", strlen("gfx1201")) == 0) {
|
||||
fine_grained_allocator_ = new HIPAllocatorFinegrained();
|
||||
} else {
|
||||
fine_grained_allocator_ = new HIPDefaultFinegrainedAllocator();
|
||||
}
|
||||
|
||||
setup_team_world();
|
||||
|
||||
setup_wrk_sync_buffers();
|
||||
@@ -141,6 +148,14 @@ IPCBackend::~IPCBackend() {
|
||||
CHECK_HIP(hipFree(team_world));
|
||||
|
||||
CHECK_HIP(hipFree(ctx_array));
|
||||
if (fine_grained_allocator_) {
|
||||
const char *arch_name = get_arch_name(hip_dev_id);
|
||||
if (strncmp(arch_name, "gfx1201", strlen("gfx1201")) == 0) {
|
||||
delete static_cast<HIPAllocatorFinegrained *>(fine_grained_allocator_);
|
||||
} else {
|
||||
delete static_cast<HIPDefaultFinegrainedAllocator *>(fine_grained_allocator_);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void IPCBackend::setup_ctxs() {
|
||||
@@ -368,8 +383,8 @@ void IPCBackend::setup_wrk_sync_buffers() {
|
||||
* Allocate a buffer of size wrk_sync_pool_size_, using fine-grained
|
||||
* memory allocator
|
||||
*/
|
||||
fine_grained_allocator_.allocate((void**)&wrk_sync_pool_,
|
||||
wrk_sync_pool_size_);
|
||||
fine_grained_allocator_->allocate((void**)&wrk_sync_pool_,
|
||||
wrk_sync_pool_size_);
|
||||
assert(wrk_sync_pool_);
|
||||
wrk_sync_pool_top_ = wrk_sync_pool_;
|
||||
|
||||
@@ -400,7 +415,7 @@ void IPCBackend::setup_wrk_sync_buffers() {
|
||||
* Allocate device-side fine grained memory to hold IPC addresses of
|
||||
* work/sync buffers
|
||||
*/
|
||||
fine_grained_allocator_.allocate(
|
||||
fine_grained_allocator_->allocate(
|
||||
reinterpret_cast<void**>(&wrk_sync_pool_bases_),
|
||||
num_pes * sizeof(char*));
|
||||
assert(wrk_sync_pool_bases_);
|
||||
@@ -427,8 +442,8 @@ void IPCBackend::cleanup_wrk_sync_buffer() {
|
||||
CHECK_HIP(hipIpcCloseMemHandle(wrk_sync_pool_bases_[i]));
|
||||
}
|
||||
}
|
||||
fine_grained_allocator_.deallocate(wrk_sync_pool_bases_);
|
||||
fine_grained_allocator_.deallocate(wrk_sync_pool_);
|
||||
fine_grained_allocator_->deallocate(wrk_sync_pool_bases_);
|
||||
fine_grained_allocator_->deallocate(wrk_sync_pool_);
|
||||
}
|
||||
|
||||
void IPCBackend::setup_fence_buffer() {
|
||||
|
||||
@@ -258,7 +258,7 @@ class IPCBackend : public Backend {
|
||||
/**
|
||||
* Fine grained memory allocator for buffers used in collectives Routines
|
||||
*/
|
||||
HIPDefaultFinegrainedAllocator fine_grained_allocator_ {};
|
||||
MemoryAllocator *fine_grained_allocator_{nullptr};
|
||||
|
||||
/**
|
||||
* @brief Collective routines work/sync buffer size
|
||||
|
||||
@@ -60,6 +60,8 @@
|
||||
#include <random>
|
||||
#include <cassert>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
namespace rocshmem {
|
||||
|
||||
@@ -127,6 +129,18 @@ static BackendType select_backend_type() {
|
||||
return BackendType::IPC_BACKEND;
|
||||
}
|
||||
#endif
|
||||
static void setFilesLimit() {
|
||||
rlimit filesLimit;
|
||||
if (getrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
|
||||
DPRINTF("getrlimit failed\n");
|
||||
return;
|
||||
}
|
||||
filesLimit.rlim_cur = filesLimit.rlim_max;
|
||||
if (setrlimit(RLIMIT_NOFILE, &filesLimit) != 0) {
|
||||
DPRINTF("setrlimit failed\n");
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
[[maybe_unused]] __host__ void inline library_init(MPI_Comm comm) {
|
||||
assert(!backend);
|
||||
@@ -138,6 +152,7 @@ static BackendType select_backend_type() {
|
||||
abort();
|
||||
}
|
||||
|
||||
setFilesLimit();
|
||||
rocm_init();
|
||||
|
||||
int ret;
|
||||
@@ -255,6 +270,7 @@ static BackendType select_backend_type() {
|
||||
abort();
|
||||
}
|
||||
|
||||
setFilesLimit();
|
||||
rocm_init();
|
||||
|
||||
#if defined(USE_GDA) && defined(USE_RO) && defined(USE_IPC)
|
||||
|
||||
@@ -52,11 +52,21 @@ static void device_properties_init(void) {
|
||||
|
||||
device_prop_t prop;
|
||||
hipDeviceProp_t hipprop;
|
||||
int has_large_bar = 0;
|
||||
for (int i=0; i<numDevices; i++) {
|
||||
CHECK_HIP(hipGetDeviceProperties(&hipprop, i));
|
||||
prop.warpSize = hipprop.warpSize;
|
||||
prop.maxThreadsPerBlock = hipprop.maxThreadsPerBlock;
|
||||
std::snprintf(prop.gcnArchName, sizeof(prop.gcnArchName), "%s",
|
||||
hipprop.gcnArchName);
|
||||
device_properties.push_back(prop);
|
||||
|
||||
CHECK_HIP(hipDeviceGetAttribute (&has_large_bar, hipDeviceAttributeIsLargeBar, i));
|
||||
if (has_large_bar == 0) {
|
||||
// Large BAR required for IPC operations
|
||||
printf("Warning: Large BAR support is not enabled on device %d. "
|
||||
"This will impact IPC functionality on some systems.\n", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
hsa_status_t rocm_hsa_amd_memory_pool_callback(
|
||||
|
||||
@@ -157,6 +157,7 @@ extern const int gpu_clock_freq_mhz;
|
||||
typedef struct device_prop {
|
||||
int warpSize;
|
||||
int maxThreadsPerBlock;
|
||||
char gcnArchName[256];
|
||||
} device_prop_t;
|
||||
|
||||
extern std::vector<device_prop_t> device_properties;
|
||||
@@ -171,6 +172,11 @@ static int get_wf_size(int device_id) {
|
||||
return device_properties[device_id].warpSize;
|
||||
}
|
||||
|
||||
static const char* get_arch_name(int device_id) {
|
||||
assert(device_properties.size() > device_id);
|
||||
return device_properties[device_id].gcnArchName;
|
||||
}
|
||||
|
||||
/* Device-side internal functions */
|
||||
__device__ __forceinline__ uint32_t lowerID() {
|
||||
return __ffsll(__ballot(1)) - 1;
|
||||
|
||||
Référencer dans un nouveau ticket
Bloquer un utilisateur