From 005aebbfad6c2b8e9e2a541e16fd58d0405233b9 Mon Sep 17 00:00:00 2001 From: German Date: Fri, 27 Oct 2023 15:00:15 -0400 Subject: [PATCH] SWDEV-429529 - Allocate glb_ctx_ even for one device Move context allocation into Device::init() method to simplify the logic and handle HIP_VISIBLE_DEVICES properly Change-Id: I0fc6f37c7ae39bedbdad0290295d6794c66d6c54 [ROCm/clr commit: a49d633883381782ff82b7a91e962834f27ee43f] --- projects/clr/rocclr/device/pal/paldevice.cpp | 53 +++++++------ projects/clr/rocclr/device/rocm/rocdevice.cpp | 78 +++++++++---------- 2 files changed, 63 insertions(+), 68 deletions(-) diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index c8339da8fd..4856c5365c 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -872,6 +872,7 @@ extern const char* SchedulerSourceCode; extern const char* SchedulerSourceCode20; extern const char* TrapHandlerCode; +// ================================================================================================ bool Device::create(Pal::IDevice* device) { resourceList_ = new std::unordered_set(); if (nullptr == resourceList_) { @@ -1066,33 +1067,6 @@ bool Device::create(Pal::IDevice* device) { return false; } - if ((glb_ctx_ == nullptr) && (gNumDevices > 1) && (device == gDeviceList[gNumDevices - 1])) { - std::vector devices; - uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true); - // Add all PAL devices - for (uint32_t i = gStartDevice; i < numDevices; ++i) { - devices.push_back(amd::Device::devices()[i]); - } - // Add current - devices.push_back(this); - - if (devices.size() > 1) { - // Create a dummy context - glb_ctx_ = new amd::Context(devices, info); - if (glb_ctx_ == nullptr) { - return false; - } - amd::Buffer* buf = - new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); - if ((buf != nullptr) && buf->create()) { - p2p_stage_ = buf; - } else { - delete buf; - return false; - } - } - } - return true; } @@ -1333,6 +1307,7 @@ static void parseRequestedDeviceList(const char* requestedDeviceList, } } +// ================================================================================================ bool Device::init() { gStartDevice = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, true); bool useDeviceList = false; @@ -1438,10 +1413,34 @@ bool Device::init() { } } } + + // Query active devices only + constexpr bool kNoOfflineDevices = false; + std::vector devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices); + if (devices.size() > 0) { + // Create a dummy context for internal memory allocations on all reported devices + glb_ctx_ = new amd::Context(devices, amd::Context::Info()); + if (glb_ctx_ == nullptr) { + return false; + } + // Allocate a staging buffer for P2P emulation path + if (devices.size() > 1) { + amd::Buffer* buf = + new (*glb_ctx_) amd::Buffer(*glb_ctx_, CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); + if ((buf != nullptr) && buf->create()) { + p2p_stage_ = buf; + } else { + delete buf; + return false; + } + } + } } + return true; } +// ================================================================================================ void Device::tearDown() { if (platform_ != nullptr) { platform_->Destroy(); diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index daefffb2bd..2bf54b0432 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -455,6 +455,7 @@ void Device::XferBuffers::release(VirtualGPU& gpu, Memory& buffer) { --acquiredCnt_; } +// ================================================================================================ bool Device::init() { ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Initializing HSA stack."); @@ -543,21 +544,54 @@ bool Device::init() { roc_device.release()->registerDevice(); } - if (0 != Device::numDevices(CL_DEVICE_TYPE_GPU, false)) { + // Query active devices only + constexpr bool kNoOfflineDevices = false; + std::vector devices = getDevices(CL_DEVICE_TYPE_GPU, kNoOfflineDevices); + if (devices.size() > 0) { + bool p2p_available = false; // Loop through all available devices - for (auto device1: Device::devices()) { + for (auto device1: devices) { // Find all agents that can have access to the current device for (auto agent: static_cast(device1)->p2pAgents()) { // Find cl_device_id associated with the current agent - for (auto device2: Device::devices()) { + for (auto device2: devices) { if (agent.handle == static_cast(device2)->getBackendDevice().handle) { // Device2 can have access to device1 device2->p2pDevices_.push_back(as_cl(device1)); device1->p2p_access_devices_.push_back(device2); + p2p_available = true; } } } } + + // Create a dummy context for internal memory allocations on all reported devices + glb_ctx_ = new amd::Context(devices, amd::Context::Info()); + if (glb_ctx_ == nullptr) { + return false; + } + + // Allocate a staging buffer for P2P emulation path + if ((devices.size() >= 1) && !p2p_available) { + amd::Buffer* buf = + new (*glb_ctx_) amd::Buffer(*glb_ctx_, CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); + if ((buf != nullptr) && buf->create()) { + p2p_stage_ = buf; + } else { + delete buf; + return false; + } + } + + // Allocate mgpu sync buffer for cooperative launches + if (amd::IS_HIP) { + mg_sync_ = reinterpret_cast
(amd::SvmBuffer::malloc( + *glb_ctx_, (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS), + kMGInfoSizePerDevice * devices.size(), kMGInfoSizePerDevice)); + if (mg_sync_ == nullptr) { + return false; + } + } } return true; @@ -741,44 +775,6 @@ bool Device::create() { // Use just 1 entry by default for the map cache mapCache_->push_back(nullptr); - if ((glb_ctx_ == nullptr) && (gpu_agents_.size() >= 1) && - // Allow creation for the last device in the list. - (gpu_agents_[gpu_agents_.size() - 1].handle == bkendDevice_.handle)) { - std::vector devices; - uint32_t numDevices = amd::Device::numDevices(CL_DEVICE_TYPE_GPU, false); - // Add all PAL devices - for (uint32_t i = 0; i < numDevices; ++i) { - devices.push_back(amd::Device::devices()[i]); - } - // Add current - devices.push_back(this); - // Create a dummy context - glb_ctx_ = new amd::Context(devices, info); - if (glb_ctx_ == nullptr) { - return false; - } - - if ((p2p_agents_.size() < (devices.size()-1)) && (devices.size() > 1)) { - amd::Buffer* buf = new (GlbCtx()) amd::Buffer(GlbCtx(), CL_MEM_ALLOC_HOST_PTR, kP2PStagingSize); - if ((buf != nullptr) && buf->create()) { - p2p_stage_ = buf; - } - else { - delete buf; - return false; - } - } - // Check if sync buffer wasn't allocated yet - if (amd::IS_HIP && mg_sync_ == nullptr) { - mg_sync_ = reinterpret_cast
(amd::SvmBuffer::malloc( - GlbCtx(), (CL_MEM_SVM_FINE_GRAIN_BUFFER | CL_MEM_SVM_ATOMICS), - kMGInfoSizePerDevice * GlbCtx().devices().size(), kMGInfoSizePerDevice)); - if (mg_sync_ == nullptr) { - return false; - } - } - } - if (settings().stagedXferSize_ != 0) { // Initialize staged write buffers if (settings().stagedXferWrite_) {