diff --git a/rocclr/device/blit.hpp b/rocclr/device/blit.hpp
index 794bfa15ca..3019640352 100644
--- a/rocclr/device/blit.hpp
+++ b/rocclr/device/blit.hpp
@@ -196,6 +196,9 @@ class BlitManager : public amd::HeapObject {
   //! Enables synchronization on blit operations
   void enableSynchronization() { syncOperation_ = true; }
 
+  //! Disables synchronization on blit operations
+  void disableSynchronization() { syncOperation_ = false; }
+
   //! Returns Xfer queue lock
   virtual amd::Monitor* lockXfer() const { return nullptr; }
 
diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp
index 8df0078fa6..d89f526d58 100644
--- a/rocclr/device/device.cpp
+++ b/rocclr/device/device.cpp
@@ -22,6 +22,7 @@
 #include "thread/monitor.hpp"
 #include "utils/options.hpp"
 #include "comgrctx.hpp"
+#include "blit.hpp"
 #include
 #include
 
@@ -472,7 +473,7 @@ Device::~Device() {
   }
 
   if (heap_buffer_ != nullptr) {
-    delete heap_buffer_;
+    heap_buffer_->release();
     heap_buffer_ = nullptr;
   }
 
@@ -511,6 +512,25 @@ bool Device::ValidateHsail() {
   return true;
 }
 
+// ================================================================================================
+// Creates a CL_MEM_READ_WRITE amd::Buffer of 'size' bytes in context() and makes sure device
+// memory exists for this device. Returns nullptr (after logging) on any failure.
+amd::Memory* Device::createPrivateBuffer(size_t size) const {
+  auto buffer = new (context()) amd::Buffer(context(), CL_MEM_READ_WRITE, size);
+  if (nullptr == buffer) {
+    LogError("Couldn't allocate internal buffer on device!");
+    return nullptr;
+  }
+  // Drop the reference on either failure path, so the object doesn't leak
+  if (!buffer->create(nullptr) || (nullptr == buffer->getDeviceMemory(*this))) {
+    buffer->release();
+    LogError("Couldn't allocate internal buffer on device!");
+    return nullptr;
+  }
+  return buffer;
+}
+
+// ================================================================================================
 bool Device::create(const Isa &isa) {
   assert(!vaCacheAccess_ && !vaCacheMap_);
   isa_ = &isa;
@@ -525,6 +545,7 @@ bool Device::create(const Isa &isa) {
   return true;
 }
 
+// ================================================================================================
 void 
Device::registerDevice() {
   assert(Runtime::singleThreaded() && "this is not thread-safe");
 
@@ -549,6 +570,39 @@ void Device::registerDevice() {
   devices_->push_back(this);
 }
 
+// ================================================================================================
+// Creates a device queue (virtual device) and, on the first successful creation for this device,
+// zeroes the device heap buffer through that queue's blit manager.
+device::VirtualDevice* Device::CreateDeviceQueue(CommandQueue* queue) {
+  auto vgpu = createVirtualDevice(queue);
+
+  // Device library expects cleared to zero heap memory
+  // @note: It must occur once per device, but runtime needs a queue to clear memory.
+  // Skip the clear if the queue creation failed or the device has no heap buffer, so neither
+  // a null queue nor a null heap object is dereferenced
+  if ((vgpu != nullptr) && (heap_buffer_ != nullptr)) {
+    auto HeapZeroOut = [this, vgpu]() {
+      uint64_t pattern = 0;
+      amd::Coord3D origin(0, 0, 0);
+      amd::Coord3D region(HeapBuffer()->size(), 1, 1);
+
+      // Force synchronization in the blit manager. Runtime has to make sure the clear is done
+      vgpu->blitMgr().enableSynchronization();
+      auto result = vgpu->blitMgr().fillBuffer(
+          *HeapBuffer(), &pattern, sizeof(pattern), region, origin, region);
+      // Disable synchronization in the blit manager on the queue
+      vgpu->blitMgr().disableSynchronization();
+      if (!result) {
+        LogError("Couldn't clear the device heap buffer!");
+      }
+    };
+    std::call_once(heap_initialized_, HeapZeroOut);
+  }
+
+  return vgpu;
+}
+
+// ================================================================================================
 void Device::addVACache(device::Memory* memory) const {
   // Make sure system memory has direct access
   if (memory->isHostMemDirectAccess()) {
diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp
index e3818b5c18..5f319c0852 100644
--- a/rocclr/device/device.hpp
+++ b/rocclr/device/device.hpp
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include <mutex>
 
 namespace amd {
 class Command;
@@ -1629,6 +1630,9 @@ class Device : public RuntimeObject {
   //! Return this device's type.
   cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); }
 
+  //! Creates a queue on device for submitting the commands
+  device::VirtualDevice* CreateDeviceQueue(CommandQueue* queue = nullptr);
+
   //! 
Create a new virtual device environment.
   virtual device::VirtualDevice* createVirtualDevice(CommandQueue* queue = NULL) = 0;
 
@@ -1638,8 +1642,8 @@ class Device : public RuntimeObject {
   //! Allocate a chunk of device memory as a cache for a CL memory object
   virtual device::Memory* createMemory(Memory& owner) const = 0;
 
-  //! Allocate a chunk of device memory without owner class
-  virtual device::Memory* createMemory(size_t size) const = 0;
+  //! Allocate a buffer of the given size, owned by an amd::Memory object in the internal context
+  amd::Memory* createPrivateBuffer(size_t size) const;
 
   //! Allocate a device sampler object
   virtual bool createSampler(const Sampler&, device::Sampler**) const = 0;
@@ -1895,7 +1899,7 @@ class Device : public RuntimeObject {
   Memory* P2PStage() const { return p2p_stage_; }
 
   //! Returns heap buffer object for device allocator
-  device::Memory* HeapBuffer() const { return heap_buffer_; }
+  device::Memory* HeapBuffer() const { return heap_buffer_ ? heap_buffer_->getDeviceMemory(*this) : nullptr; }
 
   //! Does this device allow P2P access?
   bool P2PAccessAllowed() const { return (p2p_access_devices_.size() > 0) ? 
true : false; }
@@ -1953,8 +1957,8 @@ class Device : public RuntimeObject {
   static amd::Monitor p2p_stage_ops_;  //!< Lock to serialise cache for the P2P resources
   static Memory* p2p_stage_;           //!< Staging resources
 
-  device::Memory* heap_buffer_;  //!< Preallocated heap buffer for memory allocations on device
-
+  std::once_flag heap_initialized_;  //!< Guards the one-time zero fill of the heap buffer
+  amd::Memory* heap_buffer_;         //!< Preallocated heap buffer for memory allocations on device
   amd::Memory* arena_mem_obj_;  //!< Arena memory object
 
  private:
diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp
index 0ad4745d4c..40acd56135 100644
--- a/rocclr/device/pal/paldevice.cpp
+++ b/rocclr/device/pal/paldevice.cpp
@@ -1154,13 +1154,14 @@ bool Device::initializeHeapResources() {
     xferQueue_->enableSyncedBlit();
     if (amd::IS_HIP) {
       // Allocate initial heap for device memory allocator
-      static constexpr size_t HeapBufferSize = 1024 * Ki;
-      heap_buffer_ = createMemory(HeapBufferSize);
+      constexpr size_t kHeapBufferSize = 128 * Ki;
+      heap_buffer_ = createPrivateBuffer(kHeapBufferSize);  // nullptr on failure; not checked here
     }
   }
   return true;
 }
 
+// ================================================================================================
 device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   bool profiling = false;
   uint rtCUs = amd::CommandQueue::RealTimeDisabled;
@@ -1190,14 +1191,15 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   }
 
   VirtualGPU* vgpu = new VirtualGPU(*this);
-  if (vgpu && vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) {
-    return vgpu;
-  } else {
+  if (nullptr == vgpu || !vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) {
     delete vgpu;
     return nullptr;
   }
+
+  return vgpu;
 }
 
+// ================================================================================================
 device::Program* Device::createProgram(amd::Program& owner, amd::option::Options* options) {
   device::Program* program;
   if 
(settings().useLightning_) {
@@ -1714,16 +1716,6 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
   return memory;
 }
 
-// ================================================================================================
-device::Memory* Device::createMemory(size_t size) const {
-  auto buffer = new pal::Memory(*this, size);
-  if ((buffer == nullptr) || !buffer->create(Resource::Local)) {
-    LogError("Couldn't allocate memory on device!");
-    return nullptr;
-  }
-  return buffer;
-}
-
 // ================================================================================================
 bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const {
   *sampler = nullptr;
diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp
index 36c3ee5b00..d8822b4d57 100644
--- a/rocclr/device/pal/paldevice.hpp
+++ b/rocclr/device/pal/paldevice.hpp
@@ -91,8 +91,6 @@ class NullDevice : public amd::Device {
 
   //! Just returns NULL for the dummy device
   virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; }
-  //! Just returns NULL for the dummy device
-  virtual device::Memory* createMemory(size_t size) const { return nullptr; }
 
   //! Sampler object allocation
   virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
@@ -371,8 +369,6 @@ class Device : public NullDevice {
   //! Memory allocation
   virtual device::Memory* createMemory(amd::Memory& owner  //!< abstraction layer memory object
                                        ) const;
-  virtual device::Memory* createMemory(size_t size  //!< Size of memory allocation
-                                       ) const;
 
   //! 
Sampler object allocation
   virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp
index b2391b9509..91b34a2014 100644
--- a/rocclr/device/rocm/rocdevice.cpp
+++ b/rocclr/device/rocm/rocdevice.cpp
@@ -788,8 +788,8 @@ bool Device::create() {
 
   if (amd::IS_HIP) {
     // Allocate initial heap for device memory allocator
-    static constexpr size_t HeapBufferSize = 1024 * Ki;
-    heap_buffer_ = createMemory(HeapBufferSize);
+    constexpr size_t kHeapBufferSize = 128 * Ki;
+    heap_buffer_ = createPrivateBuffer(kHeapBufferSize);  // nullptr on failure; not checked here
   }
 
   return true;
@@ -1659,6 +1659,7 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) {
   return virtualDevice;
 }
 
+// ================================================================================================
 bool Device::globalFreeMemory(size_t* freeMemory) const {
   const uint TotalFreeMemory = 0;
   const uint LargestFreeBlock = 1;
@@ -1673,6 +1674,7 @@ bool Device::globalFreeMemory(size_t* freeMemory) const {
   return true;
 }
 
+// ================================================================================================
 bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext,
                                 bool validateOnly) {
 #if defined(_WIN32)
@@ -1888,17 +1890,6 @@ device::Memory* Device::createMemory(amd::Memory& owner) const {
   return memory;
 }
 
-// ================================================================================================
-device::Memory* Device::createMemory(size_t size) const {
-  auto buffer = new roc::Buffer(*this, size);
-  static constexpr bool LocalAlloc = true;
-  if ((buffer == nullptr) || !buffer->create(LocalAlloc)) {
-    LogError("Couldn't allocate memory on device!");
-    return nullptr;
-  }
-  return buffer;
-}
-
 // ================================================================================================
 void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment 
mem_seg) const {
   void* ptr = nullptr;
diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp
index a6a6631710..06acdb2d44 100644
--- a/rocclr/device/rocm/rocdevice.hpp
+++ b/rocclr/device/rocm/rocdevice.hpp
@@ -163,10 +163,6 @@ class NullDevice : public amd::Device {
     ShouldNotReachHere();
     return nullptr;
   }
-  virtual device::Memory* createMemory(size_t size) const {
-    ShouldNotReachHere();
-    return nullptr;
-  }
 
   //! Sampler object allocation
   virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
@@ -385,7 +381,6 @@ class Device : public NullDevice {
   virtual device::Program* createProgram(amd::Program& owner,
                                          amd::option::Options* options = nullptr);
   virtual device::Memory* createMemory(amd::Memory& owner) const;
-  virtual device::Memory* createMemory(size_t size) const;
 
   //! Sampler object allocation
   virtual bool createSampler(const amd::Sampler& owner,  //!< abstraction layer sampler object
diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp
index f1e09a35c3..6038cb0ef4 100644
--- a/rocclr/device/rocm/rocmemory.cpp
+++ b/rocclr/device/rocm/rocmemory.cpp
@@ -883,7 +883,8 @@ bool Buffer::create(bool alloc_local) {
         const_cast(dev()).updateFreeMemory(size(), false);
       }
     }
-    else {
+    // Hide private allocations (owner lives in the device's own context) from memory tracking
+    else if (owner()->getContext() != dev().context()) {  // NOTE(review): assumes owner() != nullptr here - confirm
       const_cast(dev()).updateFreeMemory(size(), false);
     }
 
diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp
index 7a8fee1f6f..265fd2cc8b 100644
--- a/rocclr/platform/commandqueue.cpp
+++ b/rocclr/platform/commandqueue.cpp
@@ -284,7 +284,7 @@ bool DeviceQueue::create() {
   const bool defaultDeviceQueue = properties().test(CL_QUEUE_ON_DEVICE_DEFAULT);
   bool result = false;
 
-  virtualDevice_ = device().createVirtualDevice(this);
+  virtualDevice_ = device().CreateDeviceQueue(this);  // also clears the device heap once per device
   if (virtualDevice_ != NULL) {
     result = true;
     context().addDeviceQueue(device(), this, 
defaultDeviceQueue);
diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp
index 863f8ecac3..fe4dc937dc 100644
--- a/rocclr/platform/commandqueue.hpp
+++ b/rocclr/platform/commandqueue.hpp
@@ -167,7 +167,7 @@ class HostQueue : public CommandQueue {
     //! The command queue thread entry point.
     void run(void* data) {
       HostQueue* queue = static_cast(data);
-      virtualDevice_ = queue->device().createVirtualDevice(queue);
+      virtualDevice_ = queue->device().CreateDeviceQueue(queue);  // also clears the device heap once per device
       if (virtualDevice_ != NULL) {
         queue->loop(virtualDevice_);
         Release();
@@ -178,7 +178,7 @@ class HostQueue : public CommandQueue {
     }
 
     void Init(HostQueue* queue) {
-      virtualDevice_ = queue->device().createVirtualDevice(queue);
+      virtualDevice_ = queue->device().CreateDeviceQueue(queue);  // also clears the device heap once per device
       if (virtualDevice_ != nullptr) {
         acceptingCommands_ = true;
       }