From 07c1b9a9984e38fee27e223be67ddf9fcd42b343 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Wed, 18 May 2022 12:57:55 -0400 Subject: [PATCH] SWDEV-336024 - Clear device heap to 0 This reverts commit 04bfd935697aefe50428d35c07f014487a4977e0. Reason for revert: Fix regressions Change-Id: I7d883e1c3cbd27bb64b581ec800243ad7dfe24fd --- rocclr/device/blit.hpp | 3 -- rocclr/device/device.cpp | 50 +------------------------------- rocclr/device/device.hpp | 16 ++++------ rocclr/device/pal/paldevice.cpp | 22 +++++++++----- rocclr/device/pal/paldevice.hpp | 4 +++ rocclr/device/rocm/rocdevice.cpp | 17 ++++++++--- rocclr/device/rocm/rocdevice.hpp | 5 ++++ rocclr/device/rocm/rocmemory.cpp | 3 +- rocclr/platform/commandqueue.cpp | 2 +- rocclr/platform/commandqueue.hpp | 4 +-- 10 files changed, 47 insertions(+), 79 deletions(-) diff --git a/rocclr/device/blit.hpp b/rocclr/device/blit.hpp index 3019640352..794bfa15ca 100644 --- a/rocclr/device/blit.hpp +++ b/rocclr/device/blit.hpp @@ -196,9 +196,6 @@ class BlitManager : public amd::HeapObject { //! Enables synchronization on blit operations void enableSynchronization() { syncOperation_ = true; } - //! Disables synchronization on blit operations - void disableSynchronization() { syncOperation_ = false; } - //! Returns Xfer queue lock virtual amd::Monitor* lockXfer() const { return nullptr; } diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index f6417cfaf9..c5791b49e7 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -22,7 +22,6 @@ #include "thread/monitor.hpp" #include "utils/options.hpp" #include "comgrctx.hpp" -#include "blit.hpp" #include #include @@ -473,7 +472,7 @@ Device::~Device() { } if (heap_buffer_ != nullptr) { - heap_buffer_->release(); + delete heap_buffer_; heap_buffer_ = nullptr; } @@ -512,25 +511,6 @@ bool Device::ValidateHsail() { return true; } -// ================================================================================================ -amd::Memory* Device::createPrivateBuffer(size_t size) const { - auto buffer = new (context()) amd::Buffer(context(), CL_MEM_READ_WRITE, size); - - if ((nullptr != buffer) && !buffer->create(nullptr)) { - buffer->release(); - LogError("Couldn't allocate internal buffer on device!"); - return nullptr; - } - - if (nullptr == buffer->getDeviceMemory(*this)) { - LogError("Couldn't allocate internal buffer on device!"); - return nullptr; - } - - return buffer; -} - -// ================================================================================================ bool Device::create(const Isa &isa) { assert(!vaCacheAccess_ && !vaCacheMap_); isa_ = &isa; @@ -545,7 +525,6 @@ bool Device::create(const Isa &isa) { return true; } -// ================================================================================================ void Device::registerDevice() { assert(Runtime::singleThreaded() && "this is not thread-safe"); @@ -570,33 +549,6 @@ void Device::registerDevice() { devices_->push_back(this); } -// ================================================================================================ -device::VirtualDevice* Device::CreateDeviceQueue(CommandQueue* queue) { - auto vgpu = createVirtualDevice(queue); - - // Device library expects cleared to zero heap memory - // @note: It must occur once per device, but runtime needs a queue to clear memory - if (HeapBuffer() != nullptr) { - auto HeapZeroOut = [this, vgpu]()->bool { - uint64_t pattern = 0; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(HeapBuffer()->size(), 1, 1); - - // Force synchronization in the blit manager. Runtime has to make sure the clear is done - vgpu->blitMgr().enableSynchronization(); - auto result = vgpu->blitMgr().fillBuffer( - *HeapBuffer(), &pattern, sizeof(pattern), region, origin, region); - // Disable synchronization in the blit manager on the queue - vgpu->blitMgr().disableSynchronization(); - return result; - }; - std::call_once(heap_initialized_, HeapZeroOut); - } - - return vgpu; -} - -// ================================================================================================ void Device::addVACache(device::Memory* memory) const { // Make sure system memory has direct access if (memory->isHostMemDirectAccess()) { diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index 48510d1eef..9d2c0ea28c 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -57,7 +57,6 @@ #include #include #include -#include namespace amd { class Command; @@ -1630,9 +1629,6 @@ class Device : public RuntimeObject { //! Return this device's type. cl_device_type type() const { return info().type_ & ~(CL_DEVICE_TYPE_DEFAULT); } - //! Creates a queue on device for submitting the commands - device::VirtualDevice* CreateDeviceQueue(CommandQueue* queue = nullptr); - //! Create a new virtual device environment. virtual device::VirtualDevice* createVirtualDevice(CommandQueue* queue = NULL) = 0; @@ -1642,8 +1638,8 @@ class Device : public RuntimeObject { //! Allocate a chunk of device memory as a cache for a CL memory object virtual device::Memory* createMemory(Memory& owner) const = 0; - //! Allocate a chunk of device memory with the owner class in the internal context - amd::Memory* createPrivateBuffer(size_t size) const; + //! Allocate a chunk of device memory without owner class + virtual device::Memory* createMemory(size_t size) const = 0; //! Allocate a device sampler object virtual bool createSampler(const Sampler&, device::Sampler**) const = 0; @@ -1899,9 +1895,7 @@ class Device : public RuntimeObject { Memory* P2PStage() const { return p2p_stage_; } //! Returns heap buffer object for device allocator - device::Memory* HeapBuffer() const { - return (heap_buffer_ != nullptr) ? heap_buffer_->getDeviceMemory(*this) : nullptr; - } + device::Memory* HeapBuffer() const { return heap_buffer_; } //! Returns stack size set for the device uint64_t StackSize() const { return stack_size_; } @@ -1965,8 +1959,8 @@ class Device : public RuntimeObject { static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources static Memory* p2p_stage_; //!< Staging resources - std::once_flag heap_initialized_; //!< Heap buffer initialization flag - amd::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device + device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device + amd::Memory* arena_mem_obj_; //!< Arena memory object uint64_t stack_size_{0}; //!< Device stack size diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 4b3dfdd2bd..7e66b51d64 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -1154,14 +1154,13 @@ bool Device::initializeHeapResources() { xferQueue_->enableSyncedBlit(); if (amd::IS_HIP) { // Allocate initial heap for device memory allocator - constexpr size_t kHeapBufferSize = 128 * Ki; - heap_buffer_ = createPrivateBuffer(kHeapBufferSize); + static constexpr size_t HeapBufferSize = 1024 * Ki; + heap_buffer_ = createMemory(HeapBufferSize); } } return true; } -// ================================================================================================ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { bool profiling = false; uint rtCUs = amd::CommandQueue::RealTimeDisabled; @@ -1191,15 +1190,14 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { } VirtualGPU* vgpu = new VirtualGPU(*this); - if (nullptr == vgpu || !vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) { + if (vgpu && vgpu->create(profiling, deviceQueueSize, rtCUs, queue->priority())) { + return vgpu; + } else { delete vgpu; return nullptr; } - - return vgpu; } -// ================================================================================================ device::Program* Device::createProgram(amd::Program& owner, amd::option::Options* options) { device::Program* program; if (settings().useLightning_) { @@ -1717,6 +1715,16 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { return memory; } +// ================================================================================================ +device::Memory* Device::createMemory(size_t size) const { + auto buffer = new pal::Memory(*this, size); + if ((buffer == nullptr) || !buffer->create(Resource::Local)) { + LogError("Couldn't allocate memory on device!"); + return nullptr; + } + return buffer; +} + // ================================================================================================ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const { *sampler = nullptr; diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index d8822b4d57..36c3ee5b00 100644 --- a/rocclr/device/pal/paldevice.hpp +++ b/rocclr/device/pal/paldevice.hpp @@ -91,6 +91,8 @@ class NullDevice : public amd::Device { //! Just returns NULL for the dummy device virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; } + //! Just returns NULL for the dummy device + virtual device::Memory* createMemory(size_t size) const { return nullptr; } //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object @@ -369,6 +371,8 @@ class Device : public NullDevice { //! Memory allocation virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object ) const; + virtual device::Memory* createMemory(size_t size //!< Size of memory allocation + ) const; //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 91b34a2014..b2391b9509 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -788,8 +788,8 @@ bool Device::create() { if (amd::IS_HIP) { // Allocate initial heap for device memory allocator - constexpr size_t kHeapBufferSize = 128 * Ki; - heap_buffer_ = createPrivateBuffer(kHeapBufferSize); + static constexpr size_t HeapBufferSize = 1024 * Ki; + heap_buffer_ = createMemory(HeapBufferSize); } return true; @@ -1659,7 +1659,6 @@ device::VirtualDevice* Device::createVirtualDevice(amd::CommandQueue* queue) { return virtualDevice; } -// ================================================================================================ bool Device::globalFreeMemory(size_t* freeMemory) const { const uint TotalFreeMemory = 0; const uint LargestFreeBlock = 1; @@ -1674,7 +1673,6 @@ bool Device::globalFreeMemory(size_t* freeMemory) const { return true; } -// ================================================================================================ bool Device::bindExternalDevice(uint flags, void* const gfxDevice[], void* gfxContext, bool validateOnly) { #if defined(_WIN32) @@ -1890,6 +1888,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { return memory; } +// ================================================================================================ +device::Memory* Device::createMemory(size_t size) const { + auto buffer = new roc::Buffer(*this, size); + static constexpr bool LocalAlloc = true; + if ((buffer == nullptr) || !buffer->create(LocalAlloc)) { + LogError("Couldn't allocate memory on device!"); + return nullptr; + } + return buffer; +} + // ================================================================================================ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { void* ptr = nullptr; diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index 06acdb2d44..a6a6631710 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -163,6 +163,10 @@ class NullDevice : public amd::Device { ShouldNotReachHere(); return nullptr; } + virtual device::Memory* createMemory(size_t size) const { + ShouldNotReachHere(); + return nullptr; + } //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object @@ -381,6 +385,7 @@ class Device : public NullDevice { virtual device::Program* createProgram(amd::Program& owner, amd::option::Options* options = nullptr); virtual device::Memory* createMemory(amd::Memory& owner) const; + virtual device::Memory* createMemory(size_t size) const; //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 6038cb0ef4..f1e09a35c3 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -883,8 +883,7 @@ bool Buffer::create(bool alloc_local) { const_cast(dev()).updateFreeMemory(size(), false); } } - // Hide private allocations from memory tracking - else if (owner()->getContext() != dev().context()) { + else { const_cast(dev()).updateFreeMemory(size(), false); } diff --git a/rocclr/platform/commandqueue.cpp b/rocclr/platform/commandqueue.cpp index 265fd2cc8b..7a8fee1f6f 100644 --- a/rocclr/platform/commandqueue.cpp +++ b/rocclr/platform/commandqueue.cpp @@ -284,7 +284,7 @@ bool DeviceQueue::create() { const bool defaultDeviceQueue = properties().test(CL_QUEUE_ON_DEVICE_DEFAULT); bool result = false; - virtualDevice_ = device().CreateDeviceQueue(this); + virtualDevice_ = device().createVirtualDevice(this); if (virtualDevice_ != NULL) { result = true; context().addDeviceQueue(device(), this, defaultDeviceQueue); diff --git a/rocclr/platform/commandqueue.hpp b/rocclr/platform/commandqueue.hpp index fe4dc937dc..863f8ecac3 100644 --- a/rocclr/platform/commandqueue.hpp +++ b/rocclr/platform/commandqueue.hpp @@ -167,7 +167,7 @@ class HostQueue : public CommandQueue { //! The command queue thread entry point. void run(void* data) { HostQueue* queue = static_cast(data); - virtualDevice_ = queue->device().CreateDeviceQueue(queue); + virtualDevice_ = queue->device().createVirtualDevice(queue); if (virtualDevice_ != NULL) { queue->loop(virtualDevice_); Release(); @@ -178,7 +178,7 @@ class HostQueue : public CommandQueue { } void Init(HostQueue* queue) { - virtualDevice_ = queue->device().CreateDeviceQueue(queue); + virtualDevice_ = queue->device().createVirtualDevice(queue); if (virtualDevice_ != nullptr) { acceptingCommands_ = true; }