// Reviewer preamble (sits before the first `diff --git` header; every patch byte below is unchanged).
//
// What this patch does:
//  - device/device.{cpp,hpp}: adds the Device member `heap_buffer_` (set to nullptr in the constructor,
//    deleted in the destructor, exposed through `HeapBuffer()`), and a new pure-virtual
//    `createMemory(size_t size)`. `Device::create()` calls it with HeapBufferSize = 1024 * Ki,
//    only when `amd::IS_HIP` is true. Copyright year bumped to 2022.
//  - device/devkernel.hpp: inserts `HiddenHeap = 15` into the KernelParameterDescriptor enum, moving
//    HiddenHostcallBuffer..HiddenQueuePtr up by one (HiddenLast goes from 29 to 30), and maps the
//    string "hidden_heap" to it in ArgValueKindV3.
//  - device/pal/paldevice.{cpp,hpp}: `Device::createMemory(size_t)` constructs a pal::Memory and calls
//    `create(Resource::Local)`; the NullDevice override returns nullptr.
//  - device/rocm/rocdevice.{cpp,hpp}: `Device::createMemory(size_t)` constructs a roc::Buffer and calls
//    `create(true)`; the NullDevice override calls ShouldNotReachHere() and returns nullptr.
//    One blank line is removed in populateOCLDeviceConstants(); copyright year bumped to 2022.
//  - device/rocm/rocmemory.{cpp,hpp}: `create()` gains a bool parameter defaulting to false. When
//    owner() is nullptr, Buffer::create uses `dev().deviceLocalAlloc(size())` if the flag is true and
//    the previous `dev().hostAlloc(size(), 1, kNoAtomics)` otherwise. Image::create takes the
//    parameter; the visible hunk does not show it being read.
//  - device/pal/palkernel.cpp, device/rocm/rocvirtual.cpp: for a HiddenHeap argument, if HeapBuffer()
//    is non-null, `HeapBuffer()->virtualAddress()` is written at it.offset_ via WriteAqlArgAt.
//    The PAL path additionally passes the buffer to `gpu.addVmMemory()`.
//
// NOTE(review): both new createMemory(size_t) bodies return nullptr when create() fails without
//   deleting `buffer` -- looks like a leak; confirm and add `delete buffer;` before the return.
// NOTE(review): `buffer == nullptr` is tested after a plain `new` -- presumably unreachable unless
//   the build replaces operator new; confirm.
// NOTE(review): Device::create() still returns true when createMemory() yields nullptr. The
//   HiddenHeap cases skip the write in that situation, so this looks like deliberate best-effort --
//   confirm that a kernel expecting hidden_heap tolerates an unwritten argument.
// NOTE(review): Buffer::create sets HostMemoryDirectAccess in the device-local branch as well as
//   the host branch -- verify that flag is intended for device-local memory.
// NOTE(review): the new parameter is `local_alloc` in rocmemory.hpp but `alloc_local` in
//   rocmemory.cpp; `create()` is virtual and now has a default argument, which is selected by the
//   static type at the call site.
// NOTE(review): in this copy the original line breaks are missing, and `static_cast(`,
//   `reinterpret_cast(` and `std::map ArgValueKindV3` appear without template arguments -- looks
//   like extraction damage; regenerate the patch from the commit before applying.
diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index eaa7a97f14..8c6b16cd47 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc. +/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -458,6 +458,7 @@ Device::Device() blitProgram_(nullptr), hwDebugMgr_(nullptr), context_(nullptr), + heap_buffer_(nullptr), arena_mem_obj_(nullptr), vaCacheAccess_(nullptr), vaCacheMap_(nullptr), @@ -471,6 +472,11 @@ Device::~Device() { delete vaCacheMap_; } + if (heap_buffer_ != nullptr) { + delete heap_buffer_; + heap_buffer_ = nullptr; + } + delete vaCacheAccess_; if (arena_mem_obj_ != nullptr) { @@ -517,6 +523,11 @@ bool Device::create(const Isa &isa) { if (nullptr == vaCacheMap_) { return false; } + if (amd::IS_HIP) { + // Allocate initial heap for device memory allocator + static constexpr size_t HeapBufferSize = 1024 * Ki; + heap_buffer_ = createMemory(HeapBufferSize); + } return true; } diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index f36ef43ec5..adb2aaf7d3 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -1625,6 +1625,9 @@ class Device : public RuntimeObject { //! Allocate a chunk of device memory as a cache for a CL memory object virtual device::Memory* createMemory(Memory& owner) const = 0; + //! Allocate a chunk of device memory without owner class + virtual device::Memory* createMemory(size_t size) const = 0; + //! Allocate a device sampler object virtual bool createSampler(const Sampler&, device::Sampler**) const = 0; @@ -1852,6 +1855,9 @@ class Device : public RuntimeObject { //! Staging buffer for P2P transfer Memory* P2PStage() const { return p2p_stage_; } + //! 
Returns heap buffer object for device allocator + device::Memory* HeapBuffer() const { return heap_buffer_; } + //! Does this device allow P2P access? bool P2PAccessAllowed() const { return (p2p_access_devices_.size() > 0) ? true : false; } @@ -1908,7 +1914,9 @@ class Device : public RuntimeObject { static amd::Monitor p2p_stage_ops_; //!< Lock to serialise cache for the P2P resources static Memory* p2p_stage_; //!< Staging resources - amd::Memory* arena_mem_obj_; //!< Arena memory object + device::Memory* heap_buffer_; //!< Preallocated heap buffer for memory allocations on device + + amd::Memory* arena_mem_obj_; //!< Arena memory object private: const Isa *isa_; //!< Device isa diff --git a/rocclr/device/devkernel.hpp b/rocclr/device/devkernel.hpp index f762e71654..f8ebde53cb 100644 --- a/rocclr/device/devkernel.hpp +++ b/rocclr/device/devkernel.hpp @@ -50,21 +50,22 @@ struct KernelParameterDescriptor { HiddenDefaultQueue = 12, HiddenCompletionAction = 13, HiddenMultiGridSync = 14, - HiddenHostcallBuffer = 15, - HiddenBlockCountX = 16, - HiddenBlockCountY = 17, - HiddenBlockCountZ = 18, - HiddenGroupSizeX = 19, - HiddenGroupSizeY = 20, - HiddenGroupSizeZ = 21, - HiddenRemainderX = 22, - HiddenRemainderY = 23, - HiddenRemainderZ = 24, - HiddenGridDims = 25, - HiddenPrivateBase = 26, - HiddenSharedBase = 27, - HiddenQueuePtr = 28, - HiddenLast = 29 + HiddenHeap = 15, + HiddenHostcallBuffer = 16, + HiddenBlockCountX = 17, + HiddenBlockCountY = 18, + HiddenBlockCountZ = 19, + HiddenGroupSizeX = 20, + HiddenGroupSizeY = 21, + HiddenGroupSizeZ = 22, + HiddenRemainderX = 23, + HiddenRemainderY = 24, + HiddenRemainderZ = 25, + HiddenGridDims = 26, + HiddenPrivateBase = 27, + HiddenSharedBase = 28, + HiddenQueuePtr = 29, + HiddenLast = 30 }; clk_value_type_t type_; //!< The parameter's type size_t offset_; //!< Its offset in the parameter's stack @@ -276,6 +277,7 @@ static const std::map ArgValueKindV3 = { {"hidden_default_queue", 
amd::KernelParameterDescriptor::HiddenDefaultQueue}, {"hidden_completion_action", amd::KernelParameterDescriptor::HiddenCompletionAction}, {"hidden_multigrid_sync_arg", amd::KernelParameterDescriptor::HiddenMultiGridSync}, + {"hidden_heap", amd::KernelParameterDescriptor::HiddenHeap}, {"hidden_hostcall_buffer", amd::KernelParameterDescriptor::HiddenHostcallBuffer}, {"hidden_block_count_x", amd::KernelParameterDescriptor::HiddenBlockCountX}, {"hidden_block_count_y", amd::KernelParameterDescriptor::HiddenBlockCountY}, diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 53f4367d52..e0f7cb6669 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -1675,7 +1675,7 @@ pal::Memory* Device::createImage(amd::Memory& owner, bool directAccess) const { return gpuImage; } -//! Allocates cache memory on the card +// ================================================================================================ device::Memory* Device::createMemory(amd::Memory& owner) const { bool directAccess = false; pal::Memory* memory = nullptr; @@ -1706,6 +1706,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { return memory; } +// ================================================================================================ +device::Memory* Device::createMemory(size_t size) const { + auto buffer = new pal::Memory(*this, size); + if ((buffer == nullptr) || !buffer->create(Resource::Local)) { + LogError("Couldn't allocate memory on device!"); + return nullptr; + } + return buffer; +} + +// ================================================================================================ bool Device::createSampler(const amd::Sampler& owner, device::Sampler** sampler) const { *sampler = nullptr; Sampler* gpuSampler = new Sampler(*this); diff --git a/rocclr/device/pal/paldevice.hpp b/rocclr/device/pal/paldevice.hpp index 90f9cb4f4b..c128a1bd3a 100644 --- a/rocclr/device/pal/paldevice.hpp +++ 
b/rocclr/device/pal/paldevice.hpp @@ -90,6 +90,8 @@ class NullDevice : public amd::Device { //! Just returns NULL for the dummy device virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; } + //! Just returns NULL for the dummy device + virtual device::Memory* createMemory(size_t size) const { return nullptr; } //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object @@ -365,6 +367,8 @@ class Device : public NullDevice { //! Memory allocation virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object ) const; + virtual device::Memory* createMemory(size_t size //!< Size of memory allocation + ) const; //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object diff --git a/rocclr/device/pal/palkernel.cpp b/rocclr/device/pal/palkernel.cpp index e1c748cb38..9c4729c71f 100644 --- a/rocclr/device/pal/palkernel.cpp +++ b/rocclr/device/pal/palkernel.cpp @@ -358,6 +358,14 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const break; case amd::KernelParameterDescriptor::HiddenMultiGridSync: break; + case amd::KernelParameterDescriptor::HiddenHeap: + if (gpu.dev().HeapBuffer() != nullptr) { + // Add heap pointer to the code + size_t heap_ptr = static_cast(gpu.dev().HeapBuffer()->virtualAddress()); + gpu.addVmMemory(reinterpret_cast(gpu.dev().HeapBuffer())); + WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_); + } + break; case amd::KernelParameterDescriptor::HiddenBlockCountX: WriteAqlArgAt(hidden_arguments, static_cast(global[0] / local[0]), it.size_, it.offset_); diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index da1c127b97..31336e9447 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -1,4 +1,4 @@ -/* Copyright (c) 2008 - 2021 Advanced Micro Devices, Inc. 
+/* Copyright (c) 2008 - 2022 Advanced Micro Devices, Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -1086,7 +1086,6 @@ bool Device::populateOCLDeviceConstants() { info_.uuid_[i] = unique_id[i+4]; } } - if (HSA_STATUS_SUCCESS != hsa_agent_get_info(_bkendDevice, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COOPERATIVE_COMPUTE_UNIT_COUNT, @@ -1867,6 +1866,17 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { return memory; } +// ================================================================================================ +device::Memory* Device::createMemory(size_t size) const { + auto buffer = new roc::Buffer(*this, size); + static constexpr bool LocalAlloc = true; + if ((buffer == nullptr) || !buffer->create(LocalAlloc)) { + LogError("Couldn't allocate memory on device!"); + return nullptr; + } + return buffer; +} + // ================================================================================================ void* Device::hostAlloc(size_t size, size_t alignment, MemorySegment mem_seg) const { void* ptr = nullptr; diff --git a/rocclr/device/rocm/rocdevice.hpp b/rocclr/device/rocm/rocdevice.hpp index ce98deb2b4..07e461ab13 100644 --- a/rocclr/device/rocm/rocdevice.hpp +++ b/rocclr/device/rocm/rocdevice.hpp @@ -163,6 +163,10 @@ class NullDevice : public amd::Device { ShouldNotReachHere(); return nullptr; } + virtual device::Memory* createMemory(size_t size) const { + ShouldNotReachHere(); + return nullptr; + } //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object @@ -366,6 +370,7 @@ class Device : public NullDevice { virtual device::Program* createProgram(amd::Program& owner, amd::option::Options* options = nullptr); virtual device::Memory* createMemory(amd::Memory& owner) const; + virtual device::Memory* createMemory(size_t size) const; //! 
Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 544d63116d..f1e09a35c3 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -708,12 +708,20 @@ void Buffer::destroy() { } // ================================================================================================ -bool Buffer::create() { +bool Buffer::create(bool alloc_local) { if (owner() == nullptr) { - deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); - if (deviceMemory_ != nullptr) { - flags_ |= HostMemoryDirectAccess; - return true; + if (alloc_local) { + deviceMemory_ = dev().deviceLocalAlloc(size()); + if (deviceMemory_ != nullptr) { + flags_ |= HostMemoryDirectAccess; + return true; + } + } else { + deviceMemory_ = dev().hostAlloc(size(), 1, Device::MemorySegment::kNoAtomics); + if (deviceMemory_ != nullptr) { + flags_ |= HostMemoryDirectAccess; + return true; + } } return false; } @@ -1120,7 +1128,7 @@ bool Image::createInteropImage() { return true; } -bool Image::create() { +bool Image::create(bool alloc_local) { if (owner()->parent() != nullptr) { if (!ValidateMemory()) { return false; diff --git a/rocclr/device/rocm/rocmemory.hpp b/rocclr/device/rocm/rocmemory.hpp index a9b558d0d8..bcffcf3de1 100644 --- a/rocclr/device/rocm/rocmemory.hpp +++ b/rocclr/device/rocm/rocmemory.hpp @@ -60,7 +60,7 @@ class Memory : public device::Memory { uint mapFlags, size_t* rowPitch, size_t* slicePitch) override; // Create device memory according to OpenCL memory flag. - virtual bool create() = 0; + virtual bool create(bool local_alloc = false) = 0; // Pins system memory associated with this memory object. bool pinSystemMemory(void* hostPtr, // System memory address @@ -169,7 +169,7 @@ class Buffer : public roc::Memory { virtual ~Buffer(); // Create device memory according to OpenCL memory flag. 
- virtual bool create(); + virtual bool create(bool local_alloc = false); // Recreate the device memory using new size and alignment. bool recreate(size_t newSize, size_t newAlignment, bool forceSystem); @@ -198,7 +198,7 @@ class Image : public roc::Memory { virtual ~Image(); //! Create device memory according to OpenCL memory flag. - virtual bool create(); + virtual bool create(bool local_alloc = false); //! Create an image view bool createView(const Memory& parent); diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 4b002cd7d2..c23452612e 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2829,6 +2829,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, const WriteAqlArgAt(hidden_arguments, gridSync, it.size_, it.offset_); break; } + case amd::KernelParameterDescriptor::HiddenHeap: + if (dev().HeapBuffer() != nullptr) { + // Add heap pointer to the code + size_t heap_ptr = static_cast(dev().HeapBuffer()->virtualAddress()); + WriteAqlArgAt(hidden_arguments, heap_ptr, it.size_, it.offset_); + } + break; case amd::KernelParameterDescriptor::HiddenBlockCountX: WriteAqlArgAt(hidden_arguments, static_cast(newGlobalSize[0] / local[0]), it.size_, it.offset_);