diff --git a/projects/clr/rocclr/device/devhostcall.cpp b/projects/clr/rocclr/device/devhostcall.cpp index e385047c51..51b2d259b0 100644 --- a/projects/clr/rocclr/device/devhostcall.cpp +++ b/projects/clr/rocclr/device/devhostcall.cpp @@ -102,7 +102,8 @@ static void handlePayload(MessageHandler& messages, uint32_t service, uint64_t* } } else { amd::Context& ctx = dev.context(); - amd::Buffer* buf = new(ctx) amd::Buffer(ctx, CL_MEM_READ_WRITE, payload[1]); + amd::Buffer* buf = new(ctx) amd::Buffer(ctx, CL_MEM_READ_WRITE, payload[1], NULL, + (payload[1] == 2 * Mi) ? 2 * Mi : 0); uint64_t va = 0; if (buf) { if (buf->create()) { diff --git a/projects/clr/rocclr/device/device.hpp b/projects/clr/rocclr/device/device.hpp index ed04665560..bc08ea9775 100644 --- a/projects/clr/rocclr/device/device.hpp +++ b/projects/clr/rocclr/device/device.hpp @@ -1750,8 +1750,8 @@ class Device : public RuntimeObject { //! Allocate a chunk of device memory as a cache for a CL memory object virtual device::Memory* createMemory(Memory& owner) const = 0; - //! Allocate a chunk of device memory without owner class - virtual device::Memory* createMemory(size_t size) const = 0; + //! Allocate a chunk of device memory with address alignment + virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const = 0; //! 
Allocate a device sampler object virtual bool createSampler(const Sampler&, device::Sampler**) const = 0; diff --git a/projects/clr/rocclr/device/pal/paldevice.cpp b/projects/clr/rocclr/device/pal/paldevice.cpp index ecc64a215b..cf302053e2 100644 --- a/projects/clr/rocclr/device/pal/paldevice.cpp +++ b/projects/clr/rocclr/device/pal/paldevice.cpp @@ -1687,6 +1687,7 @@ pal::Memory* Device::createBuffer(amd::Memory& owner, bool directAccess) const { if (owner.ipcShared()) { type = Resource::IpcMemory; } + params.alignment_ = owner.getAlignment(); // Create memory object result = gpuMemory->create(type, &params); @@ -1887,9 +1888,11 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { } // ================================================================================================ -device::Memory* Device::createMemory(size_t size) const { +device::Memory* Device::createMemory(size_t size, size_t alignment) const { auto buffer = new pal::Memory(*this, size); - if ((buffer == nullptr) || !buffer->create(Resource::Local)) { + Resource::CreateParams params {}; + params.alignment_ = alignment; + if ((buffer == nullptr) || !buffer->create(Resource::Local, &params)) { LogError("Couldn't allocate memory on device!"); return nullptr; } @@ -2607,7 +2610,7 @@ void Device::HiddenHeapAlloc(const VirtualGPU& gpu) { heap_buffer_ = createMemory(HeapBufferSize); if (initial_heap_size_ != 0) { initial_heap_size_ = amd::alignUp(initial_heap_size_, 2 * Mi); - initial_heap_buffer_ = createMemory(initial_heap_size_); + initial_heap_buffer_ = createMemory(initial_heap_size_, 2 * Mi); } if (heap_buffer_ == nullptr) { LogError("Heap buffer allocation failed!"); diff --git a/projects/clr/rocclr/device/pal/paldevice.hpp b/projects/clr/rocclr/device/pal/paldevice.hpp index f7104a1c56..3c99abc0e9 100644 --- a/projects/clr/rocclr/device/pal/paldevice.hpp +++ b/projects/clr/rocclr/device/pal/paldevice.hpp @@ -92,8 +92,7 @@ class NullDevice : public amd::Device { //! 
Just returns NULL for the dummy device virtual device::Memory* createMemory(amd::Memory& owner) const { return nullptr; } //! Just returns NULL for the dummy device - virtual device::Memory* createMemory(size_t size) const { return nullptr; } - + virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const { return nullptr; } //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object device::Sampler** sampler //!< device sampler object @@ -404,9 +403,7 @@ class Device : public NullDevice { //! Memory allocation virtual device::Memory* createMemory(amd::Memory& owner //!< abstraction layer memory object ) const; - virtual device::Memory* createMemory(size_t size //!< Size of memory allocation - ) const; - + virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const; //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object device::Sampler** sampler //!< device sampler object diff --git a/projects/clr/rocclr/device/pal/palresource.cpp b/projects/clr/rocclr/device/pal/palresource.cpp index ab931a99fd..3bc5d64038 100644 --- a/projects/clr/rocclr/device/pal/palresource.cpp +++ b/projects/clr/rocclr/device/pal/palresource.cpp @@ -1323,7 +1323,9 @@ bool Resource::create(MemoryType memType, CreateParams* params, bool forceLinear Pal::GpuMemoryCreateInfo createInfo = {}; createInfo.size = desc().width_ * elementSize_; createInfo.size = amd::alignUp(createInfo.size, MaxGpuAlignment); - createInfo.alignment = desc().scratch_ ? 64 * Ki : MaxGpuAlignment; + createInfo.alignment = (params && params->alignment_ != 0) + ? params->alignment_ + : (desc().scratch_ ? 
64 * Ki : MaxGpuAlignment); createInfo.vaRange = Pal::VaRange::Default; createInfo.priority = Pal::GpuMemPriority::Normal; diff --git a/projects/clr/rocclr/device/pal/palresource.hpp b/projects/clr/rocclr/device/pal/palresource.hpp index 7f6f55c3db..7d1fb37e43 100644 --- a/projects/clr/rocclr/device/pal/palresource.hpp +++ b/projects/clr/rocclr/device/pal/palresource.hpp @@ -105,7 +105,9 @@ class Resource : public amd::HeapObject { VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues const Resource* svmBase_; //!< SVM base for MGPU allocations bool interprocess_; //!< Ressource can be used in the interprocess communication - CreateParams() : owner_(nullptr), gpu_(nullptr), svmBase_(nullptr), interprocess_(false) {} + size_t alignment_; //!< allocation address alignment + CreateParams() : owner_(nullptr), gpu_(nullptr), svmBase_(nullptr), interprocess_(false), + alignment_(0) {} }; struct PinnedParams : public CreateParams { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.cpp b/projects/clr/rocclr/device/rocm/rocdevice.cpp index 0ecb1e7c59..9ab3403f2a 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.cpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.cpp @@ -2044,7 +2044,7 @@ device::Memory* Device::createMemory(amd::Memory& owner) const { } // ================================================================================================ -device::Memory* Device::createMemory(size_t size) const { +device::Memory* Device::createMemory(size_t size, size_t alignment) const { auto buffer = new roc::Buffer(*this, size); static constexpr bool LocalAlloc = true; if ((buffer == nullptr) || !buffer->create(LocalAlloc)) { diff --git a/projects/clr/rocclr/device/rocm/rocdevice.hpp b/projects/clr/rocclr/device/rocm/rocdevice.hpp index 8980d98530..d74600f614 100644 --- a/projects/clr/rocclr/device/rocm/rocdevice.hpp +++ b/projects/clr/rocclr/device/rocm/rocdevice.hpp @@ -178,11 +178,10 @@ class NullDevice : public amd::Device { 
ShouldNotReachHere(); return nullptr; } - device::Memory* createMemory(size_t size) const override { + device::Memory* createMemory(size_t size, size_t alignment = 0) const override { ShouldNotReachHere(); return nullptr; } - //! Sampler object allocation bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object device::Sampler** sampler //!< device sampler object @@ -369,8 +368,7 @@ class Device : public NullDevice { virtual device::Program* createProgram(amd::Program& owner, amd::option::Options* options = nullptr); virtual device::Memory* createMemory(amd::Memory& owner) const; - virtual device::Memory* createMemory(size_t size) const; - + virtual device::Memory* createMemory(size_t size, size_t alignment = 0) const; //! Sampler object allocation virtual bool createSampler(const amd::Sampler& owner, //!< abstraction layer sampler object device::Sampler** sampler //!< device sampler object diff --git a/projects/clr/rocclr/platform/memory.cpp b/projects/clr/rocclr/platform/memory.cpp index d801a1eabd..ab4e683ce8 100644 --- a/projects/clr/rocclr/platform/memory.cpp +++ b/projects/clr/rocclr/platform/memory.cpp @@ -77,7 +77,8 @@ void HostMemoryReference::deallocateMemory(const Context& context) { } } -Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmPtr) +Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmPtr, + size_t alignment) : numDevices_(0), deviceMemories_(NULL), destructorCallbacks_(NULL), @@ -96,7 +97,8 @@ Memory::Memory(Context& context, Type type, Flags flags, size_t size, void* svmP svmHostAddress_(svmPtr), resOffset_(0), flagsEx_(0), - lockMemoryOps_(true) /* Memory Ops Lock */ { + lockMemoryOps_(true), /* Memory Ops Lock */ + alignment_(alignment) /* Allocation address alignment */ { svmPtrCommited_ = (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) ? 
true : false; canBeCached_ = true; } diff --git a/projects/clr/rocclr/platform/memory.hpp b/projects/clr/rocclr/platform/memory.hpp index 20b3d4c11a..a19e2660c6 100644 --- a/projects/clr/rocclr/platform/memory.hpp +++ b/projects/clr/rocclr/platform/memory.hpp @@ -216,7 +216,6 @@ class Memory : public amd::RuntimeObject { uint32_t uniqueId_ = 0; //! used to save the user data during memory allocation. UserData userData_; - private: //! Disable default assignment operator Memory& operator=(const Memory&); @@ -227,6 +226,7 @@ class Memory : public amd::RuntimeObject { Monitor lockMemoryOps_; //!< Lock to serialize memory operations std::set subBuffers_; //!< List of all subbuffers for this memory object device::Memory* svmBase_; //!< svmBase allocation for MGPU case + size_t alignment_ = 0; //!< alignment for allocation address protected: //! The constructor creates a memory object but does not allocate either host memory @@ -235,7 +235,8 @@ class Memory : public amd::RuntimeObject { Type type, //!< Memory type Flags flags, //!< Object's flags size_t size, //!< Memory size - void* svmPtr = NULL //!< svm host memory address, NULL if no SVM mem object + void* svmPtr = NULL, //!< svm host memory address, NULL if no SVM mem object + size_t alignment = 0 //!< allocation addr alignment ); Memory(Memory& parent, //!< Context object Flags flags, //!< Object's flags @@ -419,6 +420,9 @@ class Memory : public amd::RuntimeObject { //! Validate memory access for vmm memory bool ValidateMemAccess(const Device& dev, bool read_write); + + //! Get alignment_ + size_t getAlignment() const { return alignment_; } }; //! Buffers are a specialization of memory. 
Just a wrapper, really, @@ -436,8 +440,8 @@ class Buffer : public Memory { : Memory(context, type, flags, size) {} public: - Buffer(Context& context, Flags flags, size_t size, void* svmPtr = NULL) - : Memory(context, CL_MEM_OBJECT_BUFFER, flags, size, svmPtr) {} + Buffer(Context& context, Flags flags, size_t size, void* svmPtr = NULL, size_t alignment = 0) + : Memory(context, CL_MEM_OBJECT_BUFFER, flags, size, svmPtr, alignment) {} Buffer(Memory& parent, Flags flags, size_t origin, size_t size) : Memory(parent, flags, origin, size) {}