From de03c3f2e22779c5aad8600d431107f1f986f56b Mon Sep 17 00:00:00 2001 From: foreman Date: Fri, 22 Aug 2014 11:05:20 -0400 Subject: [PATCH] P4 to Git Change 1069294 by bsumner@bsumner-lin-opencl on 2014/08/22 10:56:33 ECR #304775 - fix bug 10248 where patching the local mem pointer hadn't been previously accounted for Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_kernel.h#22 edit ... //depot/stg/opencl/drivers/opencl/library/x86/common/src/misc/workitem.cl#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.hpp#36 edit [ROCm/clr commit: 85d86251c1c5a632203af19af835db4fdcedaf18] --- .../rocclr/runtime/device/cpu/cpucommand.cpp | 23 +++--- .../rocclr/runtime/device/cpu/cpucommand.hpp | 79 ++++++++++--------- 2 files changed, 55 insertions(+), 47 deletions(-) diff --git a/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp b/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp index a419f7b904..6511412a7d 100644 --- a/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp +++ b/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp @@ -28,8 +28,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) : { localDataSize_ = (size_t) device.info().localMemSize_; localDataStorage_ = (address) amd::AlignedMemory::allocate( - localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)) + - __CPU_SCRATCH_SIZE; + localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)); #if defined(__linux__) && defined(NUMA_SUPPORT) const nodemask_t* numaMask = device.getNumaMask(); @@ -42,7 +41,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) : WorkerThread::~WorkerThread() { guarantee(Thread::current() != this && "thread suicide!"); - amd::AlignedMemory::deallocate(localDataStorage_ - __CPU_SCRATCH_SIZE); + amd::AlignedMemory::deallocate(localDataStorage_); } bool @@ -392,15 +391,16 @@ NDRangeKernelBatch::execute() const size_t numWorkItems = command.sizes().local().product(); address params = thread.baseWorkItemsStack(); - address localMemPtr = thread.localDataStorage(); - if (!patchParameters(kernel, - params, localMemPtr, localMemPtr + thread.localDataSize(), + address baseLocalMemPtr = thread.localDataStorage(); + address patchedLocalMemPtr = thread.localDataStorage() + __CPU_SCRATCH_SIZE; + if (!patchParameters(kernel, params, + patchedLocalMemPtr, patchedLocalMemPtr + thread.localDataSize(), kernel.workGroupInfo()->localMemSize_)) { return; } WorkItem* workItem0 = ::new((WorkItem*)params - 1) WorkItem( - command.sizes(), localMemPtr); + command.sizes(), baseLocalMemPtr, patchedLocalMemPtr); WorkGroup wg(command, kernel, thread, params, workItem0, numWorkItems); @@ -549,7 +549,9 @@ WorkGroup::callKernelRange(kernelentrypoint_t entryPoint, } } -WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr) +WorkItem::WorkItem(const amd::NDRangeContainer& sizes, + void* scratchMemPtr, + void* localMemPtr) { const amd::NDRange& local = sizes.local(); const amd::NDRange& global = sizes.global(); @@ -557,9 +559,11 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr) const size_t dims = sizes.dimensions(); tib_.builtins = &Builtins::dispatchTable_; - tib_.work_dim = (cl_uint) sizes.dimensions(); tib_.local_mem_base = localMemPtr; + tib_.local_scratch = scratchMemPtr; tib_.table_base = (const void *)cpuTables; + tib_.work_dim = (cl_uint) sizes.dimensions(); + for (size_t i = 0; i < dims; ++i) { tib_.global_offset[i] = offset[i]; tib_.global_size[i] = global[i]; @@ -568,6 +572,7 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr) tib_.local_id[i] = 0; tib_.group_id[i] = 0; } + // Fill the remaining dimensions. for (size_t i = dims; i < sizeof(tib_.global_size)/sizeof(size_t); ++i) { tib_.global_offset[i] = 0; diff --git a/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp b/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp index 30afd56d9f..61c3805bde 100644 --- a/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp +++ b/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp @@ -101,7 +101,7 @@ public: }; protected: - amd::Command& command_; + amd::Command& command_; public: Operation(amd::Command& command) : command_(command) @@ -113,7 +113,7 @@ public: void cleanup(); - amd::Command& command() { return command_;} + amd::Command& command() { return command_;} virtual void execute() = 0; }; @@ -136,7 +136,10 @@ private: public: //! Initialize this workgroup. - WorkItem(const amd::NDRangeContainer& size, void* localMemPtr); + WorkItem( + const amd::NDRangeContainer& size, + void* scratchMemPtr, + void* localMemPtr); //! Return the current WorkItem (based of the current stack pointer). static WorkItem* current() { @@ -303,59 +306,59 @@ public: class WorkerThread : public amd::Thread { private: - Fiber mainFiber_; //!< main fiber for this worker thread. + Fiber mainFiber_; //!< main fiber for this worker thread. - amd::Monitor queueLock_; //!< lock protecting the queue. + amd::Monitor queueLock_; //!< lock protecting the queue. volatile int waitingOp_; - bool terminated_; //!< true if the thread is shutting down. - - //! Local memory storage - address localDataStorage_; - //! Size of the local memory. - size_t localDataSize_; + bool terminated_; //!< true if the thread is shutting down. + + //! Local memory storage + address localDataStorage_; + //! Size of the local memory. + size_t localDataSize_; char operation_[MAX_OPERATION_ALLOC_SIZE]; address baseWorkItemsStack_; private: - //! Awaits operations and execute them as they become ready. - void loop(); + //! Awaits operations and execute them as they become ready. + void loop(); public: - //! Construct a new WorkerThread. - WorkerThread(const cpu::Device& device); - //! Destroy the worker thread. - virtual ~WorkerThread(); - //! Cleanup the thread before termination. - bool terminate(); + //! Construct a new WorkerThread. + WorkerThread(const cpu::Device& device); + //! Destroy the worker thread. + virtual ~WorkerThread(); + //! Cleanup the thread before termination. + bool terminate(); - //! Return the main fiber for this thread. - Fiber& mainFiber() { return mainFiber_; } - //! Return the LDS for this thread - address localDataStorage() const { return localDataStorage_; } - //! Return the size of the local memory for this thread. - size_t localDataSize() const { return localDataSize_; } + //! Return the main fiber for this thread. + Fiber& mainFiber() { return mainFiber_; } + //! Return the LDS for this thread + address localDataStorage() const { return localDataStorage_; } + //! Return the size of the local memory for this thread. + size_t localDataSize() const { return localDataSize_; } address baseWorkItemsStack() { return baseWorkItemsStack_; } Operation* operation() { return reinterpret_cast(operation_); } bool isOperationValid() { return waitingOp_ > 0; } - //! Enqueue a new operation to execute in this thread. - void enqueue(Operation& op); - //! Signal to start processing the commands in the queue. - void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); } + //! Enqueue a new operation to execute in this thread. + void enqueue(Operation& op); + //! Signal to start processing the commands in the queue. + void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); } - //! This thread's execution engine. - void run(void* data) { - loop(); - } + //! This thread's execution engine. + void run(void* data) { + loop(); + } - //! Return the currently executing WorkerThread's instance. - static WorkerThread* current() - { - return static_cast(Thread::current()); - } + //! Return the currently executing WorkerThread's instance. + static WorkerThread* current() + { + return static_cast(Thread::current()); + } }; /*! @}