From de03c3f2e22779c5aad8600d431107f1f986f56b Mon Sep 17 00:00:00 2001
From: foreman
Date: Fri, 22 Aug 2014 11:05:20 -0400
Subject: [PATCH] P4 to Git Change 1069294 by bsumner@bsumner-lin-opencl on
2014/08/22 10:56:33
ECR #304775 - fix bug 10248, where the patching of the local mem pointer had not previously been accounted for
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_kernel.h#22 edit
... //depot/stg/opencl/drivers/opencl/library/x86/common/src/misc/workitem.cl#23 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.cpp#63 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.hpp#36 edit
[ROCm/clr commit: 85d86251c1c5a632203af19af835db4fdcedaf18]
---
.../rocclr/runtime/device/cpu/cpucommand.cpp | 23 +++---
.../rocclr/runtime/device/cpu/cpucommand.hpp | 79 ++++++++++---------
2 files changed, 55 insertions(+), 47 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp b/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp
index a419f7b904..6511412a7d 100644
--- a/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp
+++ b/projects/clr/rocclr/runtime/device/cpu/cpucommand.cpp
@@ -28,8 +28,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) :
{
localDataSize_ = (size_t) device.info().localMemSize_;
localDataStorage_ = (address) amd::AlignedMemory::allocate(
- localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)) +
- __CPU_SCRATCH_SIZE;
+ localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16));
#if defined(__linux__) && defined(NUMA_SUPPORT)
const nodemask_t* numaMask = device.getNumaMask();
@@ -42,7 +41,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) :
WorkerThread::~WorkerThread()
{
guarantee(Thread::current() != this && "thread suicide!");
- amd::AlignedMemory::deallocate(localDataStorage_ - __CPU_SCRATCH_SIZE);
+ amd::AlignedMemory::deallocate(localDataStorage_);
}
bool
@@ -392,15 +391,16 @@ NDRangeKernelBatch::execute()
const size_t numWorkItems = command.sizes().local().product();
address params = thread.baseWorkItemsStack();
- address localMemPtr = thread.localDataStorage();
- if (!patchParameters(kernel,
- params, localMemPtr, localMemPtr + thread.localDataSize(),
+ address baseLocalMemPtr = thread.localDataStorage();
+ address patchedLocalMemPtr = thread.localDataStorage() + __CPU_SCRATCH_SIZE;
+ if (!patchParameters(kernel, params,
+ patchedLocalMemPtr, patchedLocalMemPtr + thread.localDataSize(),
kernel.workGroupInfo()->localMemSize_)) {
return;
}
WorkItem* workItem0 = ::new((WorkItem*)params - 1) WorkItem(
- command.sizes(), localMemPtr);
+ command.sizes(), baseLocalMemPtr, patchedLocalMemPtr);
WorkGroup wg(command, kernel, thread, params, workItem0, numWorkItems);
@@ -549,7 +549,9 @@ WorkGroup::callKernelRange(kernelentrypoint_t entryPoint,
}
}
-WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
+WorkItem::WorkItem(const amd::NDRangeContainer& sizes,
+ void* scratchMemPtr,
+ void* localMemPtr)
{
const amd::NDRange& local = sizes.local();
const amd::NDRange& global = sizes.global();
@@ -557,9 +559,11 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
const size_t dims = sizes.dimensions();
tib_.builtins = &Builtins::dispatchTable_;
- tib_.work_dim = (cl_uint) sizes.dimensions();
tib_.local_mem_base = localMemPtr;
+ tib_.local_scratch = scratchMemPtr;
tib_.table_base = (const void *)cpuTables;
+ tib_.work_dim = (cl_uint) sizes.dimensions();
+
for (size_t i = 0; i < dims; ++i) {
tib_.global_offset[i] = offset[i];
tib_.global_size[i] = global[i];
@@ -568,6 +572,7 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
tib_.local_id[i] = 0;
tib_.group_id[i] = 0;
}
+
// Fill the remaining dimensions.
for (size_t i = dims; i < sizeof(tib_.global_size)/sizeof(size_t); ++i) {
tib_.global_offset[i] = 0;
diff --git a/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp b/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp
index 30afd56d9f..61c3805bde 100644
--- a/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp
+++ b/projects/clr/rocclr/runtime/device/cpu/cpucommand.hpp
@@ -101,7 +101,7 @@ public:
};
protected:
- amd::Command& command_;
+ amd::Command& command_;
public:
Operation(amd::Command& command) : command_(command)
@@ -113,7 +113,7 @@ public:
void cleanup();
- amd::Command& command() { return command_;}
+ amd::Command& command() { return command_;}
virtual void execute() = 0;
};
@@ -136,7 +136,10 @@ private:
public:
//! Initialize this workgroup.
- WorkItem(const amd::NDRangeContainer& size, void* localMemPtr);
+ WorkItem(
+ const amd::NDRangeContainer& size,
+ void* scratchMemPtr,
+ void* localMemPtr);
//! Return the current WorkItem (based of the current stack pointer).
static WorkItem* current() {
@@ -303,59 +306,59 @@ public:
class WorkerThread : public amd::Thread
{
private:
- Fiber mainFiber_; //!< main fiber for this worker thread.
+ Fiber mainFiber_; //!< main fiber for this worker thread.
- amd::Monitor queueLock_; //!< lock protecting the queue.
+ amd::Monitor queueLock_; //!< lock protecting the queue.
volatile int waitingOp_;
- bool terminated_; //!< true if the thread is shutting down.
-
- //! Local memory storage
- address localDataStorage_;
- //! Size of the local memory.
- size_t localDataSize_;
+ bool terminated_; //!< true if the thread is shutting down.
+
+ //! Local memory storage
+ address localDataStorage_;
+ //! Size of the local memory.
+ size_t localDataSize_;
char operation_[MAX_OPERATION_ALLOC_SIZE];
address baseWorkItemsStack_;
private:
- //! Awaits operations and execute them as they become ready.
- void loop();
+ //! Awaits operations and execute them as they become ready.
+ void loop();
public:
- //! Construct a new WorkerThread.
- WorkerThread(const cpu::Device& device);
- //! Destroy the worker thread.
- virtual ~WorkerThread();
- //! Cleanup the thread before termination.
- bool terminate();
+ //! Construct a new WorkerThread.
+ WorkerThread(const cpu::Device& device);
+ //! Destroy the worker thread.
+ virtual ~WorkerThread();
+ //! Cleanup the thread before termination.
+ bool terminate();
- //! Return the main fiber for this thread.
- Fiber& mainFiber() { return mainFiber_; }
- //! Return the LDS for this thread
- address localDataStorage() const { return localDataStorage_; }
- //! Return the size of the local memory for this thread.
- size_t localDataSize() const { return localDataSize_; }
+ //! Return the main fiber for this thread.
+ Fiber& mainFiber() { return mainFiber_; }
+ //! Return the LDS for this thread
+ address localDataStorage() const { return localDataStorage_; }
+ //! Return the size of the local memory for this thread.
+ size_t localDataSize() const { return localDataSize_; }
address baseWorkItemsStack() { return baseWorkItemsStack_; }
Operation* operation() { return reinterpret_cast<Operation*>(operation_); }
bool isOperationValid() { return waitingOp_ > 0; }
- //! Enqueue a new operation to execute in this thread.
- void enqueue(Operation& op);
- //! Signal to start processing the commands in the queue.
- void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); }
+ //! Enqueue a new operation to execute in this thread.
+ void enqueue(Operation& op);
+ //! Signal to start processing the commands in the queue.
+ void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); }
- //! This thread's execution engine.
- void run(void* data) {
- loop();
- }
+ //! This thread's execution engine.
+ void run(void* data) {
+ loop();
+ }
- //! Return the currently executing WorkerThread's instance.
- static WorkerThread* current()
- {
- return static_cast<WorkerThread*>(Thread::current());
- }
+ //! Return the currently executing WorkerThread's instance.
+ static WorkerThread* current()
+ {
+ return static_cast<WorkerThread*>(Thread::current());
+ }
};
/*! @}