P4 to Git Change 1069294 by bsumner@bsumner-lin-opencl on 2014/08/22 10:56:33

ECR #304775 - Fix bug 10248, where patching of the local memory pointer had not previously been accounted for. Affected files ... ... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_kernel.h#22 edit ... //depot/stg/opencl/drivers/opencl/library/x86/common/src/misc/workitem.cl#23 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.cpp#63 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/cpu/cpucommand.hpp#36 edit
2014-08-22 11:05:20 -04:00
@@ -28,8 +28,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) :
 {
    localDataSize_ = (size_t) device.info().localMemSize_;
    localDataStorage_ = (address) amd::AlignedMemory::allocate(
-        localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16)) +
-	__CPU_SCRATCH_SIZE;
+        localDataSize_ + __CPU_SCRATCH_SIZE, sizeof(cl_long16));

 #if defined(__linux__) && defined(NUMA_SUPPORT)
    const nodemask_t* numaMask = device.getNumaMask();
@@ -42,7 +41,7 @@ WorkerThread::WorkerThread(const cpu::Device& device) :
 WorkerThread::~WorkerThread()
 {
    guarantee(Thread::current() != this && "thread suicide!");
-    amd::AlignedMemory::deallocate(localDataStorage_ - __CPU_SCRATCH_SIZE);
+    amd::AlignedMemory::deallocate(localDataStorage_);
 }

 bool
@@ -392,15 +391,16 @@ NDRangeKernelBatch::execute()
    const size_t numWorkItems = command.sizes().local().product();

    address params = thread.baseWorkItemsStack();
-    address localMemPtr = thread.localDataStorage();
-    if (!patchParameters(kernel,
-        params, localMemPtr, localMemPtr + thread.localDataSize(),
+    address baseLocalMemPtr = thread.localDataStorage();
+    address patchedLocalMemPtr = thread.localDataStorage() + __CPU_SCRATCH_SIZE;
+    if (!patchParameters(kernel, params,
+        patchedLocalMemPtr, patchedLocalMemPtr + thread.localDataSize(),
        kernel.workGroupInfo()->localMemSize_)) {
        return;
    }

    WorkItem* workItem0 = ::new((WorkItem*)params - 1) WorkItem(
-        command.sizes(), localMemPtr);
+        command.sizes(), baseLocalMemPtr, patchedLocalMemPtr);

    WorkGroup wg(command, kernel, thread, params, workItem0, numWorkItems);

@@ -549,7 +549,9 @@ WorkGroup::callKernelRange(kernelentrypoint_t entryPoint,
    }
 }

-WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
+WorkItem::WorkItem(const amd::NDRangeContainer& sizes,
+                   void* scratchMemPtr,
+                   void* localMemPtr)
 {
    const amd::NDRange& local = sizes.local();
    const amd::NDRange& global = sizes.global();
@@ -557,9 +559,11 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
    const size_t dims = sizes.dimensions();

    tib_.builtins = &Builtins::dispatchTable_;
-    tib_.work_dim = (cl_uint) sizes.dimensions();
    tib_.local_mem_base = localMemPtr;
+    tib_.local_scratch = scratchMemPtr;
    tib_.table_base = (const void *)cpuTables;
+    tib_.work_dim = (cl_uint) sizes.dimensions();
+
    for (size_t i = 0; i < dims; ++i) {
        tib_.global_offset[i] = offset[i];
        tib_.global_size[i] = global[i];
@@ -568,6 +572,7 @@ WorkItem::WorkItem(const amd::NDRangeContainer& sizes, void* localMemPtr)
        tib_.local_id[i] = 0;
        tib_.group_id[i] = 0;
    }
+
    // Fill the remaining dimensions.
    for (size_t i = dims; i < sizeof(tib_.global_size)/sizeof(size_t); ++i) {
        tib_.global_offset[i] =  0;
@@ -101,7 +101,7 @@ public:
    };

 protected:
-	amd::Command& command_; 
+    amd::Command& command_; 

 public:
    Operation(amd::Command& command) : command_(command)
@@ -113,7 +113,7 @@ public:

    void cleanup();

-	amd::Command& command() { return command_;}
+    amd::Command& command() { return command_;}

    virtual void execute() = 0;
 };
@@ -136,7 +136,10 @@ private:

 public:
    //! Initialize this workgroup.
-    WorkItem(const amd::NDRangeContainer& size, void* localMemPtr);
+    WorkItem(
+        const amd::NDRangeContainer& size,
+        void* scratchMemPtr,
+        void* localMemPtr);

    //! Return the current WorkItem (based of the current stack pointer).
    static WorkItem* current() {
@@ -303,59 +306,59 @@ public:
 class WorkerThread : public amd::Thread
 {
 private:
-	Fiber mainFiber_; //!< main fiber for this worker thread.
+    Fiber mainFiber_; //!< main fiber for this worker thread.

-	amd::Monitor queueLock_; //!< lock protecting the queue.
+    amd::Monitor queueLock_; //!< lock protecting the queue.
    volatile int waitingOp_;
-	bool terminated_; //!< true if the thread is shutting down.
-	
-	//! Local memory storage
-	address localDataStorage_;
-	//! Size of the local memory.
-	size_t localDataSize_;
+    bool terminated_; //!< true if the thread is shutting down.
+    
+    //! Local memory storage
+    address localDataStorage_;
+    //! Size of the local memory.
+    size_t localDataSize_;

    char operation_[MAX_OPERATION_ALLOC_SIZE];

    address baseWorkItemsStack_;
 private:
-	//! Awaits operations and execute them as they become ready.
-	void loop();
+    //! Awaits operations and execute them as they become ready.
+    void loop();

 public:
-	//! Construct a new WorkerThread.
-	WorkerThread(const cpu::Device& device);
-	//! Destroy the worker thread.
-	virtual ~WorkerThread();
-	//! Cleanup the thread before termination.
-	bool terminate();
+    //! Construct a new WorkerThread.
+    WorkerThread(const cpu::Device& device);
+    //! Destroy the worker thread.
+    virtual ~WorkerThread();
+    //! Cleanup the thread before termination.
+    bool terminate();

-	//! Return the main fiber for this thread.
-	Fiber& mainFiber() { return mainFiber_; }
-	//! Return the LDS for this thread
-	address localDataStorage() const { return localDataStorage_; }
-	//! Return the size of the local memory for this thread.
-	size_t localDataSize() const { return localDataSize_; }
+    //! Return the main fiber for this thread.
+    Fiber& mainFiber() { return mainFiber_; }
+    //! Return the LDS for this thread
+    address localDataStorage() const { return localDataStorage_; }
+    //! Return the size of the local memory for this thread.
+    size_t localDataSize() const { return localDataSize_; }

    address baseWorkItemsStack() { return baseWorkItemsStack_; }

    Operation* operation() { return reinterpret_cast<Operation*>(operation_); }
    bool isOperationValid() { return waitingOp_ > 0; }

-	//! Enqueue a new operation to execute in this thread.
-	void enqueue(Operation& op);
-	//! Signal to start processing the commands in the queue.
-	void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); }
+    //! Enqueue a new operation to execute in this thread.
+    void enqueue(Operation& op);
+    //! Signal to start processing the commands in the queue.
+    void flush() { amd::ScopedLock sl(queueLock_); queueLock_.notify(); }

-	//! This thread's execution engine.
-	void run(void* data) {
-		loop();
-	}
+    //! This thread's execution engine.
+    void run(void* data) {
+        loop();
+    }

-	//! Return the currently executing WorkerThread's instance.
-	static WorkerThread* current()
-	{
-		return static_cast<WorkerThread*>(Thread::current());
-	}
+    //! Return the currently executing WorkerThread's instance.
+    static WorkerThread* current()
+    {
+        return static_cast<WorkerThread*>(Thread::current());
+    }
 };

 /*! @}