From a94fa4eabb8a884dc93a359ee279af2ab58cb408 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Wed, 18 May 2016 18:11:40 -0400
Subject: [PATCH] P4 to Git Change 1270658 by gandryey@gera-w8 on 2016/05/18
 17:53:45

	SWDEV-86035 - Add PAL backend to OpenCL
	- Fix a crash in the pipe test. The device layer can't use the device blit queue directly; it requires a blit manager call, which performs the correct wait-for-idle sequence.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#3 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp#2 edit
---
 rocclr/runtime/device/pal/palblit.cpp    | 14 +++++++++++++-
 rocclr/runtime/device/pal/palblit.hpp    | 11 +++++++++--
 rocclr/runtime/device/pal/paldevice.cpp  |  7 ++++---
 rocclr/runtime/device/pal/palkernel.cpp  |  5 ++---
 rocclr/runtime/device/pal/palmemory.cpp  | 14 ++------------
 rocclr/runtime/device/pal/palprogram.cpp |  2 --
 rocclr/runtime/device/pal/palvirtual.cpp | 24 +++++++++---------------
 7 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/rocclr/runtime/device/pal/palblit.cpp b/rocclr/runtime/device/pal/palblit.cpp
index ab5f713c8b..c9c1ac53d2 100644
--- a/rocclr/runtime/device/pal/palblit.cpp
+++ b/rocclr/runtime/device/pal/palblit.cpp
@@ -23,8 +23,8 @@ inline void
 DmaBlitManager::synchronize() const
 {
     if (syncOperation_) {
-        gpu().waitAllEngines();
         gpu().releaseMemObjects();
+        gpu().waitAllEngines();
     }
 }
 
@@ -2685,6 +2685,18 @@ KernelBlitManager::runScheduler(
     return result;
 }
 
+void
+KernelBlitManager::writeRawData(
+    device::Memory& memory,
+    size_t      size,
+    const void* data
+    ) const
+{
+    static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
+
+    synchronize();
+}
+
 amd::Memory*
 DmaBlitManager::pinHostMemory(
     const void* hostMem,
diff --git a/rocclr/runtime/device/pal/palblit.hpp b/rocclr/runtime/device/pal/palblit.hpp
index 462842dd60..112d1fb5af 100644
--- a/rocclr/runtime/device/pal/palblit.hpp
+++ b/rocclr/runtime/device/pal/palblit.hpp
@@ -371,14 +371,21 @@ public:
         bool        entire = false      //!< Entire buffer will be updated
         ) const;
 
-    //! Fills an image memory with a pattern data
-    virtual bool runScheduler(
+    //! Runs a GPU scheduler for device enqueue
+    bool runScheduler(
         device::Memory& vqueue,         //!< Memory object for virtual queue
         device::Memory& params,         //!< Extra arguments for the scheduler
         uint    paramIdx,               //!< Parameter index
         uint    threads                 //!< Number of scheduling threads
         ) const;
 
+    //! Writes CPU raw data into GPU memory
+    void writeRawData(
+        device::Memory& memory,         //!< Memory object for data update
+        size_t      size,               //!< Size of raw data
+        const void* data                //!< Raw data pointer
+        ) const;
+
 private:
     static const size_t MaxXferBuffers = 2;
 
diff --git a/rocclr/runtime/device/pal/paldevice.cpp b/rocclr/runtime/device/pal/paldevice.cpp
index 8c3e1d3e26..90c81e2a6a 100644
--- a/rocclr/runtime/device/pal/paldevice.cpp
+++ b/rocclr/runtime/device/pal/paldevice.cpp
@@ -987,8 +987,8 @@ Device::init()
     acl_error   error;
     compiler_ = aclCompilerInit(&opts, &error);
     if (error != ACL_SUCCESS) {
-            LogError("Error initializing the compiler");
-            return false;
+        LogError("Error initializing the compiler");
+        return false;
     }
 
     size_t size = Pal::GetPlatformSize();
@@ -1210,7 +1210,8 @@ Device::createBuffer(
                 // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
                 // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
                 size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
-                gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+                static_cast<const KernelBlitManager&>(xferMgr()).writeRawData(
+                    *gpuMemory, sizeof(pipeInit), pipeInit);
             }
             // If memory has direct access from host, then get CPU address
             if (gpuMemory->isHostMemDirectAccess() &&
diff --git a/rocclr/runtime/device/pal/palkernel.cpp b/rocclr/runtime/device/pal/palkernel.cpp
index 7b0e47baad..ea82fd075a 100644
--- a/rocclr/runtime/device/pal/palkernel.cpp
+++ b/rocclr/runtime/device/pal/palkernel.cpp
@@ -677,9 +677,8 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
         return false;
     }
 
-    //! @todo get the right value;
     // Copy wavefront size
-    workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize;
+    workGroupInfo_.wavefrontSize_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize;
     // Find total workgroup size
     if (workGroupInfo_.compileSize_[0] != 0) {
         workGroupInfo_.size_ =
@@ -999,7 +998,7 @@ HSAILKernel::loadArguments(
                     break;
                 }
 
-                //! @todo 64 bit isn't supported with 32 bit binary
+                //! 64 bit isn't supported with 32 bit binary
                 uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
                 WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
 
diff --git a/rocclr/runtime/device/pal/palmemory.cpp b/rocclr/runtime/device/pal/palmemory.cpp
index 01b9124fd6..6f03d78cdb 100644
--- a/rocclr/runtime/device/pal/palmemory.cpp
+++ b/rocclr/runtime/device/pal/palmemory.cpp
@@ -151,14 +151,8 @@ Memory::create(
                 reinterpret_cast<Resource::ViewParams*>(params);
             // Check if parent was allocated in system memory
             if ((view->resource_->memoryType() == Resource::Pinned) ||
-                (((view->resource_->memoryType() == Resource::Remote) ||
-                  (view->resource_->memoryType() == Resource::RemoteUSWC)) &&
-                // @todo Enable unconditional optimization for remote memory
-                // Check for external allocation, to avoid the optimization
-                // for non-VM (double copy) mode
-                 (owner() != nullptr) &&
-                 ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
-                  dev().settings().remoteAlloc_))) {
+                (view->resource_->memoryType() == Resource::Remote) ||
+                (view->resource_->memoryType() == Resource::RemoteUSWC)) {
                 // Marks memory object for direct GPU access to the host memory
                 flags_ |= HostMemoryDirectAccess;
             }
@@ -578,10 +572,6 @@ Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
             }
         }
 
-        //!@todo A wait isn't really necessary. However
-        //! Linux no-VM may have extra random failures.
-        wait(gpu);
-
         // Should never fail
         assert(result && "Memory synchronization failed!");
     }
diff --git a/rocclr/runtime/device/pal/palprogram.cpp b/rocclr/runtime/device/pal/palprogram.cpp
index 383b170552..d677959ff9 100644
--- a/rocclr/runtime/device/pal/palprogram.cpp
+++ b/rocclr/runtime/device/pal/palprogram.cpp
@@ -237,7 +237,6 @@ HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStage
     aclType from = ACL_TYPE_DEFAULT;
     needOptionsCheck = true;
     size_t boolSize = sizeof(bool);
-    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
     // Checking llvmir in .llvmir section
     bool containsSpirv = true;
     errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
@@ -375,7 +374,6 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
           return continueCompileFrom;
       }
       bool recompile = false;
-      //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
       switch (continueCompileFrom) {
       case ACL_TYPE_HSAIL_BINARY:
       case ACL_TYPE_CG:
diff --git a/rocclr/runtime/device/pal/palvirtual.cpp b/rocclr/runtime/device/pal/palvirtual.cpp
index 98e3764cfe..e43b9c1320 100644
--- a/rocclr/runtime/device/pal/palvirtual.cpp
+++ b/rocclr/runtime/device/pal/palvirtual.cpp
@@ -149,8 +149,9 @@ VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem)
 void
 VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
 {
-    memReferences_.erase(iMem);
-    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+    if (0 != memReferences_.erase(iMem)) {
+        iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+    }
 }
 
 uint
@@ -2264,39 +2265,32 @@ VirtualGPU::submitMarker(amd::Marker& vcmd)
 GpuEvent*
 VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem)
 {
-    GpuEvents::iterator it = gpuEvents_.find(iMem);
-    if (it == gpuEvents_.end()) {
-//        queue(MainEngine).addMemRef(iMem);
-//        queue(SdmaEngine).addMemRef(iMem);
-    }
     return &gpuEvents_[iMem];
 }
 
 void 
 VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent)
 { 
-    GpuEvents::iterator it = gpuEvents_.find(iMem);
+    auto it = gpuEvents_.find(iMem);
+
     if (it != gpuEvents_.end()) {
         it->second = gpuEvent;
     }
     else {
-//        queue(gpuEvent.engineId_).addMemRef(iMem);
         gpuEvents_[iMem] = gpuEvent;
     }
-//    queues_[gpuEvent.engineId_]->addCmdMemRef(iMem);
 }
 
 void
 VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait)
 {
+    auto it = gpuEvents_.find(iMem);
     //! @note if there is no wait, then it's a view release
-    if (wait) {
-        waitForEvent(&gpuEvents_[iMem]);
-        //queue(MainEngine).removeMemRef(iMem);
-        //queue(SdmaEngine).removeMemRef(iMem);
+    if (wait &&  (it != gpuEvents_.end())) {
+        waitForEvent(&it->second);
         queues_[MainEngine]->removeCmdMemRef(iMem);
         queues_[SdmaEngine]->removeCmdMemRef(iMem);
-        gpuEvents_.erase(iMem);
+        gpuEvents_.erase(it);
     }
 }