P4 to Git Change 1270658 by gandryey@gera-w8 on 2016/05/18 17:53:45

SWDEV-86035 - Add PAL backend to OpenCL - Fix a crash in the pipe test. The device layer can't use the device blit queue directly; it must go through a blit manager call, which performs the correct wait-for-idle sequence. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#6 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#3 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLRTQueue.cpp#2 edit
2016-05-18 18:11:40 -04:00
@@ -23,8 +23,8 @@ inline void
 DmaBlitManager::synchronize() const
 {
    if (syncOperation_) {
-        gpu().waitAllEngines();
        gpu().releaseMemObjects();
+        gpu().waitAllEngines();
    }
 }

@@ -2685,6 +2685,18 @@ KernelBlitManager::runScheduler(
    return result;
 }

+void
+KernelBlitManager::writeRawData(
+    device::Memory& memory,
+    size_t      size,
+    const void* data
+    ) const
+{
+    static_cast<pal::Memory&>(memory).writeRawData(gpu(), size, data, false);
+
+    synchronize();
+}
+
 amd::Memory*
 DmaBlitManager::pinHostMemory(
    const void* hostMem,
@@ -371,14 +371,21 @@ public:
        bool        entire = false      //!< Entire buffer will be updated
        ) const;

-    //! Fills an image memory with a pattern data
-    virtual bool runScheduler(
+    //! Runs a GPU scheduler for device enqueue
+    bool runScheduler(
        device::Memory& vqueue,         //!< Memory object for virtual queue
        device::Memory& params,         //!< Extra arguments for the scheduler
        uint    paramIdx,               //!< Parameter index
        uint    threads                 //!< Number of scheduling threads
        ) const;

+    //! Writes CPU raw data into GPU memory
+    void writeRawData(
+        device::Memory& memory,         //!< Memory object for data update
+        size_t      size,               //!< Size of raw data
+        const void* data                //!< Raw data pointer
+        ) const;
+
 private:
    static const size_t MaxXferBuffers = 2;

@@ -987,8 +987,8 @@ Device::init()
    acl_error   error;
    compiler_ = aclCompilerInit(&opts, &error);
    if (error != ACL_SUCCESS) {
-            LogError("Error initializing the compiler");
-            return false;
+        LogError("Error initializing the compiler");
+        return false;
    }

    size_t size = Pal::GetPlatformSize();
@@ -1210,7 +1210,8 @@ Device::createBuffer(
                // Pipe initialize in order read_idx, write_idx, end_idx. Refer clk_pipe_t structure.
                // Init with 3 DWORDS for 32bit addressing and 6 DWORDS for 64bit
                size_t pipeInit[3] = {0 , 0, owner.asPipe()->getMaxNumPackets()};
-                gpuMemory->writeRawData(*xferQueue_, sizeof(pipeInit), pipeInit, true);
+                static_cast<const KernelBlitManager&>(xferMgr()).writeRawData(
+                    *gpuMemory, sizeof(pipeInit), pipeInit);
            }
            // If memory has direct access from host, then get CPU address
            if (gpuMemory->isHostMemDirectAccess() &&
@@ -677,9 +677,8 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
        return false;
    }

-    //! @todo get the right value;
    // Copy wavefront size
-    workGroupInfo_.wavefrontSize_ = 64;//dev().getAttribs().wavefrontSize;
+    workGroupInfo_.wavefrontSize_ = dev().properties().gfxipProperties.shaderCore.wavefrontSize;
    // Find total workgroup size
    if (workGroupInfo_.compileSize_[0] != 0) {
        workGroupInfo_.size_ =
@@ -999,7 +998,7 @@ HSAILKernel::loadArguments(
                    break;
                }

-                //! @todo 64 bit isn't supported with 32 bit binary
+                //! 64 bit isn't supported with 32 bit binary
                uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
                WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));

@@ -151,14 +151,8 @@ Memory::create(
                reinterpret_cast<Resource::ViewParams*>(params);
            // Check if parent was allocated in system memory
            if ((view->resource_->memoryType() == Resource::Pinned) ||
-                (((view->resource_->memoryType() == Resource::Remote) ||
-                  (view->resource_->memoryType() == Resource::RemoteUSWC)) &&
-                // @todo Enable unconditional optimization for remote memory
-                // Check for external allocation, to avoid the optimization
-                // for non-VM (double copy) mode
-                 (owner() != nullptr) &&
-                 ((owner()->getMemFlags() & CL_MEM_ALLOC_HOST_PTR) ||
-                  dev().settings().remoteAlloc_))) {
+                (view->resource_->memoryType() == Resource::Remote) ||
+                (view->resource_->memoryType() == Resource::RemoteUSWC)) {
                // Marks memory object for direct GPU access to the host memory
                flags_ |= HostMemoryDirectAccess;
            }
@@ -578,10 +572,6 @@ Memory::syncCacheFromHost(VirtualGPU& gpu, device::Memory::SyncFlags syncFlags)
            }
        }

-        //!@todo A wait isn't really necessary. However
-        //! Linux no-VM may have extra random failures.
-        wait(gpu);
-
        // Should never fail
        assert(result && "Memory synchronization failed!");
    }
@@ -237,7 +237,6 @@ HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStage
    aclType from = ACL_TYPE_DEFAULT;
    needOptionsCheck = true;
    size_t boolSize = sizeof(bool);
-    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
    // Checking llvmir in .llvmir section
    bool containsSpirv = true;
    errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
@@ -375,7 +374,6 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
          return continueCompileFrom;
      }
      bool recompile = false;
-      //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
      switch (continueCompileFrom) {
      case ACL_TYPE_HSAIL_BINARY:
      case ACL_TYPE_CG:
@@ -149,8 +149,9 @@ VirtualGPU::Queue::addCmdMemRef(Pal::IGpuMemory* iMem)
 void
 VirtualGPU::Queue::removeCmdMemRef(Pal::IGpuMemory* iMem)
 {
-    memReferences_.erase(iMem);
-    iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+    if (0 != memReferences_.erase(iMem)) {
+        iDev_->RemoveGpuMemoryReferences(1, &iMem, iQueue_);
+    }
 }

 uint
@@ -2264,39 +2265,32 @@ VirtualGPU::submitMarker(amd::Marker& vcmd)
 GpuEvent*
 VirtualGPU::getGpuEvent(Pal::IGpuMemory* iMem)
 {
-    GpuEvents::iterator it = gpuEvents_.find(iMem);
-    if (it == gpuEvents_.end()) {
-//        queue(MainEngine).addMemRef(iMem);
-//        queue(SdmaEngine).addMemRef(iMem);
-    }
    return &gpuEvents_[iMem];
 }

 void 
 VirtualGPU::assignGpuEvent(Pal::IGpuMemory* iMem, GpuEvent gpuEvent)
 { 
-    GpuEvents::iterator it = gpuEvents_.find(iMem);
+    auto it = gpuEvents_.find(iMem);
+
    if (it != gpuEvents_.end()) {
        it->second = gpuEvent;
    }
    else {
-//        queue(gpuEvent.engineId_).addMemRef(iMem);
        gpuEvents_[iMem] = gpuEvent;
    }
-//    queues_[gpuEvent.engineId_]->addCmdMemRef(iMem);
 }

 void
 VirtualGPU::releaseMemory(Pal::IGpuMemory* iMem, bool wait)
 {
+    auto it = gpuEvents_.find(iMem);
    //! @note if there is no wait, then it's a view release
-    if (wait) {
-        waitForEvent(&gpuEvents_[iMem]);
-        //queue(MainEngine).removeMemRef(iMem);
-        //queue(SdmaEngine).removeMemRef(iMem);
+    if (wait &&  (it != gpuEvents_.end())) {
+        waitForEvent(&it->second);
        queues_[MainEngine]->removeCmdMemRef(iMem);
        queues_[SdmaEngine]->removeCmdMemRef(iMem);
-        gpuEvents_.erase(iMem);
+        gpuEvents_.erase(it);
    }
 }