P4 to Git Change 1329979 by gandryey@gera-w8 on 2016/10/21 13:26:53

SWDEV-86035 - Add PAL backend to OpenCL - Allow device memory usage for blit kernels Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#27 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#11 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#9 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#15 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#32 edit [ROCm/clr commit: 43f8188d59]
2016-10-21 13:31:50 -04:00
@@ -801,6 +801,12 @@ KernelBlitManager::create(amd::Device& device)
 bool
 KernelBlitManager::createProgram(Device& device)
 {
+    if (device.blitProgram() == nullptr) {
+        if (!device.createBlitProgram()) {
+            return false;
+        }
+    }
+
    std::vector<amd::Device*> devices;
    devices.push_back(&device);

@@ -478,7 +478,7 @@ void NullDevice::fillDeviceInfo(
        info_.numAsyncQueues_       = numComputeRings;
        info_.numRTQueues_          =
            palProp.engineProperties[Pal::EngineTypeExclusiveCompute].engineCount  - 1;
-        info_.numRTCUs_             = 0x8;
+        info_.numRTCUs_             = 8;
            //palProp.engineProperties[Pal::EngineTypeExclusiveCompute].maxNumDedicatedCu;
        info_.threadTraceEnable_    = settings().threadTraceEnable_;
    }
@@ -718,6 +718,8 @@ Device::create(Pal::IDevice* device)
    palSettings->forceHighClocks = appProfile_.enableHighPerformanceState();
    palSettings->longRunningSubmissions = true;
    palSettings->cmdBufBatchedSubmitChainLimit = 0;
+    //palSettings->disableResourceProcessingManager = true;
+    //palSettings->disableScManager = true;

    // Commit the new settings for the device
    result = iDev()->CommitSettingsAndInit();
@@ -885,28 +887,6 @@ Device::initializeHeapResources()
            }
        }

-        // Delay compilation due to brig_loader memory allocation
-        const char* scheduler = nullptr;
-        const char* ocl20 = nullptr;
-#if !defined(WITH_LIGHTNING_COMPILER)
-        std::string sch = SchedulerSourceCode;
-        if (settings().oclVersion_ == OpenCL20) {
-            size_t loc = sch.find("%s");
-            sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
-            scheduler = sch.c_str();
-            ocl20 = "-cl-std=CL2.0";
-        }
-#endif // !defined(WITH_LIGHTNING_COMPILER)
-        blitProgram_ = new BlitProgram(context_);
-        // Create blit programs
-        if (blitProgram_ == nullptr ||
-            !blitProgram_->create(this, scheduler, ocl20)) {
-            delete blitProgram_;
-            blitProgram_ = nullptr;
-            LogError("Couldn't create blit kernels!");
-            return false;
-        }
-
        // Create a synchronized transfer queue
        xferQueue_ = new VirtualGPU(*this);
        if (!(xferQueue_ && xferQueue_->create(
@@ -2176,6 +2156,35 @@ Device::updateFreeMemory(Pal::GpuHeap heap, Pal::gpusize size, bool free)
    }
 }

+//! Creates the internal blit program and stores it in blitProgram_.
+//! Returns TRUE on success; on failure logs an error, leaves blitProgram_
+//! as nullptr (when the program object was allocated) and returns FALSE.
+bool
+Device::createBlitProgram()
+{
+    // Delayed compilation due to brig_loader memory allocation
+    const char* scheduler = nullptr;
+    const char* ocl20 = nullptr;
+#if !defined(WITH_LIGHTNING_COMPILER)
+    // Must outlive the create() call below: scheduler points into this string
+    std::string sch = SchedulerSourceCode;
+    if (settings().oclVersion_ == OpenCL20) {
+        const size_t loc = sch.find("%s");
+        if (loc == std::string::npos) {
+            // std::string::replace() throws std::out_of_range for npos,
+            // so fail gracefully if the placeholder is missing
+            LogError("Couldn't find the dispatch kernel placeholder in the scheduler source!");
+            return false;
+        }
+        // Substitute the "%s" placeholder with the dispatch kernel source
+        sch.replace(loc, 2, iDev()->GetDispatchKernelSource());
+        scheduler = sch.c_str();
+        ocl20 = "-cl-std=CL2.0";
+    }
+#endif // !defined(WITH_LIGHTNING_COMPILER)
+    blitProgram_ = new BlitProgram(context_);
+    // Create blit programs
+    if (blitProgram_ == nullptr ||
+        !blitProgram_->create(this, scheduler, ocl20)) {
+        delete blitProgram_;
+        blitProgram_ = nullptr;
+        LogError("Couldn't create blit kernels!");
+        return false;
+    }
+    return true;
+}
+
 void
 Device::SrdManager::fillResourceList(std::vector<const Memory*>& memList)
 {
@@ -482,6 +482,9 @@ public:
        bool free               //!< TRUE if runtime frees memory
        );

+    //! Create internal blit program. Returns TRUE on success, FALSE if the creation failed
+    bool createBlitProgram();
+
    //! Interop for GL device
    bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
    bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
@@ -394,22 +394,12 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
    if (!prog().isNull()) {
        code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
        Resource::MemoryType    type = Resource::Local;
-        if (flags_.internalKernel_) {
-            type = Resource::RemoteUSWC;
-        }

        // Initialize kernel ISA code
        if (code_ && code_->create(type)) {
-            if (flags_.internalKernel_) {
-                address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
-                // Copy only amd_kernel_code_t
-                memcpy(cpuCodePtr, reinterpret_cast<address>(akc), codeSize_);
-                code_->unmap(nullptr);
-            }
-            else {
-                static_cast<const KernelBlitManager&>(dev().xferMgr()).writeRawData(
-                    *code_, codeSize_, reinterpret_cast<void*>(akc));
-            }
+            constexpr bool WaitForUpload = true;
+            code_->writeRawData(*code_->dev().xferQueue(), 0, codeSize_,
+                reinterpret_cast<void*>(akc), WaitForUpload);
        }
        else {
            LogError("Failed to allocate ISA code!");
@@ -469,8 +469,6 @@ HSAILProgram::linkImpl(amd::option::Options* options)
    aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
    bool finalize = true;
    bool hsaLoad = true;
-    internal_ = (compileOptions_.find("-cl-internal-kernel") !=
-        std::string::npos) ? true : false;

    // If !binaryElf_ then program must have been created using clCreateProgramWithBinary
    if (!binaryElf_) {
@@ -936,7 +934,7 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
    assert(size);
    assert(align);
    assert(sizeof(void*) == 8 || sizeof(void*) == 4);
-    if (program_->isNull() || program_->isInternal()) {
+    if (program_->isNull()) {
        return new char[size];
    }

@@ -962,19 +960,20 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
    if (0 == size) {
        return true;
    }
-    if (program_->isNull() || program_->isInternal()) {
+    if (program_->isNull()) {
        memcpy(reinterpret_cast<address>(dst) + offset, src, size);
        return true;
    }
    assert(program_->dev().xferQueue());
    pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
-    return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
+    constexpr bool WaitForCopy = true;
+    mem->writeRawData(*mem->dev().xferQueue(), offset, size, src, WaitForCopy);
    return true;
 }

 void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)
 {
-    if (program_->isNull() || program_->isInternal()) {
+    if (program_->isNull()) {
        delete[] reinterpret_cast<char*>(ptr);
    }
    else {
@@ -1020,9 +1019,6 @@ LightningProgram::linkImpl(amd::option::Options *options)
 {
    using namespace amd::opencl_driver;

-    internal_ = (compileOptions_.find("-cl-internal-kernel") !=
-        std::string::npos) ? true : false;
-
    aclType continueCompileFrom = llvmBinary_.empty()
        ? getNextCompilationStageFromBinary(options)
        : ACL_TYPE_LLVMIR_BINARY;
@@ -182,9 +182,6 @@ public:
    //! Returns TRUE if the program just compiled
    bool isNull() const { return isNull_; }

-    //! Returns TRUE if the program used internally by runtime
-    bool isInternal() const { return internal_; }
-
    //! Returns TRUE if the program contains static samplers
    bool isStaticSampler() const { return (staticSamplers_.size() != 0); }

@@ -278,7 +275,6 @@ protected:
    union {
        struct {
            uint32_t    isNull_     : 1;    //!< Null program no memory allocations
-            uint32_t    internal_   : 1;    //!< Internal blit program
        };
        uint32_t    flags_;  //!< Program flags
    };
@@ -1118,14 +1118,18 @@ Resource::writeRawData(
    gpu.iCmd()->CmdUpdateMemory(*iMem(), offset, size, reinterpret_cast<const uint32_t*>(data));
    gpu.eventEnd(MainEngine, event);

-    setBusy(gpu, event);
-    // Update the global GPU event
-    gpu.setGpuEvent(event, false);
-
    if (waitForEvent) {
+        //! @note: We don't really have to mark the allocations as busy
+        //! if we are waiting for a transfer
+
        // Wait for event to complete
        gpu.waitForEvent(&event);
    }
+    else {
+        setBusy(gpu, event);
+        // Update the global GPU event
+        gpu.setGpuEvent(event, false);
+    }
 }
 static const Pal::ChNumFormat ChannelFmt(uint bytesPerElement)
 {
@@ -63,7 +63,7 @@ VirtualGPU::Queue::Create(
        qCreateInfo.numReservedCu = rtCU;
        qCreateInfo.engineIndex = 0x0;
        cmdCreateInfo.engineType = qCreateInfo.engineType = Pal::EngineTypeExclusiveCompute;
-        cmdCreateInfo.flags.rtCu = true;
+        cmdCreateInfo.flags.realtimeComputeUnits = true;
    }
 */
    // Find queue object size
@@ -801,28 +801,6 @@ VirtualGPU::create(bool profiling, uint  deviceQueueSize, uint rtCUs,
        return false;
    }

-    // Choose the appropriate class for blit engine
-    switch (dev().settings().blitEngine_) {
-        default:
-            // Fall through ...
-        case Settings::BlitEngineHost:
-            blitSetup.disableAll();
-            // Fall through ...
-        case Settings::BlitEngineCAL:
-        case Settings::BlitEngineKernel:
-            // use host blit for HW debug
-            if (dev().settings().enableHwDebug_) {
-                blitSetup.disableCopyImageToBuffer_   = true;
-                blitSetup.disableCopyBufferToImage_   = true;
-            }
-            blitMgr_ = new KernelBlitManager(*this, blitSetup);
-            break;
-    }
-    if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
-        LogError("Could not create BlitManager!");
-        return false;
-    }
-
    tsCache_ = new TimeStampCache(*this);
    if (nullptr == tsCache_) {
        LogError("Could not create TimeStamp cache!");
@@ -846,6 +824,28 @@ VirtualGPU::create(bool profiling, uint  deviceQueueSize, uint rtCUs,
        return false;
    }

+    // Choose the appropriate class for blit engine
+    switch (dev().settings().blitEngine_) {
+    default:
+        // Fall through ...
+    case Settings::BlitEngineHost:
+        blitSetup.disableAll();
+        // Fall through ...
+    case Settings::BlitEngineCAL:
+    case Settings::BlitEngineKernel:
+        // use host blit for HW debug
+        if (dev().settings().enableHwDebug_) {
+            blitSetup.disableCopyImageToBuffer_ = true;
+            blitSetup.disableCopyBufferToImage_ = true;
+        }
+        blitMgr_ = new KernelBlitManager(*this, blitSetup);
+        break;
+    }
+    if ((nullptr == blitMgr_) || !blitMgr_->create(gpuDevice_)) {
+        LogError("Could not create BlitManager!");
+        return false;
+    }
+
    return true;
 }