P4 to Git Change 1250684 by gandryey@gera-w8 on 2016/03/23 17:59:05

SWDEV-86035 - Add PAL backend to OpenCL - Update PAL backend to match the latest PAL interfaces Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbe/build/Makefile.palbe#1 add ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#2 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#2 edit
2016-03-23 18:09:21 -04:00
parent 68da626227
commit a340e36035
@@ -937,6 +937,8 @@ KernelBlitManager::copyBufferToImage(
    static const bool CopyRect = false;
    // Flush DMA for ASYNC copy
    static const bool FlushDMA = true;
+    size_t imgRowPitch = size[0] * gpuMem(dstMemory).elementSize();
+    size_t imgSlicePitch = imgRowPitch * size[1];

    if (setup_.disableCopyBufferToImage_) {
        result = DmaBlitManager::copyBufferToImage(
@@ -947,7 +949,9 @@ KernelBlitManager::copyBufferToImage(
    }
    // Check if buffer is in system memory with direct access
    else if (gpuMem(srcMemory).isHostMemDirectAccess() &&
-             (rowPitch == 0) && (slicePitch == 0)) {
+             (((rowPitch == 0) && (slicePitch == 0)) ||
+              ((rowPitch == imgRowPitch) &&
+               ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
        // First attempt to do this all with DMA,
        // but there are restriciton with older hardware
        if (dev().settings().imageDMA_) {
@@ -1327,6 +1331,8 @@ KernelBlitManager::copyImageToBuffer(
    static const bool CopyRect = false;
    // Flush DMA for ASYNC copy
    static const bool FlushDMA = true;
+    size_t imgRowPitch = size[0] * gpuMem(srcMemory).elementSize();
+    size_t imgSlicePitch = imgRowPitch * size[1];

    if (setup_.disableCopyImageToBuffer_) {
        result = HostBlitManager::copyImageToBuffer(
@@ -1337,7 +1343,9 @@ KernelBlitManager::copyImageToBuffer(
    }
    // Check if buffer is in system memory with direct access
    else if (gpuMem(dstMemory).isHostMemDirectAccess() &&
-             (rowPitch == 0) && (slicePitch == 0)) {
+             (((rowPitch == 0) && (slicePitch == 0)) ||
+              ((rowPitch == imgRowPitch) &&
+                ((slicePitch == 0) || (slicePitch == imgSlicePitch))))) {
        // First attempt to do this all with DMA,
        // but there are restriciton with older hardware
        if (dev().settings().imageDMA_) {
@@ -175,10 +175,10 @@ void NullDevice::fillDeviceInfo(

    info_.maxWorkItemDimensions_    = 3;
    info_.maxComputeUnits_          =
-        palProp.gfxipProperties.engineCore.numOfShaderEngines *
-        palProp.gfxipProperties.engineCore.numOfShaderArrays *
-        palProp.gfxipProperties.engineCore.numOfCUsPerShaderArray;
-    info_.numberOfShaderEngines     = palProp.gfxipProperties.engineCore.numOfShaderEngines;
+        palProp.gfxipProperties.shaderCore.numShaderEngines *
+        palProp.gfxipProperties.shaderCore.numShaderArrays *
+        palProp.gfxipProperties.shaderCore.numCusPerShaderArray;
+    info_.numberOfShaderEngines     = palProp.gfxipProperties.shaderCore.numShaderEngines;

    // SI parts are scalar.  Also, reads don't need to be 128-bits to get peak rates.
    // For example, float4 is not faster than float as long as all threads fetch the same
@@ -417,7 +417,7 @@ void NullDevice::fillDeviceInfo(
        info_.simdPerCU_            = hwInfo()->simdPerCU_;
        info_.simdWidth_            = hwInfo()->simdWidth_;
        info_.simdInstructionWidth_ = hwInfo()->simdInstructionWidth_;
-        info_.wavefrontWidth_       = palProp.gfxipProperties.engineCore.wavefrontSize;
+        info_.wavefrontWidth_       = palProp.gfxipProperties.shaderCore.wavefrontSize;
        //info_.globalMemChannels_    = calAttr.memBusWidth / 32;
        //info_.globalMemChannelBanks_    = calAttr.numMemBanks;
        info_.globalMemChannelBankWidth_ = hwInfo()->memChannelBankWidth_;
@@ -1541,35 +1541,34 @@ Device::createView(amd::Memory& owner, const device::Memory& parent) const

 //! Attempt to bind with external graphics API's device/context
 bool
-Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+Device::bindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
 {
    assert(pDevice);

-    switch (type) {
 #ifdef _WIN32
-    case CL_CONTEXT_D3D10_DEVICE_KHR:
+    if (flags & amd::Context::Flags::D3D10DeviceKhr) {
        if (!associateD3D10Device(pDevice)) {
            LogError("Failed gslD3D10Associate()");
            return false;
        }
-        break;
-    case CL_CONTEXT_D3D11_DEVICE_KHR:
+    }
+    else if (flags & amd::Context::Flags::D3D11DeviceKhr) {
        if (!associateD3D11Device(pDevice)) {
            LogError("Failed gslD3D11Associate()");
            return false;
        }
-        break;
-    case CL_CONTEXT_ADAPTER_D3D9_KHR:
-    case CL_CONTEXT_ADAPTER_D3D9EX_KHR:
+    }
+    else if (flags & (amd::Context::Flags::D3D9DeviceKhr |
+                      amd::Context::Flags::D3D9DeviceEXKhr)) {
        if (!associateD3D9Device(pDevice)) {
            LogWarning("D3D9<->OpenCL adapter mismatch or D3D9Associate() failure");
            return false;
        }
-        break;
-    case CL_CONTEXT_ADAPTER_DXVA_KHR:
-        break;
+    }
+    else if (flags & amd::Context::Flags::D3D9DeviceVAKhr) {
+    }
 #endif //_WIN32
-    case CL_GL_CONTEXT_KHR:
+    if (flags & amd::Context::Flags::GLDeviceKhr) {
        // Attempt to associate GSL-OGL
        if (!glAssociate(pContext, pDevice)) {
            if (!validateOnly) {
@@ -1577,20 +1576,15 @@ Device::bindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool va
            }
            return false;
        }
-        break;
-    default:
-        LogError("Unknown external device!");
-        return false;
-        break;
    }

    return true;
 }

 bool
-Device::unbindExternalDevice(intptr_t type, void* pDevice, void* pContext, bool validateOnly)
+Device::unbindExternalDevice(uint flags, void* pDevice, void* pContext, bool validateOnly)
 {
-    if (type != CL_GL_CONTEXT_KHR) {
+    if ((flags & amd::Context::Flags::GLDeviceKhr) == 0) {
        return true;
    }

@@ -1820,8 +1814,8 @@ Device::allocScratch(uint regNum, const VirtualGPU* vgpu)
                    // Calculate the size of the scratch buffer for a queue
                    uint32_t numTotalCUs = info().maxComputeUnits_;
                    uint32_t numMaxWaves =
-                        properties().gfxipProperties.engineCore.maxScratchWavesPerCU * numTotalCUs;
-                    scratchBuf->size_ = properties().gfxipProperties.engineCore.wavefrontSize *
+                        properties().gfxipProperties.shaderCore.maxScratchWavesPerCu * numTotalCUs;
+                    scratchBuf->size_ = properties().gfxipProperties.shaderCore.wavefrontSize *
                        scratchBuf->regNum_ * numMaxWaves * sizeof(uint32_t);
                    scratchBuf->size_ = amd::alignUp(scratchBuf->size_, 0xFFFF);
                    scratchBuf->offset_ = offset;
@@ -1920,8 +1914,7 @@ Device::fillHwSampler(

    samplerInfo.borderColorType = Pal::BorderColorType::TransparentBlack;

-    // Assign defaults
-    samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipBase;
+    samplerInfo.filter.zFilter = Pal::XyFilterPoint;

    samplerInfo.flags.unnormalizedCoords = !(state & amd::Sampler::StateNormalizedCoordsMask);

@@ -1956,24 +1949,16 @@ Device::fillHwSampler(

    // Program texture filter mode
    if (state == amd::Sampler::StateFilterLinear) {
-        samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipBase;
+        samplerInfo.filter.magnification = Pal::XyFilterLinear;
+        samplerInfo.filter.minification = Pal::XyFilterLinear;
+        samplerInfo.filter.zFilter = Pal::ZFilterLinear;
    }

    if (mipFilter == CL_FILTER_NEAREST) {
-        if (state == amd::Sampler::StateFilterLinear) {
-            samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipPoint;
-        }
-        else {
-            samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipPoint;
-        }
+        samplerInfo.filter.mipFilter = Pal::MipFilterPoint;
    }
    else if (mipFilter == CL_FILTER_LINEAR) {
-        if (state == amd::Sampler::StateFilterLinear) {
-            samplerInfo.filter = Pal::TexFilter::MagLinearMinLinearMipLinear;
-        }
-        else {
-            samplerInfo.filter = Pal::TexFilter::MagPointMinPointMipLinear;
-        }
+        samplerInfo.filter.mipFilter = Pal::MipFilterLinear;
    }

    iDev()->CreateSamplerSrds(1, &samplerInfo, hwState);
@@ -91,10 +91,10 @@ public:
    //! Needed for OpenGL objects on CPU device

    virtual bool bindExternalDevice(
-        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+        uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }

    virtual bool unbindExternalDevice(
-        intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
+        uint flags, void* pDevice, void* pContext, bool validateOnly) { return true; }

    //! Releases non-blocking map target memory
    virtual void freeMapTarget(amd::Memory& mem, void* target) {}
@@ -369,17 +369,11 @@ public:

    //! Attempt to bind with external graphics API's device/context
    virtual bool bindExternalDevice(
-        intptr_t type,
-        void* pDevice,
-        void* pContext,
-        bool validateOnly);
+        uint flags, void* pDevice, void* pContext, bool validateOnly);

    //! Attempt to unbind with external graphics API's device/context
    virtual bool unbindExternalDevice(
-        intptr_t type,
-        void* pDevice,
-        void* pContext,
-        bool validateOnly);
+        uint flags, void* pDevice, void* pContext, bool validateOnly);

    //! Validates kernel before execution
    virtual bool validateKernel(
@@ -387,40 +387,49 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
    if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
        return false;
    }
-    code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
-    Resource::MemoryType    type = Resource::RemoteUSWC;
-    if (flags_.internalKernel_) {
-        type = Resource::RemoteUSWC;
-    }
-    // Initialize kernel ISA code
-    if (code_ && code_->create(type)) {
-        address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
-        // Copy only amd_kernel_code_t
-        memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_);
-        code_->unmap(nullptr);
-    }
-    else {
-        LogError("Failed to allocate ISA code!");
-        return false;
+    // Allocate HW resources for the real program only
+    if (!prog().isNull()) {
+        code_ = new Memory(dev(), amd::alignUp(codeSize_, akc_align));
+        Resource::MemoryType    type = Resource::RemoteUSWC;
+        if (flags_.internalKernel_) {
+            type = Resource::RemoteUSWC;
+        }
+        // Initialize kernel ISA code
+        if (code_ && code_->create(type)) {
+            address cpuCodePtr = static_cast<address>(code_->map(nullptr, Resource::WriteOnly));
+            // Copy only amd_kernel_code_t
+            memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_);
+            code_->unmap(nullptr);
+        }
+        else {
+            LogError("Failed to allocate ISA code!");
+            return false;
+        }
    }

    assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
        "Scratch must be DWORD aligned");
    workGroupInfo_.scratchRegs_ =
        amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
-/*
-    workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
-    workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
-    workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
-    workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
-*/
    workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
    workGroupInfo_.localMemSize_ =
    workGroupInfo_.usedLDSSize_ = akc->workgroup_group_segment_byte_size;
    workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
    workGroupInfo_.usedStackSize_ = 0;
    workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
-    
+
+    if (!prog().isNull()) {
+        workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs;
+        workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs;
+        workGroupInfo_.preferredSizeMultiple_ =
+        workGroupInfo_.wavefrontPerSIMD_ =  dev().properties().gfxipProperties.shaderCore.wavefrontSize;
+    }
+    else {
+        workGroupInfo_.availableSGPRs_ = 104;
+        workGroupInfo_.availableVGPRs_ = 256;
+        workGroupInfo_.preferredSizeMultiple_ =
+        workGroupInfo_.wavefrontPerSIMD_ = 64;
+    }
    return true;
 }

@@ -633,10 +642,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
        }
    }

-    // Allocate HW resources for the real program only
-    if (!prog().isNull()) {
-        aqlCreateHWInfo(sym);
-    }
+    aqlCreateHWInfo(sym);

    // Pull out metadata from the ELF
    size_t sizeOfArgList;
@@ -153,7 +153,7 @@ public:
        { return cpuAqlCode_->workgroup_group_segment_byte_size; }

    //! Returns pointer on CPU to AQL code info
-    const void* cpuAqlCode() const { return cpuAqlCode_; }
+    const amd_kernel_code_t* cpuAqlCode() const { return cpuAqlCode_; }

    //! Returns memory object with AQL code
    pal::Memory* gpuAqlCode() const { return code_; }
@@ -505,7 +505,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
    hsa_agent_t agent;
    agent.handle = 1;
    if (!isNull() && hsaLoad) {
-        executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
+        executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL);
        if (executable_ == nullptr) {
            buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
            return false;
@@ -55,6 +55,11 @@ public:
    void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
        hsa_agent_t agent, void* seg, size_t offset) override;

+    void* SegmentHostAddress(amdgpu_hsa_elf_segment_t segment,
+        hsa_agent_t agent, void* seg, size_t offset) override {
+      return nullptr;
+    }
+
    bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
        hsa_agent_t agent, void* seg, size_t size) override { return false; }

@@ -43,6 +43,7 @@ VirtualGPU::Queue::Create(
    Pal::QueueCreateInfo        qCreateInfo = {};
    qCreateInfo.engineType = queueType;
    qCreateInfo.engineIndex = engineIdx;
+    qCreateInfo.aqlQueue = true;

    // Find queue object size
    size_t qSize = palDev->GetQueueSize(qCreateInfo, &result);
@@ -181,8 +182,10 @@ VirtualGPU::Queue::flush()
            memRef.push_back(it->first);
        }
    }
+
    if (memRef.size() != 0) {
-        iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_);
+        iDev_->AddGpuMemoryReferences(memRef.size(), &memRef[0], iQueue_,
+             Pal::GpuMemoryRefCantTrim);
    }

    // Submit command buffer to OS
@@ -1982,12 +1985,12 @@ VirtualGPU::submitKernelInternal(
        eventBegin(MainEngine);
        if (nullptr == scratch) {
            iCmd()->CmdDispatchAql(aqlPkt, 0, 0, 0,
-                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
        }
        else {
            iCmd()->CmdDispatchAql(aqlPkt, scratch->memObj_->vmAddress(),
                scratch->size_, scratch->offset_,
-                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo, 0x3ff);
+                hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), 0x3ff);
        }
        eventEnd(MainEngine, gpuEvent);

@@ -69,7 +69,8 @@ public:

        void addMemRef(Pal::IGpuMemory* iMem) const
        {
-            iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
+            iDev_->AddGpuMemoryReferences(1, &iMem, NULL,
+                Pal::GpuMemoryRefCantTrim);
        }
        void removeMemRef(Pal::IGpuMemory* iMem) const
        {