P4 to Git Change 1196584 by gandryey@gera-dev-w7 on 2015/10/02 14:01:41

SWDEV-77880 - Enable memory dependency tracking for HSAIL path - Use read/write attribute provided by the compiler for async execution optimization Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#299 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#118 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#384 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#136 edit
2015-10-02 14:19:07 -04:00
commit f8648f412d
@@ -2893,6 +2893,23 @@ GetHSAILArgAlignment(const aclArgData* argInfo)
    }
 }

+inline static HSAIL_ACCESS_TYPE
+GetHSAILArgAccessType(const aclArgData* argInfo)  //!< Translates the access type recorded in argInfo into HSAIL_ACCESS_TYPE
+{
+    if (argInfo->type == ARG_TYPE_POINTER) {  // Only pointer arguments carry an access type
+        switch (argInfo->arg.pointer.type) {
+        case ACCESS_TYPE_RO:
+            return HSAIL_ACCESS_TYPE_RO;
+        case ACCESS_TYPE_WO:
+            return HSAIL_ACCESS_TYPE_WO;
+        case ACCESS_TYPE_RW:
+        default:  // Any value other than RO/WO is reported as read-write
+            return HSAIL_ACCESS_TYPE_RW;
+        }
+    }
+    return HSAIL_ACCESS_TYPE_NONE;  // Non-pointer arguments report no access type
+}
+
 inline static HSAIL_ADDRESS_QUALIFIER
 GetHSAILAddrQual(const aclArgData* argInfo)
 {
@@ -3268,6 +3285,7 @@ HSAILKernel::initHsailArgs(const aclArgData* aclArg)
             (aclArg->arg.value.data != DATATYPE_struct)) ?
             aclArg->arg.value.numElements : 1;
        arg->alignment_ = GetHSAILArgAlignment(aclArg);
+        arg->access_    = GetHSAILArgAccessType(aclArg);
        offset += GetHSAILArgSize(aclArg);
        arguments_.push_back(arg);
    }
@@ -807,6 +807,12 @@ enum HSAIL_DATA_TYPE{
    HSAIL_DATATYPE_MAX_TYPES
 };

+enum HSAIL_ACCESS_TYPE {  //!< Access type of a kernel argument
+    HSAIL_ACCESS_TYPE_NONE = 0,  //!< No access type (reported for non-pointer arguments)
+    HSAIL_ACCESS_TYPE_RO,  //!< Read-only; lets dependency validation treat the argument as read-only
+    HSAIL_ACCESS_TYPE_WO,  //!< Presumably write-only (mapped from ACCESS_TYPE_WO) - TODO confirm
+    HSAIL_ACCESS_TYPE_RW  //!< Read-write; also the fallback for unrecognized pointer access types
+};

 class HSAILKernel : public device::Kernel
 {
@@ -822,6 +828,7 @@ public:
        HSAIL_ADDRESS_QUALIFIER addrQual_;  //!< Address qualifier of the argument
        HSAIL_DATA_TYPE dataType_;  //!< The type of data
        uint        numElem_;       //!< Number of elements
+        HSAIL_ACCESS_TYPE access_;  //!< Access type for the argument
    };

    // Max number of possible extra (hidden) kernel arguments
@@ -1665,54 +1665,17 @@ VirtualGPU::submitKernelInternalHSA(
    std::vector<const Memory*>    memList;

    bool printfEnabled = (hsaKernel.printfInfo().size() > 0) ? true:false;
-    if (!printfDbgHSA().init(*this, printfEnabled )){
+    if (!printfDbgHSA().init(*this, printfEnabled )) {
        LogError( "Printf debug buffer initialization failed!");
        return false;
    }

-    bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
-    bool supportFineGrainedSystem = deviceSupportFGS;
-    FGSStatus status = kernel.parameters().getSvmSystemPointersSupport();
-    switch (status) {
-        case FGS_YES:
-            if (!deviceSupportFGS) {
-                return false;
-            }
-            supportFineGrainedSystem = true;
-            break;
-        case FGS_NO:
-            supportFineGrainedSystem = false;
-            break;
-        case FGS_DEFAULT:
-        default:
-            break;
+    // Check memory dependency and SVM objects
+    if (!processMemObjectsHSA(kernel, parameters, nativeMem, &memList)) {
+        LogError("Wrong memory objects!");
+        return false;
    }

-    size_t count = kernel.parameters().getNumberOfSvmPtr();
-    size_t execInfoOffset = kernel.parameters().getExecInfoOffset();
-    amd::Memory* memory = NULL;
-    //get svm non arugment information
-    void* const* svmPtrArray = reinterpret_cast<void* const*>(parameters + execInfoOffset);
-    for (size_t i = 0; i < count; i++) {
-        memory =  amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
-        if (NULL == memory) {
-            if (!supportFineGrainedSystem) {
-                return false;
-            }
-        }
-        else {
-            Memory* gpuMemory = dev().getGpuMemory(memory);
-            if (NULL != gpuMemory) {
-                memList.push_back(gpuMemory);
-            }
-            else {
-                return false;
-            }
-        }
-    }
-
-    // Check memory dependency and cache coherency
-    processMemObjectsHSA(kernel, parameters, nativeMem);
    cal_.memCount_ = 0;

    if (hsaKernel.dynamicParallelism()) {
@@ -3217,22 +3180,79 @@ VirtualGPU::profileEvent(EngineType engine, bool type) const
    }
 }

-void
+bool
 VirtualGPU::processMemObjectsHSA(
    const amd::Kernel&  kernel,
    const_address       params,
-    bool                nativeMem)
+    bool                nativeMem,
+    std::vector<const Memory*>* memList)
 {
    static const bool NoAlias = true;
    const HSAILKernel& hsaKernel = static_cast<const HSAILKernel&>
        (*(kernel.getDeviceKernel(dev(), NoAlias)));
+    const amd::KernelSignature& signature = kernel.signature();
+    const amd::KernelParameters& kernelParams = kernel.parameters();

    // Mark the tracker with a new kernel,
    // so we can avoid checks of the aliased objects
    memoryDependency().newKernel();

-    const amd::KernelSignature& signature = kernel.signature();
-    const amd::KernelParameters& kernelParams = kernel.parameters();
+    bool deviceSupportFGS = 0 != (dev().info().svmCapabilities_ & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM);
+    bool supportFineGrainedSystem = deviceSupportFGS;
+    FGSStatus status = kernelParams.getSvmSystemPointersSupport();
+    switch (status) {
+        case FGS_YES:
+            if (!deviceSupportFGS) {
+                return false;
+            }
+            supportFineGrainedSystem = true;
+            break;
+        case FGS_NO:
+            supportFineGrainedSystem = false;
+            break;
+        case FGS_DEFAULT:
+        default:
+            break;
+    }
+
+    size_t count = kernelParams.getNumberOfSvmPtr();
+    size_t execInfoOffset = kernelParams.getExecInfoOffset();
+    bool sync = true;
+
+    amd::Memory* memory = NULL;
+    //get svm non argument information
+    void* const* svmPtrArray =
+        reinterpret_cast<void* const*>(params + execInfoOffset);
+    for (size_t i = 0; i < count; i++) {
+        memory =  amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
+        if (NULL == memory) {
+            if (!supportFineGrainedSystem) {
+                return false;
+            }
+            else if (sync) {
+                flushCUCaches();
+                // Clear memory dependency state
+                const static bool All = true;
+                memoryDependency().clear(!All);
+            }
+        }
+        else {
+            Memory* gpuMemory = dev().getGpuMemory(memory);
+            if (NULL != gpuMemory) {
+                // Synchronize data with other memory instances if necessary
+                gpuMemory->syncCacheFromHost(*this);
+
+                const static bool IsReadOnly = false;
+                // Validate SVM passed in the non argument list
+                memoryDependency().validate(*this, gpuMemory, IsReadOnly);
+
+                memList->push_back(gpuMemory);
+            }
+            else {
+                return false;
+            }
+        }
+    }

    // Check all parameters for the current kernel
    for (size_t i = 0; i < signature.numParameters(); ++i) {
@@ -3248,9 +3268,10 @@ VirtualGPU::processMemObjectsHSA(
                svmMem = amd::SvmManager::FindSvmBuffer(
                    *reinterpret_cast<void* const*>(params + desc.offset_));
                if (!svmMem) {
-                    //!\todo Do we have to sync cache coherency or wait for SDMA?
                    flushCUCaches();
-                    break;
+                    // Clear memory dependency state
+                    const static bool All = true;
+                    memoryDependency().clear(!All);
                }
            }

@@ -3271,10 +3292,11 @@ VirtualGPU::processMemObjectsHSA(
            }

            if (memory != NULL) {
-                //!@todo The code below can handle images only,
-                //! but the qualifier is broken anyway
+                // Check image
                readOnly = (desc.accessQualifier_ ==
                    CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
+                // Check buffer
+                readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
                // Validate memory for a dependency in the queue
                memoryDependency().validate(*this, memory, readOnly);
            }
@@ -3286,6 +3308,8 @@ VirtualGPU::processMemObjectsHSA(
        // Validate global store for a dependency in the queue
        memoryDependency().validate(*this, mem, IsReadOnly);
    }
+
+    return true;
 }

 amd::Memory*
@@ -463,10 +463,11 @@ private:
        );

    //! Detects memory dependency for HSAIL kernels and flushes caches
-    void processMemObjectsHSA(
+    bool processMemObjectsHSA(
        const amd::Kernel&  kernel,     //!< AMD kernel object for execution
        const_address       params,     //!< Pointer to the param's store
-        bool                nativeMem   //!< Native memory objects
+        bool                nativeMem,  //!< Native memory objects
+        std::vector<const Memory*>* memList //!< Memory list for KMD tracking
        );

    //! Common function for fill memory used by both svm Fill and non-svm fill