P4 to Git Change 1089823 by gandryey@gera-dev-w7 on 2014/10/21 17:52:16

ECR #304775 - Move OCL runtime to the latest HSA1.0 spec Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/brig_loader.cpp#14 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/build/Makefile.gpu#57 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#115 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#471 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#267 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#102 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#181 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#335 edit
2014-10-21 18:07:39 -04:00
@@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff;
 //! Alignment restriciton for the pinned memory
 const static size_t PinnedMemoryAlignment = 4 * Ki;

+//! HSA path specific defines for images
+const static uint HsaImageObjectSize = 48;
+const static uint HsaImageObjectAlignment = 16;
+const static uint HsaSamplerObjectSize = 32;
+const static uint HsaSamplerObjectAlignment = 16;
+
 //! Defines all supported ASIC families
 enum AsicFamilies {
    Family7xx,
@@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)

    // Allocate SRD manager
    srdManager_ = new SrdManager(*this,
-        std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
+        std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
    if (srdManager_ == NULL) {
        return false;
    }
@@ -2509,7 +2509,7 @@ Sampler::create(
    if (0 == hwSrd_) {
        return false;
    }
-    dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
+    dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
    return true;
 }

@@ -3791,7 +3791,10 @@ WriteAqlArg(
    *dst += size;
 }

-HsaAqlDispatchPacket*
+const hsa_packet_header_t DISPATCH_PACKET_HEADER = {
+    HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 };
+
+hsa_dispatch_packet_t*
 HSAILKernel::loadArguments(
    VirtualGPU&                     gpu,
    const amd::Kernel&              kernel,
@@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments(
            if (dev().settings().hsailDirectSRD_) {
                // Image arguments are of size 48 bytes and aligned to 16 bytes
                WriteAqlArg(&aqlArgBuf, image->hwState(),
-                    HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+                    HsaImageObjectSize, HsaImageObjectAlignment);
            }
            else {
                //! \note Special case for the image views.
@@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments(
                //! this view without a wait for SRD resource.
                if (image->memoryType() == Resource::ImageView) {
                    // Copy the current structre into CB1
-                    memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE);
+                    memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
                    ConstBuffer* cb = gpu.constBufs_[1];
-                    cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE);
+                    cb->uploadDataToHw(HsaImageObjectSize);
                    // Then use a pointer in aqlArgBuffer to CB1
                    uint64_t srd = cb->vmAddress() + cb->wrtOffset();
                    WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
@@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments(
                    (sampler->getDeviceSampler(dev()));
            if (dev().settings().hsailDirectSRD_) {
                WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
-                    HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
+                    HsaSamplerObjectSize, HsaSamplerObjectAlignment);
            }
            else {
                uint64_t srd = gpuSampler->hwSrd();
@@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments(
        "Size and the number of arguments don't match!");
    uint  argBufSize = amd::alignUp(
        static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
-    HsaAqlDispatchPacket*   aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
+    hsa_dispatch_packet_t*   hsaDisp = reinterpret_cast<hsa_dispatch_packet_t*>(
        gpu.cb(0)->sysMemCopy() + argBufSize);

-    amd::NDRange    local(sizes.local());
-    amd::NDRange    global(sizes.global());
-
-    // Initialize the Global, Local and Offset values
-    aqlPkt->dimensions = sizes.dimensions();
-    // Initialize the work grid parameter
-    for (uint idx = 0; idx < 3; idx++) {
-        aqlPkt->grid_size[idx] = 1;
-        aqlPkt->workgroup_size[idx] = 1;
-    }
+    amd::NDRange        local(sizes.local());
+    const amd::NDRange& global = sizes.global();

    // Check if runtime has to find local workgroup size
-    findLocalWorkSize(aqlPkt->dimensions, global, local);
-    for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
-        aqlPkt->grid_size[idx] = global[idx];
-        aqlPkt->workgroup_size[idx] = local[idx];
-    }
+    findLocalWorkSize(sizes.dimensions(), sizes.global(), local);

-    // Initialize if dispatch should enable profiling
-    aqlPkt->reserved2 = 0;  //config->profile ? 1:0;
+    hsaDisp->header = DISPATCH_PACKET_HEADER;
+    hsaDisp->dimensions = sizes.dimensions();
+    hsaDisp->reserved = 0;
+
+    hsaDisp->workgroup_size_x = local[0];
+    hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
+    hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
+
+    hsaDisp->grid_size_x = global[0];
+    hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
+    hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
+    hsaDisp->reserved2 = 0;

    // Initialize kernel ISA and execution buffer requirements
-    aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
-    aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
-    aqlPkt->private_segment_size_bytes = spillSegSize();
-
-    // Initialize cache flush configuration for the dispatch
-    //! @todo Currently not used in emulation
-    aqlPkt->barrier = 1;
-    aqlPkt->release_fence_scope = 1;
-    aqlPkt->acquire_fence_scope = 2;
-    aqlPkt->invalidate_instruction_cache = 1;
+    hsaDisp->private_segment_size   = spillSegSize();
+    hsaDisp->group_segment_size     = ldsAddress - ldsSize();
+    hsaDisp->kernel_object_address  = gpuAqlCode()->vmAddress();

    ConstBuffer* cb = gpu.constBufs_[0];
-    cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
+    cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t));
    uint64_t argList = cb->vmAddress() + cb->wrtOffset();
-    aqlPkt->kernel_arg_address = argList;
+
+    hsaDisp->kernarg_address = argList;
+    hsaDisp->reserved3 = 0;
+    hsaDisp->completion_signal = 0;
+
    memList.push_back(cb);
    memList.push_back(gpuAqlCode());
    if (NULL != prog().globalStore()) {
@@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments(
        dev().srds().fillResourceList(memList);
    }

-    return aqlPkt;
+    return hsaDisp;
 }

 } // namespace gpu
@@ -15,7 +15,7 @@
 #include "device/gpu/gpuvirtual.hpp"
 #include "sc-hsa/Interface/SCHSAInterface.h"
 #include "device/gpu/gpuprintf.hpp"
-#include "newcore.h"
+#include "hsa.h"
 //! \namespace gpu GPU Device Implementation
 namespace gpu {

@@ -885,7 +885,7 @@ public:

    //! Returns AQL packet in CPU memory 
    //! if the kerenl arguments were successfully loaded, otherwise NULL
-    HsaAqlDispatchPacket* loadArguments(
+    hsa_dispatch_packet_t* loadArguments(
        VirtualGPU&                     gpu,        //!< Running GPU context
        const amd::Kernel&              kernel,     //!< AMD kernel object
        const amd::NDRangeContainer&    sizes,      //!< NDrange container
@@ -16,7 +16,8 @@
 #include <cstdio>
 #include "utils/options.hpp"
 #include "utils/libUtils.h"
-#include "newcore.h"
+#include "hsa.h"
+#include "hsa_ext_image.h"

 extern "C" bool
 ACL_API_ENTRY aclHsaLoader(
@@ -80,8 +81,8 @@ void
 GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
 {
    if (GPU_DIRECT_SRD) {
-        *size = HSA_SAMPLER_OBJECT_SIZE;
-        *alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
+        *size = gpu::HsaSamplerObjectSize;
+        *alignment = gpu::HsaSamplerObjectAlignment;
    }
    else {
        *size = sizeof(uint64_t);
@@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
    uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
 {
    assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
-    HsaSamplerFilterType filter = static_cast<HsaSamplerFilterType>(fltr);
-    HsaSamplerAddressMode boundaryU = static_cast<HsaSamplerAddressMode>(addrU);
+    hsa_ext_sampler_filter_mode_t filter =
+        static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
+    hsa_ext_sampler_addressing_mode_t boundaryU =
+        static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);

    uint32_t    state = (unnormalize) ?
        amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
-    if (filter == HSA_SAMP_FILTER_NEAREST) {
+    if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
        state |= amd::Sampler::StateFilterNearest;
    }
-    else if (filter == HSA_SAMP_FILTER_LINEAR) {
+    else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
        state |= amd::Sampler::StateFilterLinear;
    }
    switch (boundaryU) {
-    case HSA_SAMP_ADDRESS_CLAMPEDGE:
+    case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE:
        state |= amd::Sampler::StateAddressClampToEdge;
        break;
-    case HSA_SAMP_ADDRESS_CLAMPBORDER:
+    case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER:
        state |= amd::Sampler::StateAddressClamp;
        break;
-    case HSA_SAMP_ADDRESS_WRAP:
+    case HSA_EXT_SAMPLER_ADDRESSING_REPEAT:
        state |= amd::Sampler::StateAddressRepeat;
        break;
-    case HSA_SAMP_ADDRESS_MIRROR:
+    case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT:
        state |= amd::Sampler::StateAddressMirroredRepeat;
        break;
-    case HSA_SAMP_ADDRESS_MIRRORONCE:
-    case HSA_SAMP_ADDRESS_NONE:
+    case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED:
    default:
        break;
    }

    gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
    if (prog->dev().settings().hsailDirectSRD_) {
-        char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
+        char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
        if (!pCPUbuf) {
          assert(false);
          return;
        }
-        prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
-        DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
+        prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
+        DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
        delete pCPUbuf;
    }
    else {
@@ -4,7 +4,7 @@
 #ifndef GPUSCHED_HPP_
 #define GPUSCHED_HPP_

-#include "newcore.h"
+#include "hsa.h"

 namespace gpu {

@@ -47,7 +47,7 @@ struct AmdAqlWrap {
    uint64_t wait_list;     //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
    uint32_t wait_num;      //!< [LWO/SRO] The number of cl_event_wait objects    
    uint32_t reserved[5];   //!< For the future usage
-    HsaAqlDispatchPacket aql;  //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
+    hsa_dispatch_packet_t aql;  //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
 };

 struct AmdEvent {
@@ -14,7 +14,7 @@
 #include "device/gpu/gputhreadtrace.hpp"
 #include "device/gpu/gputimestamp.hpp"
 #include "device/gpu/gpublit.hpp"
-#include "newcore.h"
+#include "hsa.h"
 #include "sc-hsa/Interface/SCHSAInterface.h"
 #include <fstream>
 #include <sstream>
@@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
    AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
    for (uint i = 0; i < numSlots; ++i) {
        uint64_t argStart = vaBase + argOffs + i * singleArgSize;
-        slots[i].aql.kernel_arg_address = argStart;
+        slots[i].aql.kernarg_address = argStart;
        slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
    }
    // Upload data back to local memory
@@ -576,7 +576,8 @@ VirtualGPU::create(
 bool
 VirtualGPU::allocHsaQueueMem()
 {
-    amd_queue_t queue = {0};
+    amd_queue_t queue;
+    memset(&queue, 0, sizeof(queue));
    hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue));
    if (hsaQueueMem_ == NULL) {
        return false;
@@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA(
    }

    // Program the kernel arguments for the GPU execution
-    HsaAqlDispatchPacket*   aqlPkt =
+    hsa_dispatch_packet_t*  aqlPkt =
        hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
        vmDefQueue, &vmParentWrap, memList);
    if (NULL == aqlPkt) {
@@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA(
                        print << "\tState: " << eventD->state <<
                                 "; Counter: " << eventD->counter << "\n";
                    }
-                    print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
-                    print << wraps[i].aql.workgroup_size[1] << ", ";
-                    print << wraps[i].aql.workgroup_size[2] << "]\n";
-                    print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", ";
-                    print << wraps[i].aql.grid_size[1] << ", ";
-                    print << wraps[i].aql.grid_size[2] << "]\n";
+                    print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+                    print << wraps[i].aql.workgroup_size_y << ", ";
+                    print << wraps[i].aql.workgroup_size_z << "]\n";
+                    print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+                    print << wraps[i].aql.grid_size_y << ", ";
+                    print << wraps[i].aql.grid_size_z << "]\n";

                    uint64_t* kernels = (uint64_t*)(
                        const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
@@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA(
                        printf("Error: couldn't find child kernel!\n");
                        continue;
                    }
-                    uint offsArg = wraps[i].aql.kernel_arg_address -
+                    uint offsArg = wraps[i].aql.kernarg_address -
                        gpuDefQueue->virtualQueue_->vmAddress();
                    address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
                    print << "Kernel: " << child->name() << "\n";