From 692a1b9f9a35dfd5b8443bf3b7e88bfb06d0918b Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Tue, 21 Oct 2014 18:07:39 -0400
Subject: [PATCH] P4 to Git Change 1089823 by gandryey@gera-dev-w7 on
 2014/10/21 17:52:16

	ECR #304775 - Move OCL runtime to the latest HSA1.0 spec

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/brig_loader.cpp#14 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/build/Makefile.gpu#57 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#115 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#471 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#267 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#102 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#181 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#335 edit
---
 rocclr/runtime/device/gpu/gpudefs.hpp    |  6 ++
 rocclr/runtime/device/gpu/gpudevice.cpp  |  4 +-
 rocclr/runtime/device/gpu/gpukernel.cpp  | 70 ++++++++++++------------
 rocclr/runtime/device/gpu/gpukernel.hpp  |  4 +-
 rocclr/runtime/device/gpu/gpuprogram.cpp | 34 ++++++------
 rocclr/runtime/device/gpu/gpusched.hpp   |  4 +-
 rocclr/runtime/device/gpu/gpuvirtual.cpp | 23 ++++----
 7 files changed, 76 insertions(+), 69 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpudefs.hpp b/rocclr/runtime/device/gpu/gpudefs.hpp
index da68bd8600..8a3d271c1a 100644
--- a/rocclr/runtime/device/gpu/gpudefs.hpp
+++ b/rocclr/runtime/device/gpu/gpudefs.hpp
@@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff;
 //! Alignment restriciton for the pinned memory
 const static size_t PinnedMemoryAlignment = 4 * Ki;
 
+//! HSA path specific defines for images
+const static uint HsaImageObjectSize = 48;
+const static uint HsaImageObjectAlignment = 16;
+const static uint HsaSamplerObjectSize = 32;
+const static uint HsaSamplerObjectAlignment = 16;
+
 //! Defines all supported ASIC families
 enum AsicFamilies {
     Family7xx,
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 004fc66b58..159c916109 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
 
     // Allocate SRD manager
     srdManager_ = new SrdManager(*this,
-        std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
+        std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
     if (srdManager_ == NULL) {
         return false;
     }
@@ -2509,7 +2509,7 @@ Sampler::create(
     if (0 == hwSrd_) {
         return false;
     }
-    dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
+    dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
     return true;
 }
 
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 4417275d52..ac166aa1f5 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3791,7 +3791,10 @@ WriteAqlArg(
     *dst += size;
 }
 
-HsaAqlDispatchPacket*
+const hsa_packet_header_t DISPATCH_PACKET_HEADER = {
+    HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 };
+
+hsa_dispatch_packet_t*
 HSAILKernel::loadArguments(
     VirtualGPU&                     gpu,
     const amd::Kernel&              kernel,
@@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments(
             if (dev().settings().hsailDirectSRD_) {
                 // Image arguments are of size 48 bytes and aligned to 16 bytes
                 WriteAqlArg(&aqlArgBuf, image->hwState(),
-                    HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+                    HsaImageObjectSize, HsaImageObjectAlignment);
             }
             else {
                 //! \note Special case for the image views.
@@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments(
                 //! this view without a wait for SRD resource.
                 if (image->memoryType() == Resource::ImageView) {
                     // Copy the current structre into CB1
-                    memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE);
+                    memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
                     ConstBuffer* cb = gpu.constBufs_[1];
-                    cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE);
+                    cb->uploadDataToHw(HsaImageObjectSize);
                     // Then use a pointer in aqlArgBuffer to CB1
                     uint64_t srd = cb->vmAddress() + cb->wrtOffset();
                     WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
@@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments(
                     (sampler->getDeviceSampler(dev()));
             if (dev().settings().hsailDirectSRD_) {
                 WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
-                    HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
+                    HsaSamplerObjectSize, HsaSamplerObjectAlignment);
             }
             else {
                 uint64_t srd = gpuSampler->hwSrd();
@@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments(
         "Size and the number of arguments don't match!");
     uint  argBufSize = amd::alignUp(
         static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
-    HsaAqlDispatchPacket*   aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
+    hsa_dispatch_packet_t*   hsaDisp = reinterpret_cast<hsa_dispatch_packet_t*>(
         gpu.cb(0)->sysMemCopy() + argBufSize);
 
-    amd::NDRange    local(sizes.local());
-    amd::NDRange    global(sizes.global());
-
-    // Initialize the Global, Local and Offset values
-    aqlPkt->dimensions = sizes.dimensions();
-    // Initialize the work grid parameter
-    for (uint idx = 0; idx < 3; idx++) {
-        aqlPkt->grid_size[idx] = 1;
-        aqlPkt->workgroup_size[idx] = 1;
-    }
+    amd::NDRange        local(sizes.local());
+    const amd::NDRange& global = sizes.global();
 
     // Check if runtime has to find local workgroup size
-    findLocalWorkSize(aqlPkt->dimensions, global, local);
-    for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
-        aqlPkt->grid_size[idx] = global[idx];
-        aqlPkt->workgroup_size[idx] = local[idx];
-    }
+    findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
 
-    // Initialize if dispatch should enable profiling
-    aqlPkt->reserved2 = 0;  //config->profile ? 1:0;
+    hsaDisp->header = DISPATCH_PACKET_HEADER;
+    hsaDisp->dimensions = sizes.dimensions();
+    hsaDisp->reserved = 0;
+
+    hsaDisp->workgroup_size_x = local[0];
+    hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
+    hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
+
+    hsaDisp->grid_size_x = global[0];
+    hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
+    hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
+    hsaDisp->reserved2 = 0;
 
     // Initialize kernel ISA and execution buffer requirements
-    aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
-    aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
-    aqlPkt->private_segment_size_bytes = spillSegSize();
-
-    // Initialize cache flush configuration for the dispatch
-    //! @todo Currently not used in emulation
-    aqlPkt->barrier = 1;
-    aqlPkt->release_fence_scope = 1;
-    aqlPkt->acquire_fence_scope = 2;
-    aqlPkt->invalidate_instruction_cache = 1;
+    hsaDisp->private_segment_size   = spillSegSize();
+    hsaDisp->group_segment_size     = ldsAddress - ldsSize();
+    hsaDisp->kernel_object_address  = gpuAqlCode()->vmAddress();
 
     ConstBuffer* cb = gpu.constBufs_[0];
-    cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
+    cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t));
     uint64_t argList = cb->vmAddress() + cb->wrtOffset();
-    aqlPkt->kernel_arg_address = argList;
+
+    hsaDisp->kernarg_address = argList;
+    hsaDisp->reserved3 = 0;
+    hsaDisp->completion_signal = 0;
+
     memList.push_back(cb);
     memList.push_back(gpuAqlCode());
     if (NULL != prog().globalStore()) {
@@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments(
         dev().srds().fillResourceList(memList);
     }
 
-    return aqlPkt;
+    return hsaDisp;
 }
 
 } // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 5e5be612a9..36c1b1c42f 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -15,7 +15,7 @@
 #include "device/gpu/gpuvirtual.hpp"
 #include "sc-hsa/Interface/SCHSAInterface.h"
 #include "device/gpu/gpuprintf.hpp"
-#include "newcore.h"
+#include "hsa.h"
 //! \namespace gpu GPU Device Implementation
 namespace gpu {
 
@@ -885,7 +885,7 @@ public:
 
     //! Returns AQL packet in CPU memory 
     //! if the kerenl arguments were successfully loaded, otherwise NULL
-    HsaAqlDispatchPacket* loadArguments(
+    hsa_dispatch_packet_t* loadArguments(
         VirtualGPU&                     gpu,        //!< Running GPU context
         const amd::Kernel&              kernel,     //!< AMD kernel object
         const amd::NDRangeContainer&    sizes,      //!< NDrange container
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index 9769fafecc..50f1b93391 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -16,7 +16,8 @@
 #include <cstdio>
 #include "utils/options.hpp"
 #include "utils/libUtils.h"
-#include "newcore.h"
+#include "hsa.h"
+#include "hsa_ext_image.h"
 
 extern "C" bool
 ACL_API_ENTRY aclHsaLoader(
@@ -80,8 +81,8 @@ void
 GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
 {
     if (GPU_DIRECT_SRD) {
-        *size = HSA_SAMPLER_OBJECT_SIZE;
-        *alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
+        *size = gpu::HsaSamplerObjectSize;
+        *alignment = gpu::HsaSamplerObjectAlignment;
     }
     else {
         *size = sizeof(uint64_t);
@@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
     uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
 {
     assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
-    HsaSamplerFilterType filter = static_cast<HsaSamplerFilterType>(fltr);
-    HsaSamplerAddressMode boundaryU = static_cast<HsaSamplerAddressMode>(addrU);
+    hsa_ext_sampler_filter_mode_t filter =
+        static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
+    hsa_ext_sampler_addressing_mode_t boundaryU =
+        static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);
 
     uint32_t    state = (unnormalize) ?
         amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
-    if (filter == HSA_SAMP_FILTER_NEAREST) {
+    if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
         state |= amd::Sampler::StateFilterNearest;
     }
-    else if (filter == HSA_SAMP_FILTER_LINEAR) {
+    else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
         state |= amd::Sampler::StateFilterLinear;
     }
     switch (boundaryU) {
-    case HSA_SAMP_ADDRESS_CLAMPEDGE:
+    case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE:
         state |= amd::Sampler::StateAddressClampToEdge;
         break;
-    case HSA_SAMP_ADDRESS_CLAMPBORDER:
+    case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER:
         state |= amd::Sampler::StateAddressClamp;
         break;
-    case HSA_SAMP_ADDRESS_WRAP:
+    case HSA_EXT_SAMPLER_ADDRESSING_REPEAT:
         state |= amd::Sampler::StateAddressRepeat;
         break;
-    case HSA_SAMP_ADDRESS_MIRROR:
+    case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT:
         state |= amd::Sampler::StateAddressMirroredRepeat;
         break;
-    case HSA_SAMP_ADDRESS_MIRRORONCE:
-    case HSA_SAMP_ADDRESS_NONE:
+    case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED:
     default:
         break;
     }
 
     gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
     if (prog->dev().settings().hsailDirectSRD_) {
-        char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
+        char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
         if (!pCPUbuf) {
           assert(false);
           return;
         }
-        prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
-        DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
+        prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
+        DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
         delete pCPUbuf;
     }
     else {
diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index 242580dd48..b4eb224633 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -4,7 +4,7 @@
 #ifndef GPUSCHED_HPP_
 #define GPUSCHED_HPP_
 
-#include "newcore.h"
+#include "hsa.h"
 
 namespace gpu {
 
@@ -47,7 +47,7 @@ struct AmdAqlWrap {
     uint64_t wait_list;     //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
     uint32_t wait_num;      //!< [LWO/SRO] The number of cl_event_wait objects    
     uint32_t reserved[5];   //!< For the future usage
-    HsaAqlDispatchPacket aql;  //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
+    hsa_dispatch_packet_t aql;  //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
 };
 
 struct AmdEvent {
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 272273b476..598fb5c325 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -14,7 +14,7 @@
 #include "device/gpu/gputhreadtrace.hpp"
 #include "device/gpu/gputimestamp.hpp"
 #include "device/gpu/gpublit.hpp"
-#include "newcore.h"
+#include "hsa.h"
 #include "sc-hsa/Interface/SCHSAInterface.h"
 #include <fstream>
 #include <sstream>
@@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
     AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
     for (uint i = 0; i < numSlots; ++i) {
         uint64_t argStart = vaBase + argOffs + i * singleArgSize;
-        slots[i].aql.kernel_arg_address = argStart;
+        slots[i].aql.kernarg_address = argStart;
         slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
     }
     // Upload data back to local memory
@@ -576,7 +576,8 @@ VirtualGPU::create(
 bool
 VirtualGPU::allocHsaQueueMem()
 {
-    amd_queue_t queue = {0};
+    amd_queue_t queue;
+    memset(&queue, 0, sizeof(queue));
     hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue));
     if (hsaQueueMem_ == NULL) {
         return false;
@@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA(
     }
 
     // Program the kernel arguments for the GPU execution
-    HsaAqlDispatchPacket*   aqlPkt =
+    hsa_dispatch_packet_t*  aqlPkt =
         hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
         vmDefQueue, &vmParentWrap, memList);
     if (NULL == aqlPkt) {
@@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA(
                         print << "\tState: " << eventD->state <<
                                  "; Counter: " << eventD->counter << "\n";
                     }
-                    print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
-                    print << wraps[i].aql.workgroup_size[1] << ", ";
-                    print << wraps[i].aql.workgroup_size[2] << "]\n";
-                    print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", ";
-                    print << wraps[i].aql.grid_size[1] << ", ";
-                    print << wraps[i].aql.grid_size[2] << "]\n";
+                    print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+                    print << wraps[i].aql.workgroup_size_y << ", ";
+                    print << wraps[i].aql.workgroup_size_z << "]\n";
+                    print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+                    print << wraps[i].aql.grid_size_y << ", ";
+                    print << wraps[i].aql.grid_size_z << "]\n";
 
                     uint64_t* kernels = (uint64_t*)(
                         const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
@@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA(
                         printf("Error: couldn't find child kernel!\n");
                         continue;
                     }
-                    uint offsArg = wraps[i].aql.kernel_arg_address -
+                    uint offsArg = wraps[i].aql.kernarg_address -
                         gpuDefQueue->virtualQueue_->vmAddress();
                     address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
                     print << "Kernel: " << child->name() << "\n";