From 692a1b9f9a35dfd5b8443bf3b7e88bfb06d0918b Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 21 Oct 2014 18:07:39 -0400
Subject: [PATCH] P4 to Git Change 1089823 by gandryey@gera-dev-w7 on
2014/10/21 17:52:16
ECR #304775 - Move OCL runtime to the latest HSA1.0 spec
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/brig_loader.cpp#14 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/build/Makefile.gpu#57 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#115 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#471 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#267 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#102 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#181 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#335 edit
---
rocclr/runtime/device/gpu/gpudefs.hpp | 6 ++
rocclr/runtime/device/gpu/gpudevice.cpp | 4 +-
rocclr/runtime/device/gpu/gpukernel.cpp | 70 ++++++++++++------------
rocclr/runtime/device/gpu/gpukernel.hpp | 4 +-
rocclr/runtime/device/gpu/gpuprogram.cpp | 34 ++++++------
rocclr/runtime/device/gpu/gpusched.hpp | 4 +-
rocclr/runtime/device/gpu/gpuvirtual.cpp | 23 ++++----
7 files changed, 76 insertions(+), 69 deletions(-)
diff --git a/rocclr/runtime/device/gpu/gpudefs.hpp b/rocclr/runtime/device/gpu/gpudefs.hpp
index da68bd8600..8a3d271c1a 100644
--- a/rocclr/runtime/device/gpu/gpudefs.hpp
+++ b/rocclr/runtime/device/gpu/gpudefs.hpp
@@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff;
//! Alignment restriciton for the pinned memory
const static size_t PinnedMemoryAlignment = 4 * Ki;
+//! HSA path specific defines for images
+const static uint HsaImageObjectSize = 48;
+const static uint HsaImageObjectAlignment = 16;
+const static uint HsaSamplerObjectSize = 32;
+const static uint HsaSamplerObjectAlignment = 16;
+
//! Defines all supported ASIC families
enum AsicFamilies {
Family7xx,
diff --git a/rocclr/runtime/device/gpu/gpudevice.cpp b/rocclr/runtime/device/gpu/gpudevice.cpp
index 004fc66b58..159c916109 100644
--- a/rocclr/runtime/device/gpu/gpudevice.cpp
+++ b/rocclr/runtime/device/gpu/gpudevice.cpp
@@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
// Allocate SRD manager
srdManager_ = new SrdManager(*this,
- std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
+ std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
if (srdManager_ == NULL) {
return false;
}
@@ -2509,7 +2509,7 @@ Sampler::create(
if (0 == hwSrd_) {
return false;
}
- dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
+ dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
return true;
}
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 4417275d52..ac166aa1f5 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3791,7 +3791,10 @@ WriteAqlArg(
*dst += size;
}
-HsaAqlDispatchPacket*
+const hsa_packet_header_t DISPATCH_PACKET_HEADER = {
+ HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 };
+
+hsa_dispatch_packet_t*
HSAILKernel::loadArguments(
VirtualGPU& gpu,
const amd::Kernel& kernel,
@@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments(
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
- HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+ HsaImageObjectSize, HsaImageObjectAlignment);
}
else {
//! \note Special case for the image views.
@@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments(
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
- memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE);
+ memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
- cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE);
+ cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
@@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments(
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
- HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
+ HsaSamplerObjectSize, HsaSamplerObjectAlignment);
}
else {
uint64_t srd = gpuSampler->hwSrd();
@@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments(
"Size and the number of arguments don't match!");
uint argBufSize = amd::alignUp(
static_cast(argsBufferSize()), sizeof(uint32_t));
- HsaAqlDispatchPacket* aqlPkt = reinterpret_cast(
+ hsa_dispatch_packet_t* hsaDisp = reinterpret_cast(
gpu.cb(0)->sysMemCopy() + argBufSize);
- amd::NDRange local(sizes.local());
- amd::NDRange global(sizes.global());
-
- // Initialize the Global, Local and Offset values
- aqlPkt->dimensions = sizes.dimensions();
- // Initialize the work grid parameter
- for (uint idx = 0; idx < 3; idx++) {
- aqlPkt->grid_size[idx] = 1;
- aqlPkt->workgroup_size[idx] = 1;
- }
+ amd::NDRange local(sizes.local());
+ const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
- findLocalWorkSize(aqlPkt->dimensions, global, local);
- for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
- aqlPkt->grid_size[idx] = global[idx];
- aqlPkt->workgroup_size[idx] = local[idx];
- }
+ findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
- // Initialize if dispatch should enable profiling
- aqlPkt->reserved2 = 0; //config->profile ? 1:0;
+ hsaDisp->header = DISPATCH_PACKET_HEADER;
+ hsaDisp->dimensions = sizes.dimensions();
+ hsaDisp->reserved = 0;
+
+ hsaDisp->workgroup_size_x = local[0];
+ hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
+ hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
+
+ hsaDisp->grid_size_x = global[0];
+ hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
+ hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
+ hsaDisp->reserved2 = 0;
// Initialize kernel ISA and execution buffer requirements
- aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
- aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
- aqlPkt->private_segment_size_bytes = spillSegSize();
-
- // Initialize cache flush configuration for the dispatch
- //! @todo Currently not used in emulation
- aqlPkt->barrier = 1;
- aqlPkt->release_fence_scope = 1;
- aqlPkt->acquire_fence_scope = 2;
- aqlPkt->invalidate_instruction_cache = 1;
+ hsaDisp->private_segment_size = spillSegSize();
+ hsaDisp->group_segment_size = ldsAddress - ldsSize();
+ hsaDisp->kernel_object_address = gpuAqlCode()->vmAddress();
ConstBuffer* cb = gpu.constBufs_[0];
- cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
+ cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
- aqlPkt->kernel_arg_address = argList;
+
+ hsaDisp->kernarg_address = argList;
+ hsaDisp->reserved3 = 0;
+ hsaDisp->completion_signal = 0;
+
memList.push_back(cb);
memList.push_back(gpuAqlCode());
if (NULL != prog().globalStore()) {
@@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments(
dev().srds().fillResourceList(memList);
}
- return aqlPkt;
+ return hsaDisp;
}
} // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 5e5be612a9..36c1b1c42f 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -15,7 +15,7 @@
#include "device/gpu/gpuvirtual.hpp"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include "device/gpu/gpuprintf.hpp"
-#include "newcore.h"
+#include "hsa.h"
//! \namespace gpu GPU Device Implementation
namespace gpu {
@@ -885,7 +885,7 @@ public:
//! Returns AQL packet in CPU memory
//! if the kerenl arguments were successfully loaded, otherwise NULL
- HsaAqlDispatchPacket* loadArguments(
+ hsa_dispatch_packet_t* loadArguments(
VirtualGPU& gpu, //!< Running GPU context
const amd::Kernel& kernel, //!< AMD kernel object
const amd::NDRangeContainer& sizes, //!< NDrange container
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index 9769fafecc..50f1b93391 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -16,7 +16,8 @@
#include
#include "utils/options.hpp"
#include "utils/libUtils.h"
-#include "newcore.h"
+#include "hsa.h"
+#include "hsa_ext_image.h"
extern "C" bool
ACL_API_ENTRY aclHsaLoader(
@@ -80,8 +81,8 @@ void
GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
{
if (GPU_DIRECT_SRD) {
- *size = HSA_SAMPLER_OBJECT_SIZE;
- *alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
+ *size = gpu::HsaSamplerObjectSize;
+ *alignment = gpu::HsaSamplerObjectAlignment;
}
else {
*size = sizeof(uint64_t);
@@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
{
assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
- HsaSamplerFilterType filter = static_cast(fltr);
- HsaSamplerAddressMode boundaryU = static_cast(addrU);
+ hsa_ext_sampler_filter_mode_t filter =
+ static_cast(fltr);
+ hsa_ext_sampler_addressing_mode_t boundaryU =
+ static_cast(addrU);
uint32_t state = (unnormalize) ?
amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
- if (filter == HSA_SAMP_FILTER_NEAREST) {
+ if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
state |= amd::Sampler::StateFilterNearest;
}
- else if (filter == HSA_SAMP_FILTER_LINEAR) {
+ else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
state |= amd::Sampler::StateFilterLinear;
}
switch (boundaryU) {
- case HSA_SAMP_ADDRESS_CLAMPEDGE:
+ case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE:
state |= amd::Sampler::StateAddressClampToEdge;
break;
- case HSA_SAMP_ADDRESS_CLAMPBORDER:
+ case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER:
state |= amd::Sampler::StateAddressClamp;
break;
- case HSA_SAMP_ADDRESS_WRAP:
+ case HSA_EXT_SAMPLER_ADDRESSING_REPEAT:
state |= amd::Sampler::StateAddressRepeat;
break;
- case HSA_SAMP_ADDRESS_MIRROR:
+ case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT:
state |= amd::Sampler::StateAddressMirroredRepeat;
break;
- case HSA_SAMP_ADDRESS_MIRRORONCE:
- case HSA_SAMP_ADDRESS_NONE:
+ case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED:
default:
break;
}
gpu::HSAILProgram* prog = reinterpret_cast(userData);
if (prog->dev().settings().hsailDirectSRD_) {
- char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
+ char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
if (!pCPUbuf) {
assert(false);
return;
}
- prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
- DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
+ prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
+ DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
delete pCPUbuf;
}
else {
diff --git a/rocclr/runtime/device/gpu/gpusched.hpp b/rocclr/runtime/device/gpu/gpusched.hpp
index 242580dd48..b4eb224633 100644
--- a/rocclr/runtime/device/gpu/gpusched.hpp
+++ b/rocclr/runtime/device/gpu/gpusched.hpp
@@ -4,7 +4,7 @@
#ifndef GPUSCHED_HPP_
#define GPUSCHED_HPP_
-#include "newcore.h"
+#include "hsa.h"
namespace gpu {
@@ -47,7 +47,7 @@ struct AmdAqlWrap {
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
uint32_t reserved[5]; //!< For the future usage
- HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
+ hsa_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
};
struct AmdEvent {
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 272273b476..598fb5c325 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -14,7 +14,7 @@
#include "device/gpu/gputhreadtrace.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpublit.hpp"
-#include "newcore.h"
+#include "hsa.h"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include
#include
@@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
AmdAqlWrap* slots = reinterpret_cast(&header[1]);
for (uint i = 0; i < numSlots; ++i) {
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
- slots[i].aql.kernel_arg_address = argStart;
+ slots[i].aql.kernarg_address = argStart;
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
}
// Upload data back to local memory
@@ -576,7 +576,8 @@ VirtualGPU::create(
bool
VirtualGPU::allocHsaQueueMem()
{
- amd_queue_t queue = {0};
+ amd_queue_t queue;
+ memset(&queue, 0, sizeof(queue));
hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue));
if (hsaQueueMem_ == NULL) {
return false;
@@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA(
}
// Program the kernel arguments for the GPU execution
- HsaAqlDispatchPacket* aqlPkt =
+ hsa_dispatch_packet_t* aqlPkt =
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
vmDefQueue, &vmParentWrap, memList);
if (NULL == aqlPkt) {
@@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA(
print << "\tState: " << eventD->state <<
"; Counter: " << eventD->counter << "\n";
}
- print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
- print << wraps[i].aql.workgroup_size[1] << ", ";
- print << wraps[i].aql.workgroup_size[2] << "]\n";
- print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", ";
- print << wraps[i].aql.grid_size[1] << ", ";
- print << wraps[i].aql.grid_size[2] << "]\n";
+ print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
+ print << wraps[i].aql.workgroup_size_y << ", ";
+ print << wraps[i].aql.workgroup_size_z << "]\n";
+ print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
+ print << wraps[i].aql.grid_size_y << ", ";
+ print << wraps[i].aql.grid_size_z << "]\n";
uint64_t* kernels = (uint64_t*)(
const_cast(hsaKernel.prog().kernelTable())->map(this));
@@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA(
printf("Error: couldn't find child kernel!\n");
continue;
}
- uint offsArg = wraps[i].aql.kernel_arg_address -
+ uint offsArg = wraps[i].aql.kernarg_address -
gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";