P4 to Git Change 1089823 by gandryey@gera-dev-w7 on 2014/10/21 17:52:16

ECR #304775 - Move OCL runtime to the latest HSA1.0 spec

Affected files ...

... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/brig_loader.cpp#14 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/build/Makefile.gpu#57 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#115 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#471 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#267 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#102 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#181 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#14 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#335 edit
Этот коммит содержится в:
foreman
2014-10-21 18:07:39 -04:00
родитель 1a5e2fabe0
Коммит 692a1b9f9a
7 изменённых файлов: 76 добавлений и 69 удалений
+6
Просмотреть файл
@@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff;
//! Alignment restriciton for the pinned memory
const static size_t PinnedMemoryAlignment = 4 * Ki;
//! HSA path specific defines for images
const static uint HsaImageObjectSize = 48;
const static uint HsaImageObjectAlignment = 16;
const static uint HsaSamplerObjectSize = 32;
const static uint HsaSamplerObjectAlignment = 16;
//! Defines all supported ASIC families
enum AsicFamilies {
Family7xx,
+2 -2
Просмотреть файл
@@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
// Allocate SRD manager
srdManager_ = new SrdManager(*this,
std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
if (srdManager_ == NULL) {
return false;
}
@@ -2509,7 +2509,7 @@ Sampler::create(
if (0 == hwSrd_) {
return false;
}
dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
return true;
}
+34 -36
Просмотреть файл
@@ -3791,7 +3791,10 @@ WriteAqlArg(
*dst += size;
}
HsaAqlDispatchPacket*
const hsa_packet_header_t DISPATCH_PACKET_HEADER = {
HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 };
hsa_dispatch_packet_t*
HSAILKernel::loadArguments(
VirtualGPU& gpu,
const amd::Kernel& kernel,
@@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments(
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
HsaImageObjectSize, HsaImageObjectAlignment);
}
else {
//! \note Special case for the image views.
@@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments(
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structre into CB1
memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE);
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE);
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
@@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments(
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
HsaSamplerObjectSize, HsaSamplerObjectAlignment);
}
else {
uint64_t srd = gpuSampler->hwSrd();
@@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments(
"Size and the number of arguments don't match!");
uint argBufSize = amd::alignUp(
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
hsa_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_dispatch_packet_t*>(
gpu.cb(0)->sysMemCopy() + argBufSize);
amd::NDRange local(sizes.local());
amd::NDRange global(sizes.global());
// Initialize the Global, Local and Offset values
aqlPkt->dimensions = sizes.dimensions();
// Initialize the work grid parameter
for (uint idx = 0; idx < 3; idx++) {
aqlPkt->grid_size[idx] = 1;
aqlPkt->workgroup_size[idx] = 1;
}
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
// Check if runtime has to find local workgroup size
findLocalWorkSize(aqlPkt->dimensions, global, local);
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
aqlPkt->grid_size[idx] = global[idx];
aqlPkt->workgroup_size[idx] = local[idx];
}
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
// Initialize if dispatch should enable profiling
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
hsaDisp->header = DISPATCH_PACKET_HEADER;
hsaDisp->dimensions = sizes.dimensions();
hsaDisp->reserved = 0;
hsaDisp->workgroup_size_x = local[0];
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
hsaDisp->grid_size_x = global[0];
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
hsaDisp->reserved2 = 0;
// Initialize kernel ISA and execution buffer requirements
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
aqlPkt->private_segment_size_bytes = spillSegSize();
// Initialize cache flush configuration for the dispatch
//! @todo Currently not used in emulation
aqlPkt->barrier = 1;
aqlPkt->release_fence_scope = 1;
aqlPkt->acquire_fence_scope = 2;
aqlPkt->invalidate_instruction_cache = 1;
hsaDisp->private_segment_size = spillSegSize();
hsaDisp->group_segment_size = ldsAddress - ldsSize();
hsaDisp->kernel_object_address = gpuAqlCode()->vmAddress();
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
aqlPkt->kernel_arg_address = argList;
hsaDisp->kernarg_address = argList;
hsaDisp->reserved3 = 0;
hsaDisp->completion_signal = 0;
memList.push_back(cb);
memList.push_back(gpuAqlCode());
if (NULL != prog().globalStore()) {
@@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments(
dev().srds().fillResourceList(memList);
}
return aqlPkt;
return hsaDisp;
}
} // namespace gpu
+2 -2
Просмотреть файл
@@ -15,7 +15,7 @@
#include "device/gpu/gpuvirtual.hpp"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include "device/gpu/gpuprintf.hpp"
#include "newcore.h"
#include "hsa.h"
//! \namespace gpu GPU Device Implementation
namespace gpu {
@@ -885,7 +885,7 @@ public:
//! Returns AQL packet in CPU memory
//! if the kerenl arguments were successfully loaded, otherwise NULL
HsaAqlDispatchPacket* loadArguments(
hsa_dispatch_packet_t* loadArguments(
VirtualGPU& gpu, //!< Running GPU context
const amd::Kernel& kernel, //!< AMD kernel object
const amd::NDRangeContainer& sizes, //!< NDrange container
+18 -16
Просмотреть файл
@@ -16,7 +16,8 @@
#include <cstdio>
#include "utils/options.hpp"
#include "utils/libUtils.h"
#include "newcore.h"
#include "hsa.h"
#include "hsa_ext_image.h"
extern "C" bool
ACL_API_ENTRY aclHsaLoader(
@@ -80,8 +81,8 @@ void
GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
{
if (GPU_DIRECT_SRD) {
*size = HSA_SAMPLER_OBJECT_SIZE;
*alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
*size = gpu::HsaSamplerObjectSize;
*alignment = gpu::HsaSamplerObjectAlignment;
}
else {
*size = sizeof(uint64_t);
@@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
{
assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
HsaSamplerFilterType filter = static_cast<HsaSamplerFilterType>(fltr);
HsaSamplerAddressMode boundaryU = static_cast<HsaSamplerAddressMode>(addrU);
hsa_ext_sampler_filter_mode_t filter =
static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
hsa_ext_sampler_addressing_mode_t boundaryU =
static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);
uint32_t state = (unnormalize) ?
amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
if (filter == HSA_SAMP_FILTER_NEAREST) {
if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
state |= amd::Sampler::StateFilterNearest;
}
else if (filter == HSA_SAMP_FILTER_LINEAR) {
else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
state |= amd::Sampler::StateFilterLinear;
}
switch (boundaryU) {
case HSA_SAMP_ADDRESS_CLAMPEDGE:
case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE:
state |= amd::Sampler::StateAddressClampToEdge;
break;
case HSA_SAMP_ADDRESS_CLAMPBORDER:
case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER:
state |= amd::Sampler::StateAddressClamp;
break;
case HSA_SAMP_ADDRESS_WRAP:
case HSA_EXT_SAMPLER_ADDRESSING_REPEAT:
state |= amd::Sampler::StateAddressRepeat;
break;
case HSA_SAMP_ADDRESS_MIRROR:
case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT:
state |= amd::Sampler::StateAddressMirroredRepeat;
break;
case HSA_SAMP_ADDRESS_MIRRORONCE:
case HSA_SAMP_ADDRESS_NONE:
case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED:
default:
break;
}
gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
if (prog->dev().settings().hsailDirectSRD_) {
char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
if (!pCPUbuf) {
assert(false);
return;
}
prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
delete pCPUbuf;
}
else {
+2 -2
Просмотреть файл
@@ -4,7 +4,7 @@
#ifndef GPUSCHED_HPP_
#define GPUSCHED_HPP_
#include "newcore.h"
#include "hsa.h"
namespace gpu {
@@ -47,7 +47,7 @@ struct AmdAqlWrap {
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
uint32_t reserved[5]; //!< For the future usage
HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
hsa_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
};
struct AmdEvent {
+12 -11
Просмотреть файл
@@ -14,7 +14,7 @@
#include "device/gpu/gputhreadtrace.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpublit.hpp"
#include "newcore.h"
#include "hsa.h"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include <fstream>
#include <sstream>
@@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
for (uint i = 0; i < numSlots; ++i) {
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
slots[i].aql.kernel_arg_address = argStart;
slots[i].aql.kernarg_address = argStart;
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
}
// Upload data back to local memory
@@ -576,7 +576,8 @@ VirtualGPU::create(
bool
VirtualGPU::allocHsaQueueMem()
{
amd_queue_t queue = {0};
amd_queue_t queue;
memset(&queue, 0, sizeof(queue));
hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue));
if (hsaQueueMem_ == NULL) {
return false;
@@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA(
}
// Program the kernel arguments for the GPU execution
HsaAqlDispatchPacket* aqlPkt =
hsa_dispatch_packet_t* aqlPkt =
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
vmDefQueue, &vmParentWrap, memList);
if (NULL == aqlPkt) {
@@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA(
print << "\tState: " << eventD->state <<
"; Counter: " << eventD->counter << "\n";
}
print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
print << wraps[i].aql.workgroup_size[1] << ", ";
print << wraps[i].aql.workgroup_size[2] << "]\n";
print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", ";
print << wraps[i].aql.grid_size[1] << ", ";
print << wraps[i].aql.grid_size[2] << "]\n";
print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
print << wraps[i].aql.workgroup_size_y << ", ";
print << wraps[i].aql.workgroup_size_z << "]\n";
print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
print << wraps[i].aql.grid_size_y << ", ";
print << wraps[i].aql.grid_size_z << "]\n";
uint64_t* kernels = (uint64_t*)(
const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
@@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA(
printf("Error: couldn't find child kernel!\n");
continue;
}
uint offsArg = wraps[i].aql.kernel_arg_address -
uint offsArg = wraps[i].aql.kernarg_address -
gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";