P4 to Git Change 1089823 by gandryey@gera-dev-w7 on 2014/10/21 17:52:16
ECR #304775 - Move OCL runtime to the latest HSA1.0 spec Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/brig_loader.cpp#14 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/build/Makefile.gpu#24 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/build/Makefile.gpu#57 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudefs.hpp#115 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#471 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#267 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#102 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#181 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusched.hpp#14 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#335 edit
Этот коммит содержится в:
@@ -57,6 +57,12 @@ const static uint CalOfflineImpl = 0xffffffff;
|
||||
//! Alignment restriciton for the pinned memory
|
||||
const static size_t PinnedMemoryAlignment = 4 * Ki;
|
||||
|
||||
//! HSA path specific defines for images
|
||||
const static uint HsaImageObjectSize = 48;
|
||||
const static uint HsaImageObjectAlignment = 16;
|
||||
const static uint HsaSamplerObjectSize = 32;
|
||||
const static uint HsaSamplerObjectAlignment = 16;
|
||||
|
||||
//! Defines all supported ASIC families
|
||||
enum AsicFamilies {
|
||||
Family7xx,
|
||||
|
||||
@@ -909,7 +909,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
|
||||
|
||||
// Allocate SRD manager
|
||||
srdManager_ = new SrdManager(*this,
|
||||
std::max(HSA_IMAGE_OBJECT_SIZE, HSA_SAMPLER_OBJECT_SIZE), 64 * Ki);
|
||||
std::max(HsaImageObjectSize, HsaSamplerObjectSize), 64 * Ki);
|
||||
if (srdManager_ == NULL) {
|
||||
return false;
|
||||
}
|
||||
@@ -2509,7 +2509,7 @@ Sampler::create(
|
||||
if (0 == hwSrd_) {
|
||||
return false;
|
||||
}
|
||||
dev_.fillHwSampler(oclSamplerState, hwState_, HSA_SAMPLER_OBJECT_SIZE);
|
||||
dev_.fillHwSampler(oclSamplerState, hwState_, HsaSamplerObjectSize);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
@@ -3791,7 +3791,10 @@ WriteAqlArg(
|
||||
*dst += size;
|
||||
}
|
||||
|
||||
HsaAqlDispatchPacket*
|
||||
const hsa_packet_header_t DISPATCH_PACKET_HEADER = {
|
||||
HSA_PACKET_TYPE_DISPATCH, 1, HSA_FENCE_SCOPE_SYSTEM, HSA_FENCE_SCOPE_COMPONENT, 0 };
|
||||
|
||||
hsa_dispatch_packet_t*
|
||||
HSAILKernel::loadArguments(
|
||||
VirtualGPU& gpu,
|
||||
const amd::Kernel& kernel,
|
||||
@@ -3944,7 +3947,7 @@ HSAILKernel::loadArguments(
|
||||
if (dev().settings().hsailDirectSRD_) {
|
||||
// Image arguments are of size 48 bytes and aligned to 16 bytes
|
||||
WriteAqlArg(&aqlArgBuf, image->hwState(),
|
||||
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
|
||||
HsaImageObjectSize, HsaImageObjectAlignment);
|
||||
}
|
||||
else {
|
||||
//! \note Special case for the image views.
|
||||
@@ -3952,9 +3955,9 @@ HSAILKernel::loadArguments(
|
||||
//! this view without a wait for SRD resource.
|
||||
if (image->memoryType() == Resource::ImageView) {
|
||||
// Copy the current structre into CB1
|
||||
memcpy(aqlStruct, image->hwState(), HSA_IMAGE_OBJECT_SIZE);
|
||||
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
|
||||
ConstBuffer* cb = gpu.constBufs_[1];
|
||||
cb->uploadDataToHw(HSA_IMAGE_OBJECT_SIZE);
|
||||
cb->uploadDataToHw(HsaImageObjectSize);
|
||||
// Then use a pointer in aqlArgBuffer to CB1
|
||||
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
|
||||
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
||||
@@ -3983,7 +3986,7 @@ HSAILKernel::loadArguments(
|
||||
(sampler->getDeviceSampler(dev()));
|
||||
if (dev().settings().hsailDirectSRD_) {
|
||||
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
|
||||
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
||||
HsaSamplerObjectSize, HsaSamplerObjectAlignment);
|
||||
}
|
||||
else {
|
||||
uint64_t srd = gpuSampler->hwSrd();
|
||||
@@ -4025,46 +4028,41 @@ HSAILKernel::loadArguments(
|
||||
"Size and the number of arguments don't match!");
|
||||
uint argBufSize = amd::alignUp(
|
||||
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
|
||||
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
|
||||
hsa_dispatch_packet_t* hsaDisp = reinterpret_cast<hsa_dispatch_packet_t*>(
|
||||
gpu.cb(0)->sysMemCopy() + argBufSize);
|
||||
|
||||
amd::NDRange local(sizes.local());
|
||||
amd::NDRange global(sizes.global());
|
||||
|
||||
// Initialize the Global, Local and Offset values
|
||||
aqlPkt->dimensions = sizes.dimensions();
|
||||
// Initialize the work grid parameter
|
||||
for (uint idx = 0; idx < 3; idx++) {
|
||||
aqlPkt->grid_size[idx] = 1;
|
||||
aqlPkt->workgroup_size[idx] = 1;
|
||||
}
|
||||
amd::NDRange local(sizes.local());
|
||||
const amd::NDRange& global = sizes.global();
|
||||
|
||||
// Check if runtime has to find local workgroup size
|
||||
findLocalWorkSize(aqlPkt->dimensions, global, local);
|
||||
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
|
||||
aqlPkt->grid_size[idx] = global[idx];
|
||||
aqlPkt->workgroup_size[idx] = local[idx];
|
||||
}
|
||||
findLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
||||
|
||||
// Initialize if dispatch should enable profiling
|
||||
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
|
||||
hsaDisp->header = DISPATCH_PACKET_HEADER;
|
||||
hsaDisp->dimensions = sizes.dimensions();
|
||||
hsaDisp->reserved = 0;
|
||||
|
||||
hsaDisp->workgroup_size_x = local[0];
|
||||
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
|
||||
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
|
||||
|
||||
hsaDisp->grid_size_x = global[0];
|
||||
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
|
||||
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
|
||||
hsaDisp->reserved2 = 0;
|
||||
|
||||
// Initialize kernel ISA and execution buffer requirements
|
||||
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
|
||||
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
|
||||
aqlPkt->private_segment_size_bytes = spillSegSize();
|
||||
|
||||
// Initialize cache flush configuration for the dispatch
|
||||
//! @todo Currently not used in emulation
|
||||
aqlPkt->barrier = 1;
|
||||
aqlPkt->release_fence_scope = 1;
|
||||
aqlPkt->acquire_fence_scope = 2;
|
||||
aqlPkt->invalidate_instruction_cache = 1;
|
||||
hsaDisp->private_segment_size = spillSegSize();
|
||||
hsaDisp->group_segment_size = ldsAddress - ldsSize();
|
||||
hsaDisp->kernel_object_address = gpuAqlCode()->vmAddress();
|
||||
|
||||
ConstBuffer* cb = gpu.constBufs_[0];
|
||||
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
|
||||
cb->uploadDataToHw(argBufSize + sizeof(hsa_dispatch_packet_t));
|
||||
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
||||
aqlPkt->kernel_arg_address = argList;
|
||||
|
||||
hsaDisp->kernarg_address = argList;
|
||||
hsaDisp->reserved3 = 0;
|
||||
hsaDisp->completion_signal = 0;
|
||||
|
||||
memList.push_back(cb);
|
||||
memList.push_back(gpuAqlCode());
|
||||
if (NULL != prog().globalStore()) {
|
||||
@@ -4078,7 +4076,7 @@ HSAILKernel::loadArguments(
|
||||
dev().srds().fillResourceList(memList);
|
||||
}
|
||||
|
||||
return aqlPkt;
|
||||
return hsaDisp;
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
#include "device/gpu/gpuvirtual.hpp"
|
||||
#include "sc-hsa/Interface/SCHSAInterface.h"
|
||||
#include "device/gpu/gpuprintf.hpp"
|
||||
#include "newcore.h"
|
||||
#include "hsa.h"
|
||||
//! \namespace gpu GPU Device Implementation
|
||||
namespace gpu {
|
||||
|
||||
@@ -885,7 +885,7 @@ public:
|
||||
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! if the kerenl arguments were successfully loaded, otherwise NULL
|
||||
HsaAqlDispatchPacket* loadArguments(
|
||||
hsa_dispatch_packet_t* loadArguments(
|
||||
VirtualGPU& gpu, //!< Running GPU context
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const amd::NDRangeContainer& sizes, //!< NDrange container
|
||||
|
||||
@@ -16,7 +16,8 @@
|
||||
#include <cstdio>
|
||||
#include "utils/options.hpp"
|
||||
#include "utils/libUtils.h"
|
||||
#include "newcore.h"
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_image.h"
|
||||
|
||||
extern "C" bool
|
||||
ACL_API_ENTRY aclHsaLoader(
|
||||
@@ -80,8 +81,8 @@ void
|
||||
GetSamplerObjectParams(uint32_t* size, uint32_t* alignment)
|
||||
{
|
||||
if (GPU_DIRECT_SRD) {
|
||||
*size = HSA_SAMPLER_OBJECT_SIZE;
|
||||
*alignment = HSA_SAMPLER_OBJECT_ALIGNMENT;
|
||||
*size = gpu::HsaSamplerObjectSize;
|
||||
*alignment = gpu::HsaSamplerObjectAlignment;
|
||||
}
|
||||
else {
|
||||
*size = sizeof(uint64_t);
|
||||
@@ -94,45 +95,46 @@ InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize,
|
||||
uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)
|
||||
{
|
||||
assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode");
|
||||
HsaSamplerFilterType filter = static_cast<HsaSamplerFilterType>(fltr);
|
||||
HsaSamplerAddressMode boundaryU = static_cast<HsaSamplerAddressMode>(addrU);
|
||||
hsa_ext_sampler_filter_mode_t filter =
|
||||
static_cast<hsa_ext_sampler_filter_mode_t>(fltr);
|
||||
hsa_ext_sampler_addressing_mode_t boundaryU =
|
||||
static_cast<hsa_ext_sampler_addressing_mode_t>(addrU);
|
||||
|
||||
uint32_t state = (unnormalize) ?
|
||||
amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue;
|
||||
if (filter == HSA_SAMP_FILTER_NEAREST) {
|
||||
if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
|
||||
state |= amd::Sampler::StateFilterNearest;
|
||||
}
|
||||
else if (filter == HSA_SAMP_FILTER_LINEAR) {
|
||||
else if (filter == HSA_EXT_SAMPLER_FILTER_LINEAR) {
|
||||
state |= amd::Sampler::StateFilterLinear;
|
||||
}
|
||||
switch (boundaryU) {
|
||||
case HSA_SAMP_ADDRESS_CLAMPEDGE:
|
||||
case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_EDGE:
|
||||
state |= amd::Sampler::StateAddressClampToEdge;
|
||||
break;
|
||||
case HSA_SAMP_ADDRESS_CLAMPBORDER:
|
||||
case HSA_EXT_SAMPLER_ADDRESSING_CLAMP_TO_BORDER:
|
||||
state |= amd::Sampler::StateAddressClamp;
|
||||
break;
|
||||
case HSA_SAMP_ADDRESS_WRAP:
|
||||
case HSA_EXT_SAMPLER_ADDRESSING_REPEAT:
|
||||
state |= amd::Sampler::StateAddressRepeat;
|
||||
break;
|
||||
case HSA_SAMP_ADDRESS_MIRROR:
|
||||
case HSA_EXT_SAMPLER_ADDRESSING_MIRRORED_REPEAT:
|
||||
state |= amd::Sampler::StateAddressMirroredRepeat;
|
||||
break;
|
||||
case HSA_SAMP_ADDRESS_MIRRORONCE:
|
||||
case HSA_SAMP_ADDRESS_NONE:
|
||||
case HSA_EXT_SAMPLER_ADDRESSING_UNDEFINED:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
gpu::HSAILProgram* prog = reinterpret_cast<gpu::HSAILProgram*>(userData);
|
||||
if (prog->dev().settings().hsailDirectSRD_) {
|
||||
char *pCPUbuf = new char[HSA_SAMPLER_OBJECT_SIZE];
|
||||
char *pCPUbuf = new char[gpu::HsaSamplerObjectSize];
|
||||
if (!pCPUbuf) {
|
||||
assert(false);
|
||||
return;
|
||||
}
|
||||
prog->dev().fillHwSampler(state, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
|
||||
DmaMemoryCopy(userData, offset, pCPUbuf, HSA_SAMPLER_OBJECT_SIZE);
|
||||
prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize);
|
||||
DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize);
|
||||
delete pCPUbuf;
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
#ifndef GPUSCHED_HPP_
|
||||
#define GPUSCHED_HPP_
|
||||
|
||||
#include "newcore.h"
|
||||
#include "hsa.h"
|
||||
|
||||
namespace gpu {
|
||||
|
||||
@@ -47,7 +47,7 @@ struct AmdAqlWrap {
|
||||
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
|
||||
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
|
||||
uint32_t reserved[5]; //!< For the future usage
|
||||
HsaAqlDispatchPacket aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
|
||||
hsa_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
|
||||
};
|
||||
|
||||
struct AmdEvent {
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
#include "device/gpu/gputhreadtrace.hpp"
|
||||
#include "device/gpu/gputimestamp.hpp"
|
||||
#include "device/gpu/gpublit.hpp"
|
||||
#include "newcore.h"
|
||||
#include "hsa.h"
|
||||
#include "sc-hsa/Interface/SCHSAInterface.h"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
@@ -342,7 +342,7 @@ VirtualGPU::createVirtualQueue(uint deviceQueueSize)
|
||||
AmdAqlWrap* slots = reinterpret_cast<AmdAqlWrap*>(&header[1]);
|
||||
for (uint i = 0; i < numSlots; ++i) {
|
||||
uint64_t argStart = vaBase + argOffs + i * singleArgSize;
|
||||
slots[i].aql.kernel_arg_address = argStart;
|
||||
slots[i].aql.kernarg_address = argStart;
|
||||
slots[i].wait_list = argStart + dev().info().maxParameterSize_ + 64;
|
||||
}
|
||||
// Upload data back to local memory
|
||||
@@ -576,7 +576,8 @@ VirtualGPU::create(
|
||||
bool
|
||||
VirtualGPU::allocHsaQueueMem()
|
||||
{
|
||||
amd_queue_t queue = {0};
|
||||
amd_queue_t queue;
|
||||
memset(&queue, 0, sizeof(queue));
|
||||
hsaQueueMem_ = new gpu::Memory(dev(), sizeof(queue));
|
||||
if (hsaQueueMem_ == NULL) {
|
||||
return false;
|
||||
@@ -1726,7 +1727,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
}
|
||||
|
||||
// Program the kernel arguments for the GPU execution
|
||||
HsaAqlDispatchPacket* aqlPkt =
|
||||
hsa_dispatch_packet_t* aqlPkt =
|
||||
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
|
||||
vmDefQueue, &vmParentWrap, memList);
|
||||
if (NULL == aqlPkt) {
|
||||
@@ -1793,12 +1794,12 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
print << "\tState: " << eventD->state <<
|
||||
"; Counter: " << eventD->counter << "\n";
|
||||
}
|
||||
print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size[0] << ", ";
|
||||
print << wraps[i].aql.workgroup_size[1] << ", ";
|
||||
print << wraps[i].aql.workgroup_size[2] << "]\n";
|
||||
print << "GridSize[ " << wraps[i].aql.grid_size[0] << ", ";
|
||||
print << wraps[i].aql.grid_size[1] << ", ";
|
||||
print << wraps[i].aql.grid_size[2] << "]\n";
|
||||
print << "WorkGroupSize[ " << wraps[i].aql.workgroup_size_x << ", ";
|
||||
print << wraps[i].aql.workgroup_size_y << ", ";
|
||||
print << wraps[i].aql.workgroup_size_z << "]\n";
|
||||
print << "GridSize[ " << wraps[i].aql.grid_size_x << ", ";
|
||||
print << wraps[i].aql.grid_size_y << ", ";
|
||||
print << wraps[i].aql.grid_size_z << "]\n";
|
||||
|
||||
uint64_t* kernels = (uint64_t*)(
|
||||
const_cast<Memory*>(hsaKernel.prog().kernelTable())->map(this));
|
||||
@@ -1819,7 +1820,7 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
printf("Error: couldn't find child kernel!\n");
|
||||
continue;
|
||||
}
|
||||
uint offsArg = wraps[i].aql.kernel_arg_address -
|
||||
uint offsArg = wraps[i].aql.kernarg_address -
|
||||
gpuDefQueue->virtualQueue_->vmAddress();
|
||||
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
|
||||
print << "Kernel: " << child->name() << "\n";
|
||||
|
||||
Ссылка в новой задаче
Block a user