P4 to Git Change 1370387 by wchau@wchau_OCL_boltzmann on 2017/02/07 17:00:35
SWDEV-103424 - [ROCm CQE][OCL] OCLRuntime - OCLCreateBuffer tests are failing. The failure is due to AQL cannot support global size > 32bit range. Adding dispatch split support for ROCm, similar to that of GSL (CL#1159349), to resolve the issue. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#56 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#8 edit ... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp#6 edit
This commit is contained in:
@@ -147,7 +147,21 @@ public:
|
||||
//! Return printf info array
|
||||
const std::vector<PrintfInfo>& printfInfo() const {return printf_;}
|
||||
|
||||
//! Return TRUE if kernel is internal blit kernel
|
||||
bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
|
||||
|
||||
//! set internal kernel flag
|
||||
void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
|
||||
|
||||
private:
|
||||
union Flags {
|
||||
struct {
|
||||
uint internalKernel_: 1; //!< Is a blit kernel?
|
||||
};
|
||||
uint value_;
|
||||
Flags(): value_(0) {}
|
||||
} flags_;
|
||||
|
||||
//! Populates hsailArgList_
|
||||
void initArguments(const aclArgData* aclArg);
|
||||
#if defined(WITH_LIGHTNING_COMPILER)
|
||||
@@ -164,7 +178,6 @@ private:
|
||||
|
||||
HSAILProgram *program_; //!< The roc::HSAILProgram context
|
||||
std::vector<Argument*> hsailArgList_; //!< Vector list of HSAIL Arguments
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
|
||||
const uint32_t workgroupGroupSegmentByteSize_;
|
||||
const uint32_t workitemPrivateSegmentByteSize_;
|
||||
|
||||
@@ -1080,6 +1080,7 @@ HSAILProgram::setKernels_LC(amd::option::Options *options, void* binary, size_t
|
||||
return false;
|
||||
}
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
|
||||
@@ -1336,6 +1337,7 @@ HSAILProgram::linkImpl(amd::option::Options *options)
|
||||
return false;
|
||||
}
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
|
||||
kernels()[kernelName] = aKernel;
|
||||
}
|
||||
saveBinaryAndSetType(TYPE_EXECUTABLE);
|
||||
|
||||
@@ -1513,210 +1513,257 @@ VirtualGPU::submitKernelInternal(
|
||||
return false;
|
||||
}
|
||||
|
||||
// Allocate buffer to hold kernel arguments
|
||||
address argBuffer = (address)allocKernArg(
|
||||
gpuKernel.KernargSegmentByteSize(),
|
||||
gpuKernel.KernargSegmentAlignment());
|
||||
|
||||
if (argBuffer == NULL) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
|
||||
address argPtr = argBuffer;
|
||||
const amd::KernelSignature& signature = kernel.signature();
|
||||
const amd::KernelParameters& kernelParams = kernel.parameters();
|
||||
|
||||
// Find all parameters for the current kernel
|
||||
for (auto arg : gpuKernel.hsailArgs()) {
|
||||
const_address srcArgPtr = NULL;
|
||||
if (arg->index_ != uint(-1)) {
|
||||
srcArgPtr = parameters + signature.at(arg->index_).offset_;
|
||||
}
|
||||
size_t newOffset[3] = {0, 0, 0};
|
||||
size_t newGlobalSize[3] = {0, 0, 0};
|
||||
|
||||
// Handle the hidden arguments first, as they do not have a
|
||||
// matching parameter in the OCL signature (not a valid arg->index_)
|
||||
switch (arg->type_) {
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
||||
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
|
||||
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
||||
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
|
||||
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
||||
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
|
||||
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
||||
address bufferPtr = printfDbg()->dbgBuffer();
|
||||
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
||||
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
||||
case ROC_ARGTYPE_HIDDEN_NONE: {
|
||||
void* zero = 0;
|
||||
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_POINTER: {
|
||||
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
|
||||
// Align the LDS on the alignment requirement of type pointed to
|
||||
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
|
||||
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
|
||||
ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
|
||||
int dim = -1;
|
||||
int iteration = 1;
|
||||
size_t globalStep = 0;
|
||||
for (uint i = 0; i < sizes.dimensions(); i++) {
|
||||
newGlobalSize[i] = sizes.global()[i];
|
||||
newOffset[i] = sizes.offset()[i];
|
||||
}
|
||||
|
||||
if (gpuKernel.isInternalKernel()) {
|
||||
// Calculate new group size for each submission
|
||||
for (uint i = 0; i < sizes.dimensions(); i++) {
|
||||
if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
|
||||
dim = i;
|
||||
iteration = sizes.global()[i] / 0xC0000000
|
||||
+ ((sizes.global()[i] % 0xC0000000) ? 1: 0);
|
||||
globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
|
||||
* sizes.local()[dim];
|
||||
if (timestamp_ != nullptr) {
|
||||
timestamp_->setSplittedDispatch();
|
||||
}
|
||||
break;
|
||||
}
|
||||
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
|
||||
|| arg->addrQual_ == ROC_ADDRESS_CONSTANT)
|
||||
&& "Unsupported address qualifier");
|
||||
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
|
||||
if (mem == NULL) {
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
|
||||
Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
void* globalAddress = devMem->getDeviceMemory();
|
||||
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
|
||||
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_REFERENCE: {
|
||||
void *mem = allocKernArg(arg->size_, arg->alignment_);
|
||||
if (mem == NULL) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
memcpy(mem, srcArgPtr, arg->size_);
|
||||
argPtr = addArg(argPtr, &mem, sizeof(void*));
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_VALUE:
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
case ROC_ARGTYPE_IMAGE: {
|
||||
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
|
||||
Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
|
||||
if (image == NULL) {
|
||||
LogError("Kernel image argument is not an image object");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
const uint64_t image_srd = image->getHsaImageObject().handle;
|
||||
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
|
||||
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
|
||||
for (int j = 0; j < iteration; j++) {
|
||||
// Reset global size for dimension dim if split is needed
|
||||
if (dim != -1) {
|
||||
newOffset[dim] = sizes.offset()[dim] + globalStep * j;
|
||||
if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
|
||||
(j != (iteration - 1))) {
|
||||
newGlobalSize[dim] = globalStep;
|
||||
}
|
||||
else {
|
||||
// Image arguments are of size 48 bytes and are aligned to 16 bytes
|
||||
argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
|
||||
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
|
||||
newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
|
||||
}
|
||||
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_SAMPLER: {
|
||||
amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
|
||||
if (sampler == NULL) {
|
||||
LogError("Kernel sampler argument is not an sampler object");
|
||||
return false;
|
||||
}
|
||||
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSampleDescriptor(samplerDescriptor, *sampler);
|
||||
// Find all parameters for the current kernel
|
||||
|
||||
hsa_ext_sampler_t hsa_sampler;
|
||||
hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
|
||||
&samplerDescriptor, &hsa_sampler);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogError("Error creating device sampler object!");
|
||||
return false;
|
||||
}
|
||||
// Allocate buffer to hold kernel arguments
|
||||
address argBuffer = (address)allocKernArg(
|
||||
gpuKernel.KernargSegmentByteSize(),
|
||||
gpuKernel.KernargSegmentAlignment());
|
||||
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
uint64_t sampler_srd = hsa_sampler.handle;
|
||||
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
|
||||
samplerList_.push_back(hsa_sampler);
|
||||
// TODO: destroy sampler.
|
||||
}
|
||||
else {
|
||||
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
||||
|
||||
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
|
||||
argPtr += HSA_SAMPLER_OBJECT_SIZE;
|
||||
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
if (argBuffer == NULL) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check there is no arguments' buffer overflow
|
||||
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
|
||||
address argPtr = argBuffer;
|
||||
for (auto arg : gpuKernel.hsailArgs()) {
|
||||
const_address srcArgPtr = NULL;
|
||||
if (arg->index_ != uint(-1)) {
|
||||
srcArgPtr = parameters + signature.at(arg->index_).offset_;
|
||||
}
|
||||
|
||||
// Check for group memory overflow
|
||||
//! @todo Check should be in HSA - here we should have at most an assert
|
||||
assert(roc_device_.info().localMemSizePerCU_ > 0);
|
||||
if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
|
||||
LogError("No local memory available\n");
|
||||
return false;
|
||||
}
|
||||
// Handle the hidden arguments first, as they do not have a
|
||||
// matching parameter in the OCL signature (not a valid arg->index_)
|
||||
switch (arg->type_) {
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
|
||||
size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
|
||||
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
|
||||
size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
|
||||
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
|
||||
size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
|
||||
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
|
||||
address bufferPtr = printfDbg()->dbgBuffer();
|
||||
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
|
||||
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
|
||||
case ROC_ARGTYPE_HIDDEN_NONE: {
|
||||
void* zero = 0;
|
||||
assert(arg->size_ <= sizeof(zero) && "check the sizes");
|
||||
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_POINTER: {
|
||||
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
|
||||
// Align the LDS on the alignment requirement of type pointed to
|
||||
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
|
||||
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
|
||||
ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
|
||||
break;
|
||||
}
|
||||
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
|
||||
|| arg->addrQual_ == ROC_ADDRESS_CONSTANT)
|
||||
&& "Unsupported address qualifier");
|
||||
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
|
||||
if (mem == NULL) {
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
}
|
||||
|
||||
//Initialize the dispatch Packet
|
||||
hsa_kernel_dispatch_packet_t dispatchPacket;
|
||||
memset(&dispatchPacket, 0, sizeof(dispatchPacket));
|
||||
Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
|
||||
//! @todo add multi-devices synchronization when supported.
|
||||
void* globalAddress = devMem->getDeviceMemory();
|
||||
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
|
||||
|
||||
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_REFERENCE: {
|
||||
void *mem = allocKernArg(arg->size_, arg->alignment_);
|
||||
if (mem == NULL) {
|
||||
LogError("Out of memory");
|
||||
return false;
|
||||
}
|
||||
memcpy(mem, srcArgPtr, arg->size_);
|
||||
argPtr = addArg(argPtr, &mem, sizeof(void*));
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_VALUE:
|
||||
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
|
||||
break;
|
||||
case ROC_ARGTYPE_IMAGE: {
|
||||
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
|
||||
Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
|
||||
if (image == NULL) {
|
||||
LogError("Kernel image argument is not an image object");
|
||||
return false;
|
||||
}
|
||||
|
||||
dispatchPacket.header = aqlHeader_;
|
||||
dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
|
||||
dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
|
||||
dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
const uint64_t image_srd = image->getHsaImageObject().handle;
|
||||
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
|
||||
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
|
||||
}
|
||||
else {
|
||||
// Image arguments are of size 48 bytes and are aligned to 16 bytes
|
||||
argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
|
||||
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
|
||||
}
|
||||
|
||||
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
|
||||
if (sizes.local().product() != 0) {
|
||||
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
|
||||
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
|
||||
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
|
||||
} else {
|
||||
setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
|
||||
}
|
||||
dispatchPacket.kernarg_address = argBuffer;
|
||||
dispatchPacket.group_segment_size = ldsUsage;
|
||||
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
|
||||
//! @todo Compiler has to return read/write attributes
|
||||
const cl_mem_flags flags = mem->getMemFlags();
|
||||
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ROC_ARGTYPE_SAMPLER: {
|
||||
amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
|
||||
if (sampler == NULL) {
|
||||
LogError("Kernel sampler argument is not an sampler object");
|
||||
return false;
|
||||
}
|
||||
|
||||
//Dispatch the packet
|
||||
if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
|
||||
return false;
|
||||
hsa_ext_sampler_descriptor_t samplerDescriptor;
|
||||
fillSampleDescriptor(samplerDescriptor, *sampler);
|
||||
|
||||
hsa_ext_sampler_t hsa_sampler;
|
||||
hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
|
||||
&samplerDescriptor, &hsa_sampler);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
LogError("Error creating device sampler object!");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (dev().settings().enableImageHandle_) {
|
||||
uint64_t sampler_srd = hsa_sampler.handle;
|
||||
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
|
||||
samplerList_.push_back(hsa_sampler);
|
||||
// TODO: destroy sampler.
|
||||
}
|
||||
else {
|
||||
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
||||
|
||||
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
|
||||
argPtr += HSA_SAMPLER_OBJECT_SIZE;
|
||||
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Check there is no arguments' buffer overflow
|
||||
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
|
||||
|
||||
// Check for group memory overflow
|
||||
//! @todo Check should be in HSA - here we should have at most an assert
|
||||
assert(roc_device_.info().localMemSizePerCU_ > 0);
|
||||
if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
|
||||
LogError("No local memory available\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
//Initialize the dispatch Packet
|
||||
hsa_kernel_dispatch_packet_t dispatchPacket;
|
||||
memset(&dispatchPacket, 0, sizeof(dispatchPacket));
|
||||
|
||||
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
|
||||
|
||||
dispatchPacket.header = aqlHeader_;
|
||||
dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
|
||||
dispatchPacket.grid_size_x = sizes.dimensions()>0 ? newGlobalSize[0] : 1;
|
||||
dispatchPacket.grid_size_y = sizes.dimensions()>1 ? newGlobalSize[1] : 1;
|
||||
dispatchPacket.grid_size_z = sizes.dimensions()>2 ? newGlobalSize[2] : 1;
|
||||
|
||||
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
|
||||
if (sizes.local().product() != 0) {
|
||||
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
|
||||
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
|
||||
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
|
||||
} else {
|
||||
amd::NDRangeContainer tmpSizes(sizes.dimensions(),
|
||||
&newOffset[0], &newGlobalSize[0],
|
||||
&(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
|
||||
|
||||
setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev());
|
||||
}
|
||||
dispatchPacket.kernarg_address = argBuffer;
|
||||
dispatchPacket.group_segment_size = ldsUsage;
|
||||
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
|
||||
|
||||
//Dispatch the packet
|
||||
if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the flag indicating if a dispatch is outstanding.
|
||||
|
||||
@@ -29,53 +29,80 @@ struct ProfilingSignal : public amd::HeapObject
|
||||
// including EnqueueNDRangeKernel and clEnqueueCopyBuffer.
|
||||
class Timestamp {
|
||||
private:
|
||||
uint64_t start_;
|
||||
uint64_t end_;
|
||||
ProfilingSignal* profilingSignal_;
|
||||
hsa_agent_t agent_;
|
||||
static double ticksToTime_;
|
||||
uint64_t start_;
|
||||
uint64_t end_;
|
||||
ProfilingSignal* profilingSignal_;
|
||||
hsa_agent_t agent_;
|
||||
static double ticksToTime_;
|
||||
bool splittedDispatch_;
|
||||
std::vector<hsa_signal_t> splittedSignals_;
|
||||
|
||||
public:
|
||||
uint64_t getStart() { checkGpuTime(); return start_; }
|
||||
uint64_t getStart() { checkGpuTime(); return start_; }
|
||||
|
||||
uint64_t getEnd() { checkGpuTime(); return end_; }
|
||||
uint64_t getEnd() { checkGpuTime(); return end_; }
|
||||
|
||||
void setProfilingSignal(ProfilingSignal* signal) { profilingSignal_ = signal; }
|
||||
void setProfilingSignal(ProfilingSignal* signal) {
|
||||
profilingSignal_ = signal;
|
||||
if (splittedDispatch_) {
|
||||
splittedSignals_.push_back(profilingSignal_->signal_);
|
||||
}
|
||||
}
|
||||
const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
|
||||
|
||||
const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
|
||||
void setAgent(hsa_agent_t agent) { agent_ = agent; }
|
||||
|
||||
void setAgent(hsa_agent_t agent) { agent_ = agent; }
|
||||
Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) {
|
||||
agent_.handle = 0;
|
||||
}
|
||||
|
||||
Timestamp() : start_(0), end_(0), profilingSignal_(nullptr) {
|
||||
agent_.handle = 0;
|
||||
}
|
||||
~Timestamp() {}
|
||||
|
||||
~Timestamp() {}
|
||||
//! Finds execution ticks on GPU
|
||||
void checkGpuTime() {
|
||||
if (profilingSignal_ != nullptr) {
|
||||
hsa_amd_profiling_dispatch_time_t time;
|
||||
|
||||
//! Finds execution ticks on GPU
|
||||
void checkGpuTime() {
|
||||
if (profilingSignal_ != nullptr) {
|
||||
hsa_amd_profiling_dispatch_time_t time;
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
|
||||
start_ = time.start * ticksToTime_;
|
||||
end_ = time.end * ticksToTime_;
|
||||
profilingSignal_->ts_ = nullptr;
|
||||
profilingSignal_ = nullptr;
|
||||
}
|
||||
}
|
||||
if (splittedDispatch_) {
|
||||
uint64_t start = UINT64_MAX;
|
||||
uint64_t end = 0;
|
||||
for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
|
||||
if (time.start < start) {
|
||||
start = time.start;
|
||||
}
|
||||
if (time.end > end) {
|
||||
end = time.end;
|
||||
}
|
||||
}
|
||||
start_ = start * ticksToTime_;
|
||||
end_ = end * ticksToTime_;
|
||||
}
|
||||
else {
|
||||
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
|
||||
start_ = time.start * ticksToTime_;
|
||||
end_ = time.end * ticksToTime_;
|
||||
}
|
||||
profilingSignal_->ts_ = nullptr;
|
||||
profilingSignal_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
// Start a timestamp (get timestamp from OS)
|
||||
void start() {
|
||||
start_ = amd::Os::timeNanos();
|
||||
}
|
||||
// Start a timestamp (get timestamp from OS)
|
||||
void start() {
|
||||
start_ = amd::Os::timeNanos();
|
||||
}
|
||||
|
||||
// End a timestamp (get timestamp from OS)
|
||||
void end() {
|
||||
end_ = amd::Os::timeNanos();
|
||||
}
|
||||
// End a timestamp (get timestamp from OS)
|
||||
void end() {
|
||||
end_ = amd::Os::timeNanos();
|
||||
}
|
||||
|
||||
static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
|
||||
static double getGpuTicksToTime() { return ticksToTime_; }
|
||||
bool isSplittedDispatch() const { return splittedDispatch_; }
|
||||
void setSplittedDispatch() { splittedDispatch_ = true; }
|
||||
|
||||
static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
|
||||
static double getGpuTicksToTime() { return ticksToTime_; }
|
||||
};
|
||||
|
||||
class VirtualGPU : public device::VirtualDevice {
|
||||
|
||||
Reference in New Issue
Block a user