From dadd21fcedf5507d18bb429d26e4e6ec5758153a Mon Sep 17 00:00:00 2001
From: foreman
Date: Tue, 7 Feb 2017 17:13:15 -0500
Subject: [PATCH] P4 to Git Change 1370387 by wchau@wchau_OCL_boltzmann on
2017/02/07 17:00:35
SWDEV-103424 - [ROCm CQE][OCL] OCLRuntime - OCLCreateBuffer tests are failing. The failure is due to AQL cannot support global size > 32bit range. Adding dispatch split support for ROCm, similar to that of GSL (CL#1159349), to resolve the issue.
Affected files ...
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#56 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#8 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp#6 edit
---
rocclr/runtime/device/rocm/rockernel.hpp | 15 +-
rocclr/runtime/device/rocm/rocprogram.cpp | 2 +
rocclr/runtime/device/rocm/rocvirtual.cpp | 407 ++++++++++++----------
rocclr/runtime/device/rocm/rocvirtual.hpp | 97 ++++--
4 files changed, 305 insertions(+), 216 deletions(-)
diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp
index 5bddbc3225..75eda35238 100644
--- a/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/rocclr/runtime/device/rocm/rockernel.hpp
@@ -147,7 +147,21 @@ public:
//! Return printf info array
const std::vector& printfInfo() const {return printf_;}
+ //! Return TRUE if kernel is internal blit kernel
+ bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
+
+ //! set internal kernel flag
+ void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
+
private:
+ union Flags {
+ struct {
+ uint internalKernel_: 1; //!< Is a blit kernel?
+ };
+ uint value_;
+ Flags(): value_(0) {}
+ } flags_;
+
//! Populates hsailArgList_
void initArguments(const aclArgData* aclArg);
#if defined(WITH_LIGHTNING_COMPILER)
@@ -164,7 +178,6 @@ private:
HSAILProgram *program_; //!< The roc::HSAILProgram context
std::vector hsailArgList_; //!< Vector list of HSAIL Arguments
- std::string compileOptions_; //!< compile used for finalizing this kernel
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
const uint32_t workgroupGroupSegmentByteSize_;
const uint32_t workitemPrivateSegmentByteSize_;
diff --git a/rocclr/runtime/device/rocm/rocprogram.cpp b/rocclr/runtime/device/rocm/rocprogram.cpp
index 8e17aea1bc..19db3e70ff 100644
--- a/rocclr/runtime/device/rocm/rocprogram.cpp
+++ b/rocclr/runtime/device/rocm/rocprogram.cpp
@@ -1080,6 +1080,7 @@ HSAILProgram::setKernels_LC(amd::option::Options *options, void* binary, size_t
return false;
}
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+ aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
kernels()[kernelName] = aKernel;
}
@@ -1336,6 +1337,7 @@ HSAILProgram::linkImpl(amd::option::Options *options)
return false;
}
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+ aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
kernels()[kernelName] = aKernel;
}
saveBinaryAndSetType(TYPE_EXECUTABLE);
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index 1d7c9f20db..33f6516d21 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1513,210 +1513,257 @@ VirtualGPU::submitKernelInternal(
return false;
}
- // Allocate buffer to hold kernel arguments
- address argBuffer = (address)allocKernArg(
- gpuKernel.KernargSegmentByteSize(),
- gpuKernel.KernargSegmentAlignment());
-
- if (argBuffer == NULL) {
- LogError("Out of memory");
- return false;
- }
-
- address argPtr = argBuffer;
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
- // Find all parameters for the current kernel
- for (auto arg : gpuKernel.hsailArgs()) {
- const_address srcArgPtr = NULL;
- if (arg->index_ != uint(-1)) {
- srcArgPtr = parameters + signature.at(arg->index_).offset_;
- }
+ size_t newOffset[3] = {0, 0, 0};
+ size_t newGlobalSize[3] = {0, 0, 0};
- // Handle the hidden arguments first, as they do not have a
- // matching parameter in the OCL signature (not a valid arg->index_)
- switch (arg->type_) {
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
- size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
- assert(arg->size_ == sizeof(offset_x) && "check the sizes");
- argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
- size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
- assert(arg->size_ == sizeof(offset_y) && "check the sizes");
- argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
- size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
- assert(arg->size_ == sizeof(offset_z) && "check the sizes");
- argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
- address bufferPtr = printfDbg()->dbgBuffer();
- assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
- argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
- case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
- case ROC_ARGTYPE_HIDDEN_NONE: {
- void* zero = 0;
- assert(arg->size_ <= sizeof(zero) && "check the sizes");
- argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
- break;
- }
- case ROC_ARGTYPE_POINTER: {
- if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
- // Align the LDS on the alignment requirement of type pointed to
- ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
- argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
- ldsUsage += *reinterpret_cast(srcArgPtr);
+ int dim = -1;
+ int iteration = 1;
+ size_t globalStep = 0;
+ for (uint i = 0; i < sizes.dimensions(); i++) {
+ newGlobalSize[i] = sizes.global()[i];
+ newOffset[i] = sizes.offset()[i];
+ }
+
+ if (gpuKernel.isInternalKernel()) {
+ // Calculate new group size for each submission
+ for (uint i = 0; i < sizes.dimensions(); i++) {
+ if (sizes.global()[i] > static_cast(0xffffffff)) {
+ dim = i;
+ iteration = sizes.global()[i] / 0xC0000000
+ + ((sizes.global()[i] % 0xC0000000) ? 1: 0);
+ globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
+ * sizes.local()[dim];
+ if (timestamp_ != nullptr) {
+ timestamp_->setSplittedDispatch();
+ }
break;
}
- assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
- || arg->addrQual_ == ROC_ADDRESS_CONSTANT)
- && "Unsupported address qualifier");
- if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
- argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- break;
- }
- amd::Memory* mem = *reinterpret_cast(srcArgPtr);
- if (mem == NULL) {
- argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- break;
- }
-
- Memory *devMem = static_cast(mem->getDeviceMemory(dev()));
- //! @todo add multi-devices synchronization when supported.
- void* globalAddress = devMem->getDeviceMemory();
- argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
-
- //! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
- mem->signalWrite(&dev());
- }
- break;
}
- case ROC_ARGTYPE_REFERENCE: {
- void *mem = allocKernArg(arg->size_, arg->alignment_);
- if (mem == NULL) {
- LogError("Out of memory");
- return false;
- }
- memcpy(mem, srcArgPtr, arg->size_);
- argPtr = addArg(argPtr, &mem, sizeof(void*));
- break;
- }
- case ROC_ARGTYPE_VALUE:
- argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- break;
- case ROC_ARGTYPE_IMAGE: {
- amd::Memory* mem = *reinterpret_cast(srcArgPtr);
- Image* image = static_cast(mem->getDeviceMemory(dev()));
- if (image == NULL) {
- LogError("Kernel image argument is not an image object");
- return false;
- }
+ }
- if (dev().settings().enableImageHandle_) {
- const uint64_t image_srd = image->getHsaImageObject().handle;
- assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
- argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
+ for (int j = 0; j < iteration; j++) {
+ // Reset global size for dimension dim if split is needed
+ if (dim != -1) {
+ newOffset[dim] = sizes.offset()[dim] + globalStep * j;
+ if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
+ (j != (iteration - 1))) {
+ newGlobalSize[dim] = globalStep;
}
else {
- // Image arguments are of size 48 bytes and are aligned to 16 bytes
- argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
- HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+ newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
}
-
- //! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
- mem->signalWrite(&dev());
- }
- break;
}
- case ROC_ARGTYPE_SAMPLER: {
- amd::Sampler* sampler = *reinterpret_cast(srcArgPtr);
- if (sampler == NULL) {
- LogError("Kernel sampler argument is not an sampler object");
- return false;
- }
- hsa_ext_sampler_descriptor_t samplerDescriptor;
- fillSampleDescriptor(samplerDescriptor, *sampler);
+ // Find all parameters for the current kernel
- hsa_ext_sampler_t hsa_sampler;
- hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
- &samplerDescriptor, &hsa_sampler);
- if (status != HSA_STATUS_SUCCESS) {
- LogError("Error creating device sampler object!");
- return false;
- }
+ // Allocate buffer to hold kernel arguments
+ address argBuffer = (address)allocKernArg(
+ gpuKernel.KernargSegmentByteSize(),
+ gpuKernel.KernargSegmentAlignment());
- if (dev().settings().enableImageHandle_) {
- uint64_t sampler_srd = hsa_sampler.handle;
- argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
- samplerList_.push_back(hsa_sampler);
- // TODO: destroy sampler.
- }
- else {
- argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
-
- memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
- argPtr += HSA_SAMPLER_OBJECT_SIZE;
- hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
- }
- break;
- }
- default:
+ if (argBuffer == NULL) {
+ LogError("Out of memory");
return false;
}
- }
- // Check there is no arguments' buffer overflow
- assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
+ address argPtr = argBuffer;
+ for (auto arg : gpuKernel.hsailArgs()) {
+ const_address srcArgPtr = NULL;
+ if (arg->index_ != uint(-1)) {
+ srcArgPtr = parameters + signature.at(arg->index_).offset_;
+ }
- // Check for group memory overflow
- //! @todo Check should be in HSA - here we should have at most an assert
- assert(roc_device_.info().localMemSizePerCU_ > 0);
- if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
- LogError("No local memory available\n");
- return false;
- }
+ // Handle the hidden arguments first, as they do not have a
+ // matching parameter in the OCL signature (not a valid arg->index_)
+ switch (arg->type_) {
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
+ size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
+ assert(arg->size_ == sizeof(offset_x) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
+ break;
+ }
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
+ size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
+ assert(arg->size_ == sizeof(offset_y) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
+ break;
+ }
+ case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
+ size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
+ assert(arg->size_ == sizeof(offset_z) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
+ break;
+ }
+ case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
+ address bufferPtr = printfDbg()->dbgBuffer();
+ assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
+ argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
+ break;
+ }
+ case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
+ case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
+ case ROC_ARGTYPE_HIDDEN_NONE: {
+ void* zero = 0;
+ assert(arg->size_ <= sizeof(zero) && "check the sizes");
+ argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
+ break;
+ }
+ case ROC_ARGTYPE_POINTER: {
+ if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
+ // Align the LDS on the alignment requirement of type pointed to
+ ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
+ argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
+ ldsUsage += *reinterpret_cast(srcArgPtr);
+ break;
+ }
+ assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
+ || arg->addrQual_ == ROC_ADDRESS_CONSTANT)
+ && "Unsupported address qualifier");
+ if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
+ argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
+ break;
+ }
+ amd::Memory* mem = *reinterpret_cast(srcArgPtr);
+ if (mem == NULL) {
+ argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
+ break;
+ }
- //Initialize the dispatch Packet
- hsa_kernel_dispatch_packet_t dispatchPacket;
- memset(&dispatchPacket, 0, sizeof(dispatchPacket));
+ Memory *devMem = static_cast(mem->getDeviceMemory(dev()));
+ //! @todo add multi-devices synchronization when supported.
+ void* globalAddress = devMem->getDeviceMemory();
+ argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
- dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
+ //! @todo Compiler has to return read/write attributes
+ const cl_mem_flags flags = mem->getMemFlags();
+ if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+ mem->signalWrite(&dev());
+ }
+ break;
+ }
+ case ROC_ARGTYPE_REFERENCE: {
+ void *mem = allocKernArg(arg->size_, arg->alignment_);
+ if (mem == NULL) {
+ LogError("Out of memory");
+ return false;
+ }
+ memcpy(mem, srcArgPtr, arg->size_);
+ argPtr = addArg(argPtr, &mem, sizeof(void*));
+ break;
+ }
+ case ROC_ARGTYPE_VALUE:
+ argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
+ break;
+ case ROC_ARGTYPE_IMAGE: {
+ amd::Memory* mem = *reinterpret_cast(srcArgPtr);
+ Image* image = static_cast(mem->getDeviceMemory(dev()));
+ if (image == NULL) {
+ LogError("Kernel image argument is not an image object");
+ return false;
+ }
- dispatchPacket.header = aqlHeader_;
- dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
- dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
- dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;
+ if (dev().settings().enableImageHandle_) {
+ const uint64_t image_srd = image->getHsaImageObject().handle;
+ assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
+ argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
+ }
+ else {
+ // Image arguments are of size 48 bytes and are aligned to 16 bytes
+ argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
+ HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+ }
- const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
- if (sizes.local().product() != 0) {
- dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
- dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
- dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
- } else {
- setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
- }
- dispatchPacket.kernarg_address = argBuffer;
- dispatchPacket.group_segment_size = ldsUsage;
- dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
+ //! @todo Compiler has to return read/write attributes
+ const cl_mem_flags flags = mem->getMemFlags();
+ if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+ mem->signalWrite(&dev());
+ }
+ break;
+ }
+ case ROC_ARGTYPE_SAMPLER: {
+ amd::Sampler* sampler = *reinterpret_cast(srcArgPtr);
+ if (sampler == NULL) {
+ LogError("Kernel sampler argument is not an sampler object");
+ return false;
+ }
- //Dispatch the packet
- if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
- return false;
+ hsa_ext_sampler_descriptor_t samplerDescriptor;
+ fillSampleDescriptor(samplerDescriptor, *sampler);
+
+ hsa_ext_sampler_t hsa_sampler;
+ hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
+ &samplerDescriptor, &hsa_sampler);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogError("Error creating device sampler object!");
+ return false;
+ }
+
+ if (dev().settings().enableImageHandle_) {
+ uint64_t sampler_srd = hsa_sampler.handle;
+ argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
+ samplerList_.push_back(hsa_sampler);
+ // TODO: destroy sampler.
+ }
+ else {
+ argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
+
+ memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
+ argPtr += HSA_SAMPLER_OBJECT_SIZE;
+ hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
+ }
+ break;
+ }
+ default:
+ return false;
+ }
+ }
+
+ // Check there is no arguments' buffer overflow
+ assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
+
+ // Check for group memory overflow
+ //! @todo Check should be in HSA - here we should have at most an assert
+ assert(roc_device_.info().localMemSizePerCU_ > 0);
+ if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
+ LogError("No local memory available\n");
+ return false;
+ }
+
+ //Initialize the dispatch Packet
+ hsa_kernel_dispatch_packet_t dispatchPacket;
+ memset(&dispatchPacket, 0, sizeof(dispatchPacket));
+
+ dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
+
+ dispatchPacket.header = aqlHeader_;
+ dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ dispatchPacket.grid_size_x = sizes.dimensions()>0 ? newGlobalSize[0] : 1;
+ dispatchPacket.grid_size_y = sizes.dimensions()>1 ? newGlobalSize[1] : 1;
+ dispatchPacket.grid_size_z = sizes.dimensions()>2 ? newGlobalSize[2] : 1;
+
+ const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
+ if (sizes.local().product() != 0) {
+ dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
+ dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
+ dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
+ } else {
+ amd::NDRangeContainer tmpSizes(sizes.dimensions(),
+ &newOffset[0], &newGlobalSize[0],
+ &(const_cast(sizes).local()[0]));
+
+ setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev());
+ }
+ dispatchPacket.kernarg_address = argBuffer;
+ dispatchPacket.group_segment_size = ldsUsage;
+ dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
+
+ //Dispatch the packet
+ if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
+ return false;
+ }
}
// Mark the flag indicating if a dispatch is outstanding.
diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp
index 95b6a4337d..b06c9428cc 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.hpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.hpp
@@ -29,53 +29,80 @@ struct ProfilingSignal : public amd::HeapObject
// including EnqueueNDRangeKernel and clEnqueueCopyBuffer.
class Timestamp {
private:
- uint64_t start_;
- uint64_t end_;
- ProfilingSignal* profilingSignal_;
- hsa_agent_t agent_;
- static double ticksToTime_;
+ uint64_t start_;
+ uint64_t end_;
+ ProfilingSignal* profilingSignal_;
+ hsa_agent_t agent_;
+ static double ticksToTime_;
+ bool splittedDispatch_;
+ std::vector splittedSignals_;
public:
- uint64_t getStart() { checkGpuTime(); return start_; }
+ uint64_t getStart() { checkGpuTime(); return start_; }
- uint64_t getEnd() { checkGpuTime(); return end_; }
+ uint64_t getEnd() { checkGpuTime(); return end_; }
- void setProfilingSignal(ProfilingSignal* signal) { profilingSignal_ = signal; }
+ void setProfilingSignal(ProfilingSignal* signal) {
+ profilingSignal_ = signal;
+ if (splittedDispatch_) {
+ splittedSignals_.push_back(profilingSignal_->signal_);
+ }
+ }
+ const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
- const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
+ void setAgent(hsa_agent_t agent) { agent_ = agent; }
- void setAgent(hsa_agent_t agent) { agent_ = agent; }
+ Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) {
+ agent_.handle = 0;
+ }
- Timestamp() : start_(0), end_(0), profilingSignal_(nullptr) {
- agent_.handle = 0;
- }
+ ~Timestamp() {}
- ~Timestamp() {}
+ //! Finds execution ticks on GPU
+ void checkGpuTime() {
+ if (profilingSignal_ != nullptr) {
+ hsa_amd_profiling_dispatch_time_t time;
- //! Finds execution ticks on GPU
- void checkGpuTime() {
- if (profilingSignal_ != nullptr) {
- hsa_amd_profiling_dispatch_time_t time;
- hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
- start_ = time.start * ticksToTime_;
- end_ = time.end * ticksToTime_;
- profilingSignal_->ts_ = nullptr;
- profilingSignal_ = nullptr;
- }
- }
+ if (splittedDispatch_) {
+ uint64_t start = UINT64_MAX;
+ uint64_t end = 0;
+ for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
+ hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
+ if (time.start < start) {
+ start = time.start;
+ }
+ if (time.end > end) {
+ end = time.end;
+ }
+ }
+ start_ = start * ticksToTime_;
+ end_ = end * ticksToTime_;
+ }
+ else {
+ hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
+ start_ = time.start * ticksToTime_;
+ end_ = time.end * ticksToTime_;
+ }
+ profilingSignal_->ts_ = nullptr;
+ profilingSignal_ = nullptr;
+ }
+ }
- // Start a timestamp (get timestamp from OS)
- void start() {
- start_ = amd::Os::timeNanos();
- }
+ // Start a timestamp (get timestamp from OS)
+ void start() {
+ start_ = amd::Os::timeNanos();
+ }
- // End a timestamp (get timestamp from OS)
- void end() {
- end_ = amd::Os::timeNanos();
- }
+ // End a timestamp (get timestamp from OS)
+ void end() {
+ end_ = amd::Os::timeNanos();
+ }
- static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
- static double getGpuTicksToTime() { return ticksToTime_; }
+ bool isSplittedDispatch() const { return splittedDispatch_; }
+ void setSplittedDispatch() { splittedDispatch_ = true; }
+
+ static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
+ static double getGpuTicksToTime() { return ticksToTime_; }
};
class VirtualGPU : public device::VirtualDevice {