From dadd21fcedf5507d18bb429d26e4e6ec5758153a Mon Sep 17 00:00:00 2001 From: foreman Date: Tue, 7 Feb 2017 17:13:15 -0500 Subject: [PATCH] P4 to Git Change 1370387 by wchau@wchau_OCL_boltzmann on 2017/02/07 17:00:35 SWDEV-103424 - [ROCm CQE][OCL] OCLRuntime - OCLCreateBuffer tests are failing. The failure is due to AQL cannot support global size > 32bit range. Adding dispatch split support for ROCm, similar to that of GSL (CL#1159349), to resolve the issue. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#13 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#56 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#31 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#8 edit ... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp#6 edit --- rocclr/runtime/device/rocm/rockernel.hpp | 15 +- rocclr/runtime/device/rocm/rocprogram.cpp | 2 + rocclr/runtime/device/rocm/rocvirtual.cpp | 407 ++++++++++++---------- rocclr/runtime/device/rocm/rocvirtual.hpp | 97 ++++-- 4 files changed, 305 insertions(+), 216 deletions(-) diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp index 5bddbc3225..75eda35238 100644 --- a/rocclr/runtime/device/rocm/rockernel.hpp +++ b/rocclr/runtime/device/rocm/rockernel.hpp @@ -147,7 +147,21 @@ public: //! Return printf info array const std::vector& printfInfo() const {return printf_;} + //! Return TRUE if kernel is internal blit kernel + bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; } + + //! set internal kernel flag + void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; } + private: + union Flags { + struct { + uint internalKernel_: 1; //!< Is a blit kernel? + }; + uint value_; + Flags(): value_(0) {} + } flags_; + //! Populates hsailArgList_ void initArguments(const aclArgData* aclArg); #if defined(WITH_LIGHTNING_COMPILER) @@ -164,7 +178,6 @@ private: HSAILProgram *program_; //!< The roc::HSAILProgram context std::vector hsailArgList_; //!< Vector list of HSAIL Arguments - std::string compileOptions_; //!< compile used for finalizing this kernel uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t) const uint32_t workgroupGroupSegmentByteSize_; const uint32_t workitemPrivateSegmentByteSize_; diff --git a/rocclr/runtime/device/rocm/rocprogram.cpp b/rocclr/runtime/device/rocm/rocprogram.cpp index 8e17aea1bc..19db3e70ff 100644 --- a/rocclr/runtime/device/rocm/rocprogram.cpp +++ b/rocclr/runtime/device/rocm/rocprogram.cpp @@ -1080,6 +1080,7 @@ HSAILProgram::setKernels_LC(amd::option::Options *options, void* binary, size_t return false; } aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos); kernels()[kernelName] = aKernel; } @@ -1336,6 +1337,7 @@ HSAILProgram::linkImpl(amd::option::Options *options) return false; } aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); + aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos); kernels()[kernelName] = aKernel; } saveBinaryAndSetType(TYPE_EXECUTABLE); diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp index 1d7c9f20db..33f6516d21 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.cpp +++ b/rocclr/runtime/device/rocm/rocvirtual.cpp @@ -1513,210 +1513,257 @@ VirtualGPU::submitKernelInternal( return false; } - // Allocate buffer to hold kernel arguments - address argBuffer = (address)allocKernArg( - gpuKernel.KernargSegmentByteSize(), - gpuKernel.KernargSegmentAlignment()); - - if (argBuffer == NULL) { - LogError("Out of memory"); - return false; - } - - address argPtr = argBuffer; const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& kernelParams = kernel.parameters(); - // Find all parameters for the current kernel - for (auto arg : gpuKernel.hsailArgs()) { - const_address srcArgPtr = NULL; - if (arg->index_ != uint(-1)) { - srcArgPtr = parameters + signature.at(arg->index_).offset_; - } + size_t newOffset[3] = {0, 0, 0}; + size_t newGlobalSize[3] = {0, 0, 0}; - // Handle the hidden arguments first, as they do not have a - // matching parameter in the OCL signature (not a valid arg->index_) - switch (arg->type_) { - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { - size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0; - assert(arg->size_ == sizeof(offset_x) && "check the sizes"); - argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { - size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0; - assert(arg->size_ == sizeof(offset_y) && "check the sizes"); - argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { - size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0; - assert(arg->size_ == sizeof(offset_z) && "check the sizes"); - argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: { - address bufferPtr = printfDbg()->dbgBuffer(); - assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); - argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: - case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: - case ROC_ARGTYPE_HIDDEN_NONE: { - void* zero = 0; - assert(arg->size_ <= sizeof(zero) && "check the sizes"); - argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); - break; - } - case ROC_ARGTYPE_POINTER: { - if (arg->addrQual_ == ROC_ADDRESS_LOCAL) { - // Align the LDS on the alignment requirement of type pointed to - ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_); - argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_); - ldsUsage += *reinterpret_cast(srcArgPtr); + int dim = -1; + int iteration = 1; + size_t globalStep = 0; + for (uint i = 0; i < sizes.dimensions(); i++) { + newGlobalSize[i] = sizes.global()[i]; + newOffset[i] = sizes.offset()[i]; + } + + if (gpuKernel.isInternalKernel()) { + // Calculate new group size for each submission + for (uint i = 0; i < sizes.dimensions(); i++) { + if (sizes.global()[i] > static_cast(0xffffffff)) { + dim = i; + iteration = sizes.global()[i] / 0xC0000000 + + ((sizes.global()[i] % 0xC0000000) ? 1: 0); + globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration + * sizes.local()[dim]; + if (timestamp_ != nullptr) { + timestamp_->setSplittedDispatch(); + } break; } - assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL - || arg->addrQual_ == ROC_ADDRESS_CONSTANT) - && "Unsupported address qualifier"); - if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - } - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - if (mem == NULL) { - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - } - - Memory *devMem = static_cast(mem->getDeviceMemory(dev())); - //! @todo add multi-devices synchronization when supported. - void* globalAddress = devMem->getDeviceMemory(); - argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_); - - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - break; } - case ROC_ARGTYPE_REFERENCE: { - void *mem = allocKernArg(arg->size_, arg->alignment_); - if (mem == NULL) { - LogError("Out of memory"); - return false; - } - memcpy(mem, srcArgPtr, arg->size_); - argPtr = addArg(argPtr, &mem, sizeof(void*)); - break; - } - case ROC_ARGTYPE_VALUE: - argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); - break; - case ROC_ARGTYPE_IMAGE: { - amd::Memory* mem = *reinterpret_cast(srcArgPtr); - Image* image = static_cast(mem->getDeviceMemory(dev())); - if (image == NULL) { - LogError("Kernel image argument is not an image object"); - return false; - } + } - if (dev().settings().enableImageHandle_) { - const uint64_t image_srd = image->getHsaImageObject().handle; - assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); - argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); + for (int j = 0; j < iteration; j++) { + // Reset global size for dimension dim if split is needed + if (dim != -1) { + newOffset[dim] = sizes.offset()[dim] + globalStep * j; + if (((newOffset[dim] + globalStep) < sizes.global()[dim]) && + (j != (iteration - 1))) { + newGlobalSize[dim] = globalStep; } else { - // Image arguments are of size 48 bytes and are aligned to 16 bytes - argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle, - HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); + newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim]; } - - //! @todo Compiler has to return read/write attributes - const cl_mem_flags flags = mem->getMemFlags(); - if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { - mem->signalWrite(&dev()); - } - break; } - case ROC_ARGTYPE_SAMPLER: { - amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); - if (sampler == NULL) { - LogError("Kernel sampler argument is not an sampler object"); - return false; - } - hsa_ext_sampler_descriptor_t samplerDescriptor; - fillSampleDescriptor(samplerDescriptor, *sampler); + // Find all parameters for the current kernel - hsa_ext_sampler_t hsa_sampler; - hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), - &samplerDescriptor, &hsa_sampler); - if (status != HSA_STATUS_SUCCESS) { - LogError("Error creating device sampler object!"); - return false; - } + // Allocate buffer to hold kernel arguments + address argBuffer = (address)allocKernArg( + gpuKernel.KernargSegmentByteSize(), + gpuKernel.KernargSegmentAlignment()); - if (dev().settings().enableImageHandle_) { - uint64_t sampler_srd = hsa_sampler.handle; - argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); - samplerList_.push_back(hsa_sampler); - // TODO: destroy sampler. - } - else { - argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); - - memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); - argPtr += HSA_SAMPLER_OBJECT_SIZE; - hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); - } - break; - } - default: + if (argBuffer == NULL) { + LogError("Out of memory"); return false; } - } - // Check there is no arguments' buffer overflow - assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); + address argPtr = argBuffer; + for (auto arg : gpuKernel.hsailArgs()) { + const_address srcArgPtr = NULL; + if (arg->index_ != uint(-1)) { + srcArgPtr = parameters + signature.at(arg->index_).offset_; + } - // Check for group memory overflow - //! @todo Check should be in HSA - here we should have at most an assert - assert(roc_device_.info().localMemSizePerCU_ > 0); - if (ldsUsage > roc_device_.info().localMemSizePerCU_) { - LogError("No local memory available\n"); - return false; - } + // Handle the hidden arguments first, as they do not have a + // matching parameter in the OCL signature (not a valid arg->index_) + switch (arg->type_) { + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: { + size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0; + assert(arg->size_ == sizeof(offset_x) && "check the sizes"); + argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: { + size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0; + assert(arg->size_ == sizeof(offset_y) && "check the sizes"); + argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: { + size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0; + assert(arg->size_ == sizeof(offset_z) && "check the sizes"); + argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: { + address bufferPtr = printfDbg()->dbgBuffer(); + assert(arg->size_ == sizeof(bufferPtr) && "check the sizes"); + argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE: + case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION: + case ROC_ARGTYPE_HIDDEN_NONE: { + void* zero = 0; + assert(arg->size_ <= sizeof(zero) && "check the sizes"); + argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_); + break; + } + case ROC_ARGTYPE_POINTER: { + if (arg->addrQual_ == ROC_ADDRESS_LOCAL) { + // Align the LDS on the alignment requirement of type pointed to + ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_); + argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_); + ldsUsage += *reinterpret_cast(srcArgPtr); + break; + } + assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL + || arg->addrQual_ == ROC_ADDRESS_CONSTANT) + && "Unsupported address qualifier"); + if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) { + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + } + amd::Memory* mem = *reinterpret_cast(srcArgPtr); + if (mem == NULL) { + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + } - //Initialize the dispatch Packet - hsa_kernel_dispatch_packet_t dispatchPacket; - memset(&dispatchPacket, 0, sizeof(dispatchPacket)); + Memory *devMem = static_cast(mem->getDeviceMemory(dev())); + //! @todo add multi-devices synchronization when supported. + void* globalAddress = devMem->getDeviceMemory(); + argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_); - dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); + //! @todo Compiler has to return read/write attributes + const cl_mem_flags flags = mem->getMemFlags(); + if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { + mem->signalWrite(&dev()); + } + break; + } + case ROC_ARGTYPE_REFERENCE: { + void *mem = allocKernArg(arg->size_, arg->alignment_); + if (mem == NULL) { + LogError("Out of memory"); + return false; + } + memcpy(mem, srcArgPtr, arg->size_); + argPtr = addArg(argPtr, &mem, sizeof(void*)); + break; + } + case ROC_ARGTYPE_VALUE: + argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_); + break; + case ROC_ARGTYPE_IMAGE: { + amd::Memory* mem = *reinterpret_cast(srcArgPtr); + Image* image = static_cast(mem->getDeviceMemory(dev())); + if (image == NULL) { + LogError("Kernel image argument is not an image object"); + return false; + } - dispatchPacket.header = aqlHeader_; - dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1; - dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1; - dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1; + if (dev().settings().enableImageHandle_) { + const uint64_t image_srd = image->getHsaImageObject().handle; + assert(amd::isMultipleOf(image_srd, sizeof(image_srd))); + argPtr = addArg(argPtr, &image_srd, sizeof(image_srd)); + } + else { + // Image arguments are of size 48 bytes and are aligned to 16 bytes + argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle, + HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT); + } - const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; - if (sizes.local().product() != 0) { - dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1; - dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1; - dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1; - } else { - setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev()); - } - dispatchPacket.kernarg_address = argBuffer; - dispatchPacket.group_segment_size = ldsUsage; - dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; + //! @todo Compiler has to return read/write attributes + const cl_mem_flags flags = mem->getMemFlags(); + if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) { + mem->signalWrite(&dev()); + } + break; + } + case ROC_ARGTYPE_SAMPLER: { + amd::Sampler* sampler = *reinterpret_cast(srcArgPtr); + if (sampler == NULL) { + LogError("Kernel sampler argument is not an sampler object"); + return false; + } - //Dispatch the packet - if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){ - return false; + hsa_ext_sampler_descriptor_t samplerDescriptor; + fillSampleDescriptor(samplerDescriptor, *sampler); + + hsa_ext_sampler_t hsa_sampler; + hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(), + &samplerDescriptor, &hsa_sampler); + if (status != HSA_STATUS_SUCCESS) { + LogError("Error creating device sampler object!"); + return false; + } + + if (dev().settings().enableImageHandle_) { + uint64_t sampler_srd = hsa_sampler.handle; + argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd)); + samplerList_.push_back(hsa_sampler); + // TODO: destroy sampler. + } + else { + argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT); + + memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE); + argPtr += HSA_SAMPLER_OBJECT_SIZE; + hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler); + } + break; + } + default: + return false; + } + } + + // Check there is no arguments' buffer overflow + assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize()); + + // Check for group memory overflow + //! @todo Check should be in HSA - here we should have at most an assert + assert(roc_device_.info().localMemSizePerCU_ > 0); + if (ldsUsage > roc_device_.info().localMemSizePerCU_) { + LogError("No local memory available\n"); + return false; + } + + //Initialize the dispatch Packet + hsa_kernel_dispatch_packet_t dispatchPacket; + memset(&dispatchPacket, 0, sizeof(dispatchPacket)); + + dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle(); + + dispatchPacket.header = aqlHeader_; + dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + dispatchPacket.grid_size_x = sizes.dimensions()>0 ? newGlobalSize[0] : 1; + dispatchPacket.grid_size_y = sizes.dimensions()>1 ? newGlobalSize[1] : 1; + dispatchPacket.grid_size_z = sizes.dimensions()>2 ? newGlobalSize[2] : 1; + + const size_t* compile_size = devKernel->workGroupInfo()->compileSize_; + if (sizes.local().product() != 0) { + dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1; + dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1; + dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1; + } else { + amd::NDRangeContainer tmpSizes(sizes.dimensions(), + &newOffset[0], &newGlobalSize[0], + &(const_cast(sizes).local()[0])); + + setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev()); + } + dispatchPacket.kernarg_address = argBuffer; + dispatchPacket.group_segment_size = ldsUsage; + dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_; + + //Dispatch the packet + if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){ + return false; + } } // Mark the flag indicating if a dispatch is outstanding. diff --git a/rocclr/runtime/device/rocm/rocvirtual.hpp b/rocclr/runtime/device/rocm/rocvirtual.hpp index 95b6a4337d..b06c9428cc 100644 --- a/rocclr/runtime/device/rocm/rocvirtual.hpp +++ b/rocclr/runtime/device/rocm/rocvirtual.hpp @@ -29,53 +29,80 @@ struct ProfilingSignal : public amd::HeapObject // including EnqueueNDRangeKernel and clEnqueueCopyBuffer. class Timestamp { private: - uint64_t start_; - uint64_t end_; - ProfilingSignal* profilingSignal_; - hsa_agent_t agent_; - static double ticksToTime_; + uint64_t start_; + uint64_t end_; + ProfilingSignal* profilingSignal_; + hsa_agent_t agent_; + static double ticksToTime_; + bool splittedDispatch_; + std::vector splittedSignals_; public: - uint64_t getStart() { checkGpuTime(); return start_; } + uint64_t getStart() { checkGpuTime(); return start_; } - uint64_t getEnd() { checkGpuTime(); return end_; } + uint64_t getEnd() { checkGpuTime(); return end_; } - void setProfilingSignal(ProfilingSignal* signal) { profilingSignal_ = signal; } + void setProfilingSignal(ProfilingSignal* signal) { + profilingSignal_ = signal; + if (splittedDispatch_) { + splittedSignals_.push_back(profilingSignal_->signal_); + } + } + const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; } - const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; } + void setAgent(hsa_agent_t agent) { agent_ = agent; } - void setAgent(hsa_agent_t agent) { agent_ = agent; } + Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) { + agent_.handle = 0; + } - Timestamp() : start_(0), end_(0), profilingSignal_(nullptr) { - agent_.handle = 0; - } + ~Timestamp() {} - ~Timestamp() {} + //! Finds execution ticks on GPU + void checkGpuTime() { + if (profilingSignal_ != nullptr) { + hsa_amd_profiling_dispatch_time_t time; - //! Finds execution ticks on GPU - void checkGpuTime() { - if (profilingSignal_ != nullptr) { - hsa_amd_profiling_dispatch_time_t time; - hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); - start_ = time.start * ticksToTime_; - end_ = time.end * ticksToTime_; - profilingSignal_->ts_ = nullptr; - profilingSignal_ = nullptr; - } - } + if (splittedDispatch_) { + uint64_t start = UINT64_MAX; + uint64_t end = 0; + for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) { + hsa_amd_profiling_get_dispatch_time(agent_, *it, &time); + if (time.start < start) { + start = time.start; + } + if (time.end > end) { + end = time.end; + } + } + start_ = start * ticksToTime_; + end_ = end * ticksToTime_; + } + else { + hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time); + start_ = time.start * ticksToTime_; + end_ = time.end * ticksToTime_; + } + profilingSignal_->ts_ = nullptr; + profilingSignal_ = nullptr; + } + } - // Start a timestamp (get timestamp from OS) - void start() { - start_ = amd::Os::timeNanos(); - } + // Start a timestamp (get timestamp from OS) + void start() { + start_ = amd::Os::timeNanos(); + } - // End a timestamp (get timestamp from OS) - void end() { - end_ = amd::Os::timeNanos(); - } + // End a timestamp (get timestamp from OS) + void end() { + end_ = amd::Os::timeNanos(); + } - static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; } - static double getGpuTicksToTime() { return ticksToTime_; } + bool isSplittedDispatch() const { return splittedDispatch_; } + void setSplittedDispatch() { splittedDispatch_ = true; } + + static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; } + static double getGpuTicksToTime() { return ticksToTime_; } }; class VirtualGPU : public device::VirtualDevice {