P4 to Git Change 1370387 by wchau@wchau_OCL_boltzmann on 2017/02/07 17:00:35

SWDEV-103424 - [ROCm CQE][OCL] OCLRuntime - OCLCreateBuffer tests are failing.  The failure is due to AQL cannot support global size > 32bit range.  Adding dispatch split support for ROCm, similar to that of GSL (CL#1159349), to resolve the issue.

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#13 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#56 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#31 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.hpp#8 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/runtime/OCLCreateBuffer.cpp#6 edit
This commit is contained in:
foreman
2017-02-07 17:13:15 -05:00
rodzic ef61b25945
commit dadd21fced
4 zmienionych plików z 305 dodań i 216 usunięć
+14 -1
Wyświetl plik
@@ -147,7 +147,21 @@ public:
//! Return printf info array
const std::vector<PrintfInfo>& printfInfo() const {return printf_;}
//! Return TRUE if kernel is internal blit kernel
bool isInternalKernel() const { return (flags_.internalKernel_) ? true : false; }
//! set internal kernel flag
void setInternalKernelFlag(bool flag) { flags_.internalKernel_ = flag; }
private:
union Flags {
struct {
uint internalKernel_: 1; //!< Is a blit kernel?
};
uint value_;
Flags(): value_(0) {}
} flags_;
//! Populates hsailArgList_
void initArguments(const aclArgData* aclArg);
#if defined(WITH_LIGHTNING_COMPILER)
@@ -164,7 +178,6 @@ private:
HSAILProgram *program_; //!< The roc::HSAILProgram context
std::vector<Argument*> hsailArgList_; //!< Vector list of HSAIL Arguments
std::string compileOptions_; //!< compile used for finalizing this kernel
uint64_t kernelCodeHandle_; //!< Kernel code handle (aka amd_kernel_code_t)
const uint32_t workgroupGroupSegmentByteSize_;
const uint32_t workitemPrivateSegmentByteSize_;
@@ -1080,6 +1080,7 @@ HSAILProgram::setKernels_LC(amd::option::Options *options, void* binary, size_t
return false;
}
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
kernels()[kernelName] = aKernel;
}
@@ -1336,6 +1337,7 @@ HSAILProgram::linkImpl(amd::option::Options *options)
return false;
}
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
aKernel->setInternalKernelFlag(compileOptions_.find("-cl-internal-kernel") != std::string::npos);
kernels()[kernelName] = aKernel;
}
saveBinaryAndSetType(TYPE_EXECUTABLE);
+227 -180
Wyświetl plik
@@ -1513,210 +1513,257 @@ VirtualGPU::submitKernelInternal(
return false;
}
// Allocate buffer to hold kernel arguments
address argBuffer = (address)allocKernArg(
gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment());
if (argBuffer == NULL) {
LogError("Out of memory");
return false;
}
address argPtr = argBuffer;
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Find all parameters for the current kernel
for (auto arg : gpuKernel.hsailArgs()) {
const_address srcArgPtr = NULL;
if (arg->index_ != uint(-1)) {
srcArgPtr = parameters + signature.at(arg->index_).offset_;
}
size_t newOffset[3] = {0, 0, 0};
size_t newGlobalSize[3] = {0, 0, 0};
// Handle the hidden arguments first, as they do not have a
// matching parameter in the OCL signature (not a valid arg->index_)
switch (arg->type_) {
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
address bufferPtr = printfDbg()->dbgBuffer();
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
case ROC_ARGTYPE_HIDDEN_NONE: {
void* zero = 0;
assert(arg->size_ <= sizeof(zero) && "check the sizes");
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_POINTER: {
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
// Align the LDS on the alignment requirement of type pointed to
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
int dim = -1;
int iteration = 1;
size_t globalStep = 0;
for (uint i = 0; i < sizes.dimensions(); i++) {
newGlobalSize[i] = sizes.global()[i];
newOffset[i] = sizes.offset()[i];
}
if (gpuKernel.isInternalKernel()) {
// Calculate new group size for each submission
for (uint i = 0; i < sizes.dimensions(); i++) {
if (sizes.global()[i] > static_cast<size_t>(0xffffffff)) {
dim = i;
iteration = sizes.global()[i] / 0xC0000000
+ ((sizes.global()[i] % 0xC0000000) ? 1: 0);
globalStep = (sizes.global()[i] / sizes.local()[i]) / iteration
* sizes.local()[dim];
if (timestamp_ != nullptr) {
timestamp_->setSplittedDispatch();
}
break;
}
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
|| arg->addrQual_ == ROC_ADDRESS_CONSTANT)
&& "Unsupported address qualifier");
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
}
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
if (mem == NULL) {
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
}
Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
//! @todo add multi-devices synchronization when supported.
void* globalAddress = devMem->getDeviceMemory();
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_REFERENCE: {
void *mem = allocKernArg(arg->size_, arg->alignment_);
if (mem == NULL) {
LogError("Out of memory");
return false;
}
memcpy(mem, srcArgPtr, arg->size_);
argPtr = addArg(argPtr, &mem, sizeof(void*));
break;
}
case ROC_ARGTYPE_VALUE:
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
case ROC_ARGTYPE_IMAGE: {
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
if (image == NULL) {
LogError("Kernel image argument is not an image object");
return false;
}
}
if (dev().settings().enableImageHandle_) {
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
for (int j = 0; j < iteration; j++) {
// Reset global size for dimension dim if split is needed
if (dim != -1) {
newOffset[dim] = sizes.offset()[dim] + globalStep * j;
if (((newOffset[dim] + globalStep) < sizes.global()[dim]) &&
(j != (iteration - 1))) {
newGlobalSize[dim] = globalStep;
}
else {
// Image arguments are of size 48 bytes and are aligned to 16 bytes
argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
newGlobalSize[dim] = sizes.global()[dim] - newOffset[dim];
}
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_SAMPLER: {
amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
if (sampler == NULL) {
LogError("Kernel sampler argument is not an sampler object");
return false;
}
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSampleDescriptor(samplerDescriptor, *sampler);
// Find all parameters for the current kernel
hsa_ext_sampler_t hsa_sampler;
hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
&samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error creating device sampler object!");
return false;
}
// Allocate buffer to hold kernel arguments
address argBuffer = (address)allocKernArg(
gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment());
if (dev().settings().enableImageHandle_) {
uint64_t sampler_srd = hsa_sampler.handle;
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
samplerList_.push_back(hsa_sampler);
// TODO: destroy sampler.
}
else {
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
argPtr += HSA_SAMPLER_OBJECT_SIZE;
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
}
break;
}
default:
if (argBuffer == NULL) {
LogError("Out of memory");
return false;
}
}
// Check there is no arguments' buffer overflow
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
address argPtr = argBuffer;
for (auto arg : gpuKernel.hsailArgs()) {
const_address srcArgPtr = NULL;
if (arg->index_ != uint(-1)) {
srcArgPtr = parameters + signature.at(arg->index_).offset_;
}
// Check for group memory overflow
//! @todo Check should be in HSA - here we should have at most an assert
assert(roc_device_.info().localMemSizePerCU_ > 0);
if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
LogError("No local memory available\n");
return false;
}
// Handle the hidden arguments first, as they do not have a
// matching parameter in the OCL signature (not a valid arg->index_)
switch (arg->type_) {
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X: {
size_t offset_x = sizes.dimensions() >= 1 ? newOffset[0] : 0;
assert(arg->size_ == sizeof(offset_x) && "check the sizes");
argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y: {
size_t offset_y = sizes.dimensions() >= 2 ? newOffset[1] : 0;
assert(arg->size_ == sizeof(offset_y) && "check the sizes");
argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z: {
size_t offset_z = sizes.dimensions() == 3 ? newOffset[2] : 0;
assert(arg->size_ == sizeof(offset_z) && "check the sizes");
argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER: {
address bufferPtr = printfDbg()->dbgBuffer();
assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE:
case ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION:
case ROC_ARGTYPE_HIDDEN_NONE: {
void* zero = 0;
assert(arg->size_ <= sizeof(zero) && "check the sizes");
argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
break;
}
case ROC_ARGTYPE_POINTER: {
if (arg->addrQual_ == ROC_ADDRESS_LOCAL) {
// Align the LDS on the alignment requirement of type pointed to
ldsUsage = amd::alignUp(ldsUsage, arg->pointeeAlignment_);
argPtr = addArg(argPtr, &ldsUsage, arg->size_, arg->alignment_);
ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
break;
}
assert((arg->addrQual_ == ROC_ADDRESS_GLOBAL
|| arg->addrQual_ == ROC_ADDRESS_CONSTANT)
&& "Unsupported address qualifier");
if (kernelParams.boundToSvmPointer(dev(), parameters, arg->index_)) {
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
}
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
if (mem == NULL) {
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
}
//Initialize the dispatch Packet
hsa_kernel_dispatch_packet_t dispatchPacket;
memset(&dispatchPacket, 0, sizeof(dispatchPacket));
Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
//! @todo add multi-devices synchronization when supported.
void* globalAddress = devMem->getDeviceMemory();
argPtr = addArg(argPtr, &globalAddress, arg->size_, arg->alignment_);
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_REFERENCE: {
void *mem = allocKernArg(arg->size_, arg->alignment_);
if (mem == NULL) {
LogError("Out of memory");
return false;
}
memcpy(mem, srcArgPtr, arg->size_);
argPtr = addArg(argPtr, &mem, sizeof(void*));
break;
}
case ROC_ARGTYPE_VALUE:
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
break;
case ROC_ARGTYPE_IMAGE: {
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
if (image == NULL) {
LogError("Kernel image argument is not an image object");
return false;
}
dispatchPacket.header = aqlHeader_;
dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;
if (dev().settings().enableImageHandle_) {
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
}
else {
// Image arguments are of size 48 bytes and are aligned to 16 bytes
argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
}
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
if (sizes.local().product() != 0) {
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
} else {
setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
}
dispatchPacket.kernarg_address = argBuffer;
dispatchPacket.group_segment_size = ldsUsage;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
break;
}
case ROC_ARGTYPE_SAMPLER: {
amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
if (sampler == NULL) {
LogError("Kernel sampler argument is not an sampler object");
return false;
}
//Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
return false;
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSampleDescriptor(samplerDescriptor, *sampler);
hsa_ext_sampler_t hsa_sampler;
hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
&samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error creating device sampler object!");
return false;
}
if (dev().settings().enableImageHandle_) {
uint64_t sampler_srd = hsa_sampler.handle;
argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
samplerList_.push_back(hsa_sampler);
// TODO: destroy sampler.
}
else {
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
argPtr += HSA_SAMPLER_OBJECT_SIZE;
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
}
break;
}
default:
return false;
}
}
// Check there is no arguments' buffer overflow
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
// Check for group memory overflow
//! @todo Check should be in HSA - here we should have at most an assert
assert(roc_device_.info().localMemSizePerCU_ > 0);
if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
LogError("No local memory available\n");
return false;
}
//Initialize the dispatch Packet
hsa_kernel_dispatch_packet_t dispatchPacket;
memset(&dispatchPacket, 0, sizeof(dispatchPacket));
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
dispatchPacket.header = aqlHeader_;
dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatchPacket.grid_size_x = sizes.dimensions()>0 ? newGlobalSize[0] : 1;
dispatchPacket.grid_size_y = sizes.dimensions()>1 ? newGlobalSize[1] : 1;
dispatchPacket.grid_size_z = sizes.dimensions()>2 ? newGlobalSize[2] : 1;
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
if (sizes.local().product() != 0) {
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
} else {
amd::NDRangeContainer tmpSizes(sizes.dimensions(),
&newOffset[0], &newGlobalSize[0],
&(const_cast<amd::NDRangeContainer&>(sizes).local()[0]));
setRuntimeCompilerLocalSize(dispatchPacket, tmpSizes, compile_size, dev());
}
dispatchPacket.kernarg_address = argBuffer;
dispatchPacket.group_segment_size = ldsUsage;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
//Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, GPU_FLUSH_ON_EXECUTION)){
return false;
}
}
// Mark the flag indicating if a dispatch is outstanding.
+62 -35
Wyświetl plik
@@ -29,53 +29,80 @@ struct ProfilingSignal : public amd::HeapObject
// including EnqueueNDRangeKernel and clEnqueueCopyBuffer.
class Timestamp {
private:
uint64_t start_;
uint64_t end_;
ProfilingSignal* profilingSignal_;
hsa_agent_t agent_;
static double ticksToTime_;
uint64_t start_;
uint64_t end_;
ProfilingSignal* profilingSignal_;
hsa_agent_t agent_;
static double ticksToTime_;
bool splittedDispatch_;
std::vector<hsa_signal_t> splittedSignals_;
public:
uint64_t getStart() { checkGpuTime(); return start_; }
uint64_t getStart() { checkGpuTime(); return start_; }
uint64_t getEnd() { checkGpuTime(); return end_; }
uint64_t getEnd() { checkGpuTime(); return end_; }
void setProfilingSignal(ProfilingSignal* signal) { profilingSignal_ = signal; }
void setProfilingSignal(ProfilingSignal* signal) {
profilingSignal_ = signal;
if (splittedDispatch_) {
splittedSignals_.push_back(profilingSignal_->signal_);
}
}
const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
const ProfilingSignal* getProfilingSignal() const { return profilingSignal_; }
void setAgent(hsa_agent_t agent) { agent_ = agent; }
void setAgent(hsa_agent_t agent) { agent_ = agent; }
Timestamp() : start_(0), end_(0), profilingSignal_(nullptr), splittedDispatch_(false) {
agent_.handle = 0;
}
Timestamp() : start_(0), end_(0), profilingSignal_(nullptr) {
agent_.handle = 0;
}
~Timestamp() {}
~Timestamp() {}
//! Finds execution ticks on GPU
void checkGpuTime() {
if (profilingSignal_ != nullptr) {
hsa_amd_profiling_dispatch_time_t time;
//! Finds execution ticks on GPU
void checkGpuTime() {
if (profilingSignal_ != nullptr) {
hsa_amd_profiling_dispatch_time_t time;
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
start_ = time.start * ticksToTime_;
end_ = time.end * ticksToTime_;
profilingSignal_->ts_ = nullptr;
profilingSignal_ = nullptr;
}
}
if (splittedDispatch_) {
uint64_t start = UINT64_MAX;
uint64_t end = 0;
for (auto it = splittedSignals_.begin(); it < splittedSignals_.end(); it++) {
hsa_amd_profiling_get_dispatch_time(agent_, *it, &time);
if (time.start < start) {
start = time.start;
}
if (time.end > end) {
end = time.end;
}
}
start_ = start * ticksToTime_;
end_ = end * ticksToTime_;
}
else {
hsa_amd_profiling_get_dispatch_time(agent_, profilingSignal_->signal_, &time);
start_ = time.start * ticksToTime_;
end_ = time.end * ticksToTime_;
}
profilingSignal_->ts_ = nullptr;
profilingSignal_ = nullptr;
}
}
// Start a timestamp (get timestamp from OS)
void start() {
start_ = amd::Os::timeNanos();
}
// Start a timestamp (get timestamp from OS)
void start() {
start_ = amd::Os::timeNanos();
}
// End a timestamp (get timestamp from OS)
void end() {
end_ = amd::Os::timeNanos();
}
// End a timestamp (get timestamp from OS)
void end() {
end_ = amd::Os::timeNanos();
}
static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
static double getGpuTicksToTime() { return ticksToTime_; }
bool isSplittedDispatch() const { return splittedDispatch_; }
void setSplittedDispatch() { splittedDispatch_ = true; }
static void setGpuTicksToTime(double ticksToTime) { ticksToTime_=ticksToTime; }
static double getGpuTicksToTime() { return ticksToTime_; }
};
class VirtualGPU : public device::VirtualDevice {