Files
rocm-systems/rocclr/runtime/device/rocm/rocvirtual.cpp
T
foreman dcdd293036 P4 to Git Change 1299127 by lmoriche@lmoriche_opencl_dev on 2016/08/04 14:18:16
SWDEV-94610 - Remove the g_complib instance from the ROCm device when building OpenCL/LC

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#138 edit
... //depot/stg/opencl/drivers/opencl/runtime/build/Makefile.runtime#64 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#276 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile.oclrocm#6 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompiler.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompilerlib.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompilerlib.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#5 edit
2016-08-04 14:24:34 -04:00

1722 lines
58 KiB
C++

//
// Copyright (c) 2013 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/rocm/rocdevice.hpp"
#include "device/rocm/rocvirtual.hpp"
#include "device/rocm/rockernel.hpp"
#include "device/rocm/rocmemory.hpp"
#include "device/rocm/rocblit.hpp"
#include "../../platform/kernel.hpp"
#include "../../platform/context.hpp"
#include "../../platform/command.hpp"
#include "../../platform/memory.hpp"
#include "../../platform/sampler.hpp"
#include "../../utils/debug.hpp"
#include "os/os.hpp"
#include "SCHSAInterface.h"
#include "amd_hsa_kernel_code.h"
#include <fstream>
#include <vector>
#include <string>
/**
* HSA image object size in bytes (see HSAIL spec)
*/
#define HSA_IMAGE_OBJECT_SIZE 48
/**
* HSA image object alignment in bytes (see HSAIL spec)
*/
#define HSA_IMAGE_OBJECT_ALIGNMENT 16
/**
* HSA sampler object size in bytes (see HSAIL spec)
*/
#define HSA_SAMPLER_OBJECT_SIZE 32
/**
* HSA sampler object alignment in bytes (see HSAIL spec)
*/
#define HSA_SAMPLER_OBJECT_ALIGNMENT 16
namespace roc {
// (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) invalidates I, K and L1
// (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE) invalidates L1, L2 and flushes L2
static const uint16_t kDispatchPacketHeaderNoSync =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
static const uint16_t kDispatchPacketHeader =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
static const uint16_t kBarrierPacketHeader =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
static const uint16_t kBarrierPacketAcquireHeader =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
static const uint16_t kBarrierPacketReleaseHeader =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
static const hsa_barrier_and_packet_t kBarrierAcquirePacket =
{kBarrierPacketAcquireHeader,0,0,0,0,0,0,0,0,0};
static const hsa_barrier_and_packet_t kBarrierReleasePacket =
{kBarrierPacketReleaseHeader,0,0,0,0,0,0,0,0,0};
double Timestamp::ticksToTime_=0;
/**
* Set the ocl correlation handle (essentially the cl_event handle)
* to correlate the cl kernel launch and HSA kernel dispatch
*/
typedef hsa_status_t
(*hsa_ext_tools_set_correlation_handle)(const hsa_agent_t agent,
void *correlation_handle);
static void SetOclCorrelationHandle(void *tools_lib, const hsa_agent_t agent, void *handle) {
hsa_ext_tools_set_correlation_handle func =
(hsa_ext_tools_set_correlation_handle)amd::Os::getSymbol(tools_lib, "hsa_ext_tools_set_correlation_handler");
if (func) {
func(agent, handle);
}
return;
}
bool
VirtualGPU::MemoryDependency::create(size_t numMemObj)
{
if (numMemObj > 0) {
// Allocate the array of memory objects for dependency tracking
memObjectsInQueue_ = new MemoryState[numMemObj];
if (NULL == memObjectsInQueue_) {
return false;
}
memset(memObjectsInQueue_, 0, sizeof(MemoryState) * numMemObj);
maxMemObjectsInQueue_ = numMemObj;
}
return true;
}
void
VirtualGPU::MemoryDependency::validate(
VirtualGPU& gpu,
const Memory* memory,
bool readOnly)
{
bool flushL1Cache = false;
if (maxMemObjectsInQueue_ == 0) {
// Sync AQL packets
gpu.setAqlHeader(kDispatchPacketHeader);
return;
}
uint64_t curStart = reinterpret_cast<uint64_t>(memory->getDeviceMemory());
uint64_t curEnd = curStart + memory->size();
// Loop through all memory objects in the queue and find dependency
// @note don't include objects from the current kernel
for (size_t j = 0; j < endMemObjectsInQueue_; ++j) {
// Check if the queue already contains this mem object and
// GPU operations aren't readonly
uint64_t busyStart = memObjectsInQueue_[j].start_;
uint64_t busyEnd = memObjectsInQueue_[j].end_;
// Check if the start inside the busy region
if ((((curStart >= busyStart) && (curStart < busyEnd)) ||
// Check if the end inside the busy region
((curEnd > busyStart) && (curEnd <= busyEnd)) ||
// Check if the start/end cover the busy region
((curStart <= busyStart) && (curEnd >= busyEnd))) &&
// If the buys region was written or the current one is for write
(!memObjectsInQueue_[j].readOnly_ || !readOnly)) {
flushL1Cache = true;
break;
}
}
// Did we reach the limit?
if (maxMemObjectsInQueue_ <= (numMemObjectsInQueue_ + 1)) {
flushL1Cache = true;
}
if (flushL1Cache) {
// Sync AQL packets
gpu.setAqlHeader(kDispatchPacketHeader);
// Clear memory dependency state
const static bool All = true;
clear(!All);
}
// Insert current memory object into the queue always,
// since runtime calls flush before kernel execution and it has to keep
// current kernel in tracking
memObjectsInQueue_
[numMemObjectsInQueue_].start_ = curStart;
memObjectsInQueue_
[numMemObjectsInQueue_].end_ = curEnd;
memObjectsInQueue_
[numMemObjectsInQueue_].readOnly_ = readOnly;
numMemObjectsInQueue_++;
}
void
VirtualGPU::MemoryDependency::clear(bool all)
{
if (numMemObjectsInQueue_ > 0) {
size_t i, j;
if (all) {
endMemObjectsInQueue_ = numMemObjectsInQueue_;
}
// Preserve all objects from the current kernel
for (i = 0, j = endMemObjectsInQueue_; j < numMemObjectsInQueue_; i++, j++) {
memObjectsInQueue_[i].start_ = memObjectsInQueue_[j].start_;
memObjectsInQueue_[i].end_ = memObjectsInQueue_[j].end_;
memObjectsInQueue_[i].readOnly_ = memObjectsInQueue_[j].readOnly_;
}
// Clear all objects except current kernel
memset(&memObjectsInQueue_[i], 0, sizeof(amd::Memory*) * numMemObjectsInQueue_);
numMemObjectsInQueue_ -= endMemObjectsInQueue_;
endMemObjectsInQueue_ = 0;
}
}
bool
VirtualGPU::processMemObjects(
const amd::Kernel& kernel,
const_address params)
{
static const bool NoAlias = true;
const Kernel& hsaKernel = static_cast<const Kernel&>
(*(kernel.getDeviceKernel(dev(), NoAlias)));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// AQL packets
setAqlHeader(kDispatchPacketHeaderNoSync);
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
memoryDependency().newKernel();
bool deviceSupportFGS = 0 != dev().isFineGrainedSystem(true);
bool supportFineGrainedSystem = deviceSupportFGS;
FGSStatus status = kernelParams.getSvmSystemPointersSupport();
switch (status) {
case FGS_YES:
if (!deviceSupportFGS) {
return false;
}
supportFineGrainedSystem = true;
break;
case FGS_NO:
supportFineGrainedSystem = false;
break;
case FGS_DEFAULT:
default:
break;
}
size_t count = kernelParams.getNumberOfSvmPtr();
size_t execInfoOffset = kernelParams.getExecInfoOffset();
bool sync = true;
amd::Memory* memory = NULL;
//get svm non arugment information
void* const* svmPtrArray =
reinterpret_cast<void* const*>(params + execInfoOffset);
for (size_t i = 0; i < count; i++) {
memory = amd::SvmManager::FindSvmBuffer(svmPtrArray[i]);
if (NULL == memory) {
if (!supportFineGrainedSystem) {
return false;
}
else if (sync) {
// Sync AQL packets
setAqlHeader(kDispatchPacketHeader);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
}
}
else {
Memory* gpuMemory = static_cast<Memory*>(memory->getDeviceMemory(dev()));
if (NULL != gpuMemory) {
const static bool IsReadOnly = false;
// Validate SVM passed in the non argument list
memoryDependency().validate(*this, gpuMemory, IsReadOnly);
}
else {
return false;
}
}
}
// Check all parameters for the current kernel
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const HsailKernelArg* arg = hsaKernel.hsailArgAt(i);
Memory* memory = NULL;
bool readOnly = false;
amd::Memory* svmMem = NULL;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) && (arg->addrQual_ != HSAIL_ADDRESS_LOCAL)) {
if (kernelParams.boundToSvmPointer(dev(), params, i)) {
svmMem = amd::SvmManager::FindSvmBuffer(
*reinterpret_cast<void* const*>(params + desc.offset_));
if (!svmMem) {
// Sync AQL packets
setAqlHeader(kDispatchPacketHeader);
// Clear memory dependency state
const static bool All = true;
memoryDependency().clear(!All);
continue;
}
}
if (*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_) != NULL) {
if (NULL == svmMem) {
memory = static_cast<Memory*>((*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_))->getDeviceMemory(dev()));
}
else {
memory = static_cast<Memory*>(svmMem->getDeviceMemory(dev()));
}
}
if (memory != NULL) {
// Check image
readOnly = (desc.accessQualifier_ ==
CL_KERNEL_ARG_ACCESS_READ_ONLY) ? true : false;
// Check buffer
readOnly |= (arg->access_ == HSAIL_ACCESS_TYPE_RO) ? true : false;
// Validate memory for a dependency in the queue
memoryDependency().validate(*this, memory, readOnly);
}
}
}
return true;
}
template<typename AqlPacket>
bool
VirtualGPU::dispatchGenericAqlPacket(
AqlPacket* packet,
bool blocking)
{
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
//Check for queue full and wait if needed.
uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
hsa_signal_t signal;
// TODO: placeholder to setup the kernel to populate start and end timestamp.
if (timestamp_ != nullptr) {
// Find signal slot
ProfilingSignal* profilingSignal = &signal_pool_[index & queueMask];
// Make sure we save the old results in the TS structure
if (profilingSignal->ts_ != nullptr) {
profilingSignal->ts_->checkGpuTime();
}
// Update the new TS with the signal info
timestamp_->setProfilingSignal(profilingSignal);
packet->completion_signal = profilingSignal->signal_;
profilingSignal->ts_ = timestamp_;
timestamp_->setAgent(gpu_device_);
}
if ((index - read) == queueMask) {
if (packet->completion_signal.handle == 0) {
packet->completion_signal = barrier_signal_;
}
signal = packet->completion_signal;
// Initialize signal for a wait
hsa_signal_store_relaxed(signal, InitSignalValue);
blocking = true;
}
//Insert packet
((AqlPacket*)(
gpu_queue_->base_address))[index & queueMask] = *packet;
hsa_queue_store_write_index_release(gpu_queue_, index + 1);
hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
//Wait on signal ?
if (blocking) {
if (hsa_signal_wait_acquire(
signal, HSA_SIGNAL_CONDITION_LT, 1, uint64_t(-1),
HSA_WAIT_STATE_BLOCKED) != 0) {
LogPrintfError("Failed signal [0x%lx] wait", signal.handle);
return false;
}
// Release the pool, since runtime just drained the entire queue
resetKernArgPool();
}
return true;
}
bool
VirtualGPU::dispatchAqlPacket(
hsa_kernel_dispatch_packet_t* packet,
bool blocking)
{
return dispatchGenericAqlPacket(packet, blocking);
}
bool
VirtualGPU::dispatchAqlPacket(
hsa_barrier_and_packet_t* packet,
bool blocking)
{
return dispatchGenericAqlPacket(packet, blocking);
}
void
VirtualGPU::dispatchBarrierPacket(const hsa_barrier_and_packet_t* packet)
{
assert(packet->completion_signal.handle != 0);
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
uint64_t index = hsa_queue_load_write_index_relaxed(gpu_queue_);
((hsa_barrier_and_packet_t*)(
gpu_queue_->base_address))[index&queueMask] = *packet;
hsa_queue_store_write_index_relaxed(gpu_queue_, index + 1);
hsa_signal_store_relaxed(gpu_queue_->doorbell_signal, index);
}
/**
* @brief Waits on an outstanding kernel without regard to how
* it was dispatched - with or without a signal
*
* @return bool true if Wait returned successfully, false
* otherwise
*/
bool VirtualGPU::releaseGpuMemoryFence() {
// Return if there is no pending dispatch
if (!hasPendingDispatch_) {
return false;
}
// Initialize signal for the barrier packet.
hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);
// Dispatch barrier packet into the queue and wait till it finishes.
dispatchBarrierPacket(&barrier_packet_);
if (hsa_signal_wait_acquire(
barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
HSA_WAIT_STATE_BLOCKED) != 0) {
LogError("Barrier packet submission failed");
return false;
}
hasPendingDispatch_ = false;
// Release all memory dependencies
memoryDependency().clear();
// Release the pool, since runtime just completed a barrier
resetKernArgPool();
return true;
}
VirtualGPU::VirtualGPU(Device &device)
: device::VirtualDevice(device)
, roc_device_(device)
{
gpu_device_ = device.getBackendDevice();
// Initialize the last signal and dispatch flags
timestamp_ = NULL;
hasPendingDispatch_ = false;
tools_lib_ = NULL;
kernarg_pool_base_ = NULL;
kernarg_pool_size_ = 0;
kernarg_pool_cur_offset_ = 0;
aqlHeader_ = kDispatchPacketHeaderNoSync;
barrier_signal_.handle = 0;
}
VirtualGPU::~VirtualGPU()
{
if (timestamp_ != NULL) {
delete timestamp_;
timestamp_ = NULL;
LogError("There was a timestamp that was not used; deleting.");
}
if (printfdbg_ != NULL){
delete printfdbg_;
printfdbg_ = NULL;
}
tools_lib_ = NULL;
}
bool
VirtualGPU::create(bool profilingEna)
{
// Set the event handle to the tools lib if the env var
// Load the library using its advertised "soname"
std::string lib_name = amd::Os::getEnvironment("HSA_TOOLS_LIB");
if (lib_name != "") {
#if defined(_WIN32) || defined(__CYGWIN__)
const char *tools_lib_name = "hsa-runtime-tools" LP64_SWITCH("", "64") ".dll";
#else
const char *tools_lib_name = "libhsa-runtime-tools" LP64_SWITCH("", "64") ".so.1";
#endif
tools_lib_ = amd::Os::loadLibrary(tools_lib_name);
}
uint32_t queue_max_packets = 0;
if (HSA_STATUS_SUCCESS !=
hsa_agent_get_info(
gpu_device_, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &queue_max_packets)) {
return false;
}
//Pick a reasonable queue size
uint32_t queue_size = 1024;
queue_size = (queue_max_packets < queue_size) ? queue_max_packets : queue_size;
while (hsa_queue_create(gpu_device_,
queue_size, HSA_QUEUE_TYPE_MULTI, NULL, NULL, UINT_MAX, UINT_MAX,
&gpu_queue_) != HSA_STATUS_SUCCESS) {
queue_size >>= 1;
if (queue_size < 64) {
return false;
}
}
if (!initPool(dev().settings().kernargPoolSize_, (profilingEna) ? queue_size : 0)) {
LogError("Couldn't allocate arguments/signals for the queue");
return false;
}
device::BlitManager::Setup blitSetup;
blitMgr_ = new KernelBlitManager(*this, blitSetup);
if ((NULL == blitMgr_) || !blitMgr_->create(roc_device_)) {
LogError("Could not create BlitManager!");
return false;
}
// Create signal for the barrier packet.
hsa_signal_t signal = { 0 };
if (HSA_STATUS_SUCCESS !=
hsa_signal_create(InitSignalValue, 0, NULL, &signal)) {
return false;
}
barrier_signal_ = signal;
// Initialize barrier packet.
memset(&barrier_packet_, 0, sizeof(barrier_packet_));
barrier_packet_.header = kBarrierPacketHeader;
barrier_packet_.completion_signal = barrier_signal_;
// Create a object of PrintfDbg
printfdbg_ = new PrintfDbg(roc_device_);
if (NULL == printfdbg_) {
LogError("\nCould not create printfDbg Object!");
return false;
}
// Initialize timestamp conversion factor
if (Timestamp::getGpuTicksToTime() == 0) {
uint64_t frequency;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &frequency);
Timestamp::setGpuTicksToTime(1e9/double(frequency));
}
if (!memoryDependency().create(GPU_NUM_MEM_DEPENDENCY)) {
LogError("Could not create the array of memory objects!");
return false;
}
return true;
}
bool
VirtualGPU::terminate()
{
delete blitMgr_;
// Release the resources of signal
releaseGpuMemoryFence();
hsa_status_t err = hsa_queue_destroy(gpu_queue_);
if (err != HSA_STATUS_SUCCESS) {
return false;
}
if (barrier_signal_.handle != 0) {
hsa_signal_destroy(barrier_signal_);
}
if (tools_lib_) {
amd::Os::unloadLibrary(tools_lib_);
tools_lib_ = NULL;
}
destroyPool();
return true;
}
bool
VirtualGPU::initPool(size_t kernarg_pool_size, uint signal_pool_count)
{
kernarg_pool_size_ = kernarg_pool_size;
kernarg_pool_base_ = reinterpret_cast<char *>(
roc_device_.hostAlloc(kernarg_pool_size_, 1, true));
if (kernarg_pool_base_ == nullptr) {
return false;
}
if (signal_pool_count != 0) {
signal_pool_.resize(signal_pool_count);
for (uint i = 0; i < signal_pool_count; ++i) {
ProfilingSignal profilingSignal;
if (HSA_STATUS_SUCCESS != hsa_signal_create(
0, 0, nullptr, &profilingSignal.signal_)) {
return false;
}
signal_pool_[i] = profilingSignal;
}
}
return true;
}
void
VirtualGPU::destroyPool() {
if (kernarg_pool_base_ != nullptr) {
roc_device_.hostFree(kernarg_pool_base_, kernarg_pool_size_);
}
if (signal_pool_.size() > 0) {
for (uint i = 0; i < signal_pool_.size(); ++i) {
hsa_signal_destroy(signal_pool_[i].signal_);
}
}
}
void*
VirtualGPU::allocKernArg(size_t size, size_t alignment)
{
char* result = nullptr;
do {
result = amd::alignUp(kernarg_pool_base_ + kernarg_pool_cur_offset_, alignment);
const size_t pool_new_usage = (result + size) - kernarg_pool_base_;
if (pool_new_usage <= kernarg_pool_size_) {
kernarg_pool_cur_offset_ = pool_new_usage;
return result;
}
else {
//! We run out of the arguments space!
//! That means the app didn't call clFlush/clFinish for very long time.
//! We can issue a barrier to avoid expensive extra memory allocations.
// Initialize signal for the barrier packet.
hsa_signal_store_relaxed(barrier_signal_, InitSignalValue);
// Dispatch barrier packet into the queue and wait till it finishes.
dispatchBarrierPacket(&barrier_packet_);
if (hsa_signal_wait_acquire(
barrier_signal_, HSA_SIGNAL_CONDITION_EQ, 0, uint64_t(-1),
HSA_WAIT_STATE_BLOCKED) != 0) {
LogError("Kernel arguments reset failed");
}
resetKernArgPool();
}
} while (true);
return result;
}
/* profilingBegin, when profiling is enabled, creates a timestamp to save in
* virtualgpu's timestamp_, and calls start() to get the current host
* timestamp.
*/
void VirtualGPU::profilingBegin(amd::Command &command, bool drmProfiling)
{
if (command.profilingInfo().enabled_) {
if (timestamp_ != NULL) {
LogWarning("Trying to create a second timestamp in VirtualGPU. \
This could have unintended consequences.");
return;
}
timestamp_ = new Timestamp;
timestamp_->start();
}
}
/* profilingEnd, when profiling is enabled, checks to see if a signal was
* created for whatever command we are running and calls end() to get the
* current host timestamp if no signal is available. It then saves the pointer
* timestamp_ to the command's data.
*/
void VirtualGPU::profilingEnd(amd::Command &command)
{
if (command.profilingInfo().enabled_) {
if (timestamp_->getProfilingSignal() == nullptr) {
timestamp_->end();
}
command.setData(reinterpret_cast<void*>(timestamp_));
timestamp_ = NULL;
}
}
struct DestroySampler : public std::binary_function<hsa_ext_sampler_t,
hsa_agent_t, bool> {
bool operator() (hsa_ext_sampler_t &sampler,
hsa_agent_t agent) const {
hsa_status_t status = hsa_ext_sampler_destroy(agent, sampler);
return status == HSA_STATUS_SUCCESS;
}
};
void VirtualGPU::updateCommandsState(amd::Command *list)
{
Timestamp *ts = NULL;
amd::Command* current = list;
amd::Command* next = NULL;
if (current == NULL) {
return;
}
uint64_t endTimeStamp = 0;
uint64_t startTimeStamp = endTimeStamp;
if (current->profilingInfo().enabled_) {
// TODO: use GPU timestamp when available.
endTimeStamp = amd::Os::timeNanos();
startTimeStamp = endTimeStamp;
// This block gets the first valid timestamp from the first command
// that has one. This timestamp is used below to mark any command that
// came before it to start and end with this first valid start time.
current = list;
while (current != NULL) {
if (current->data() != NULL) {
ts = reinterpret_cast<Timestamp*>(current->data());
startTimeStamp = ts->getStart();
endTimeStamp = ts->getStart();
break;
}
current = current->getNext();
}
}
// Iterate through the list of commands, and set timestamps as appropriate
// Note, if a command does not have a timestamp, it does one of two things:
// - if the command (without a timestamp), A, precedes another command, C,
// that _does_ contain a valid timestamp, command A will set RUNNING and
// COMPLETE with the RUNNING (start) timestamp from command C. This would
// also be true for command B, which is between A and C. These timestamps
// are actually retrieved in the block above (startTimeStamp, endTimeStamp).
// - if the command (without a timestamp), C, follows another command, A,
// that has a valid timestamp, command C will be set RUNNING and COMPLETE
// with the COMPLETE (end) timestamp of the previous command, A. This is
// also true for any command B, which falls between A and C.
current = list;
while (current != NULL) {
if (current->profilingInfo().enabled_) {
if (current->data() != NULL) {
// Since this is a valid command to get a timestamp, we use the
// timestamp provided by the runtime (saved in the data())
ts = reinterpret_cast<Timestamp*>(current->data());
startTimeStamp = ts->getStart();
endTimeStamp = ts->getEnd();
delete ts;
current->setData(NULL);
}
else {
// If we don't have a command that contains a valid timestamp,
// we simply use the end timestamp of the previous command.
// Note, if this is a command before the first valid timestamp,
// this will be equal to the start timestamp of the first valid
// timestamp at this point.
startTimeStamp = endTimeStamp;
}
}
if (current->status() == CL_SUBMITTED) {
current->setStatus(CL_RUNNING, startTimeStamp);
current->setStatus(CL_COMPLETE, endTimeStamp);
}
else if (current->status() != CL_COMPLETE) {
LogPrintfError("Unexpected command status - %d.", current->status());
}
next = current->getNext();
current->release();
current = next;
}
// Release the sampler handles allocated for the various
// on one or more kernel submissions
std::for_each(samplerList_.begin(),
samplerList_.end(),
std::bind2nd<DestroySampler>(DestroySampler(), gpu_device_));
samplerList_.clear();
return;
}
void VirtualGPU::submitReadMemory(amd::ReadMemoryCommand &cmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
size_t offset = 0;
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.destination(), &offset);
device::Memory *devMem = cmd.source().getDeviceMemory(dev());
void *dst = cmd.destination();
amd::Coord3D size = cmd.size();
//! @todo: add multi-devices synchronization when supported.
cl_command_type type = cmd.type();
bool result = false;
bool imageBuffer = false;
// Force buffer read for IMAGE1D_BUFFER
if ((type == CL_COMMAND_READ_IMAGE) &&
(cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
type = CL_COMMAND_READ_BUFFER;
imageBuffer = true;
}
switch (type) {
case CL_COMMAND_READ_BUFFER: {
amd::Coord3D origin(cmd.origin()[0]);
if (imageBuffer) {
size_t elemSize =
cmd.source().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (hostMemory != nullptr) {
// Accelerated transfer without pinning
amd::Coord3D dstOrigin(offset);
result = blitMgr().copyBuffer(*devMem, *hostMemory,
origin, dstOrigin, size, cmd.isEntireMemory());
}
else {
result = blitMgr().readBuffer(
*devMem, dst, origin, size,
cmd.isEntireMemory());
}
break;
}
case CL_COMMAND_READ_BUFFER_RECT: {
result = blitMgr().readBufferRect(
*devMem, dst, cmd.bufRect(), cmd.hostRect(), size,
cmd.isEntireMemory());
break;
}
case CL_COMMAND_READ_IMAGE: {
result = blitMgr().readImage(
*devMem, dst, cmd.origin(), size, cmd.rowPitch(),
cmd.slicePitch(), cmd.isEntireMemory());
break;
}
default:
ShouldNotReachHere();
break;
}
if (!result) {
LogError("submitReadMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
profilingEnd(cmd);
}
void VirtualGPU::submitWriteMemory(amd::WriteMemoryCommand &cmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
size_t offset = 0;
// Find if virtual address is a CL allocation
device::Memory* hostMemory = dev().findMemoryFromVA(cmd.source(), &offset);
device::Memory *devMem = cmd.destination().getDeviceMemory(dev());
const char *src = static_cast<const char *>(cmd.source());
amd::Coord3D size = cmd.size();
//! @todo add multi-devices synchronization when supported.
cl_command_type type = cmd.type();
bool result = false;
bool imageBuffer = false;
// Force buffer write for IMAGE1D_BUFFER
if ((type == CL_COMMAND_WRITE_IMAGE) &&
(cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
type = CL_COMMAND_WRITE_BUFFER;
imageBuffer = true;
}
switch (type) {
case CL_COMMAND_WRITE_BUFFER: {
amd::Coord3D origin(cmd.origin()[0]);
if (imageBuffer) {
size_t elemSize =
cmd.destination().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (hostMemory != nullptr) {
// Accelerated transfer without pinning
amd::Coord3D srcOrigin(offset);
result = blitMgr().copyBuffer(*hostMemory, *devMem,
srcOrigin, origin, size, cmd.isEntireMemory());
}
else {
result = blitMgr().writeBuffer(
src, *devMem , origin, size,
cmd.isEntireMemory());
}
break;
}
case CL_COMMAND_WRITE_BUFFER_RECT: {
result = blitMgr().writeBufferRect(
src, *devMem, cmd.hostRect(), cmd.bufRect(), size,
cmd.isEntireMemory());
break;
}
case CL_COMMAND_WRITE_IMAGE: {
result = blitMgr().writeImage(
src, *devMem, cmd.origin(), size, cmd.rowPitch(),
cmd.slicePitch(), cmd.isEntireMemory());
break;
}
default:
ShouldNotReachHere();
break;
}
if (!result) {
LogError("submitWriteMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
else {
cmd.destination().signalWrite(&dev());
}
profilingEnd(cmd);
}
void VirtualGPU::submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd)
{
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
const std::vector<void*>& svmPointers = cmd.svmPointers();
if (cmd.pfnFreeFunc() == NULL) {
// pointers allocated using clSVMAlloc
for (cl_uint i = 0; i < svmPointers.size(); i++) {
amd::SvmBuffer::free(cmd.context(), svmPointers[i]);
}
}
else {
cmd.pfnFreeFunc()(as_cl(cmd.queue()->asCommandQueue()), svmPointers.size(),
(void**) (&(svmPointers[0])), cmd.userData());
}
profilingEnd(cmd);
}
void VirtualGPU::submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd)
{
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
amd::SvmBuffer::memFill(cmd.dst(), cmd.src(), cmd.srcSize(), 1);
profilingEnd(cmd);
}
void VirtualGPU::submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd)
{
// in-order semantics: previous commands need to be done before we start
releaseGpuMemoryFence();
profilingBegin(cmd);
amd::SvmBuffer::memFill(cmd.dst(), cmd.pattern(), cmd.patternSize(), cmd.times());
profilingEnd(cmd);
}
void VirtualGPU::submitCopyMemory(amd::CopyMemoryCommand &cmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
device::Memory *srcDevMem = cmd.source().getDeviceMemory(dev());
device::Memory *destDevMem = cmd.destination().getDeviceMemory(dev());
amd::Coord3D size = cmd.size();
//! @todo add multi-devices synchronization when supported.
cl_command_type type = cmd.type();
bool result = false;
bool srcImageBuffer = false;
bool dstImageBuffer = false;
// Force buffer copy for IMAGE1D_BUFFER
if (cmd.source().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
srcImageBuffer = true;
type = CL_COMMAND_COPY_BUFFER;
}
if (cmd.destination().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER) {
dstImageBuffer = true;
type = CL_COMMAND_COPY_BUFFER;
}
switch (cmd.type()) {
case CL_COMMAND_COPY_BUFFER: {
amd::Coord3D srcOrigin(cmd.srcOrigin()[0]);
amd::Coord3D dstOrigin(cmd.dstOrigin()[0]);
if (srcImageBuffer) {
const size_t elemSize =
cmd.source().asImage()->getImageFormat().getElementSize();
srcOrigin.c[0] *= elemSize;
if (dstImageBuffer) {
dstOrigin.c[0] *= elemSize;
}
size.c[0] *= elemSize;
}
else if (dstImageBuffer) {
const size_t elemSize =
cmd.destination().asImage()->getImageFormat().getElementSize();
dstOrigin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
result = blitMgr().copyBuffer(
*srcDevMem, *destDevMem, srcOrigin,
dstOrigin, size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_BUFFER_RECT: {
result = blitMgr().copyBufferRect(
*srcDevMem, *destDevMem, cmd.srcRect(),
cmd.dstRect(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_IMAGE: {
result = blitMgr().copyImage(
*srcDevMem, *destDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_IMAGE_TO_BUFFER: {
result = blitMgr().copyImageToBuffer(
*srcDevMem, *destDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
case CL_COMMAND_COPY_BUFFER_TO_IMAGE: {
result = blitMgr().copyBufferToImage(
*srcDevMem, *destDevMem, cmd.srcOrigin(),
cmd.dstOrigin(), size, cmd.isEntireMemory());
break;
}
default:
ShouldNotReachHere();
break;
}
if (!result) {
LogError("submitCopyMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
cmd.destination().signalWrite(&dev());
profilingEnd(cmd);
}
void VirtualGPU::submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd)
{
// No fence is needed since this is a no-op: the
// command will be completed only after all the
// previous commands are complete
profilingBegin(cmd);
profilingEnd(cmd);
}
void VirtualGPU::submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd)
{
// No fence is needed since this is a no-op: the
// command will be completed only after all the
// previous commands are complete
profilingBegin(cmd);
profilingEnd(cmd);
}
void VirtualGPU::submitMapMemory(amd::MapMemoryCommand &cmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
//! @todo add multi-devices synchronization when supported.
roc::Memory *devMemory = reinterpret_cast<roc::Memory *>(
cmd.memory().getDeviceMemory(dev(), false));
cl_command_type type = cmd.type();
bool imageBuffer = false;
// Save map requirement.
cl_map_flags mapFlag = cmd.mapFlags();
// Treat no map flag as read-write.
if (mapFlag == 0) {
mapFlag = CL_MAP_READ | CL_MAP_WRITE;
}
devMemory->saveMapInfo(cmd.mapPtr(), cmd.origin(), cmd.size(),
mapFlag, cmd.isEntireMemory());
// Sync to the map target.
if (devMemory->isHostMemDirectAccess()) {
// Add memory to VA cache, so rutnime can detect direct access to VA
dev().addVACache(devMemory);
}
if ((!devMemory->isHostMemDirectAccess()) &&
(mapFlag & (CL_MAP_READ | CL_MAP_WRITE))) {
bool result = false;
roc::Memory *hsaMemory = static_cast<roc::Memory *>(devMemory);
amd::Memory* mapMemory = hsaMemory->mapMemory();
void *hostPtr = mapMemory == NULL ?
hsaMemory->owner()->getHostMem() :
mapMemory->getHostMem();
if (type == CL_COMMAND_MAP_BUFFER) {
amd::Coord3D origin(cmd.origin()[0]);
amd::Coord3D size(cmd.size()[0]);
amd::Coord3D dstOrigin(cmd.origin()[0], 0, 0);
if (imageBuffer) {
size_t elemSize =
cmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (mapMemory != nullptr) {
roc::Memory *hsaMapMemory = static_cast<roc::Memory *>(
mapMemory->getDeviceMemory(dev(), false));
result = blitMgr().copyBuffer(*hsaMemory,
*hsaMapMemory, origin, dstOrigin,
size, cmd.isEntireMemory());
}
else {
result = blitMgr().readBuffer(
*hsaMemory, static_cast<char *>(hostPtr)+origin[0],
origin, size, cmd.isEntireMemory());
}
}
else if (type == CL_COMMAND_MAP_IMAGE) {
amd::Image* image = cmd.memory().asImage();
result = blitMgr().readImage(
*hsaMemory, hostPtr, amd::Coord3D(0),
image->getRegion(), image->getRowPitch(),
image->getSlicePitch(), true);
}
else {
ShouldNotReachHere();
}
if (!result) {
LogError("submitMapMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
profilingEnd(cmd);
}
void VirtualGPU::submitUnmapMemory(amd::UnmapMemoryCommand &cmd)
{
roc::Memory* devMemory = static_cast<roc::Memory*>(
cmd.memory().getDeviceMemory(dev(), false));
const device::Memory::WriteMapInfo* mapInfo =
devMemory->writeMapInfo(cmd.mapPtr());
if (nullptr == mapInfo) {
LogError("Unmap without map call");
return;
}
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
// Force buffer write for IMAGE1D_BUFFER
bool imageBuffer = (cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER);
if (devMemory->isHostMemDirectAccess()) {
// Remove memory from VA cache
dev().removeVACache(devMemory);
}
if (mapInfo->isUnmapWrite()) {
// Commit the changes made by the user.
if (devMemory->owner()->getHostMem() == nullptr) {
bool result = false;
if (cmd.memory().asImage() && !imageBuffer) {
amd::Image *image = cmd.memory().asImage();
result = blitMgr().writeImage(
cmd.mapPtr(), *devMemory,
mapInfo->origin_,
mapInfo->region_,
image->getRowPitch(), image->getSlicePitch());
}
else {
amd::Coord3D origin(mapInfo->origin_[0]);
amd::Coord3D size(mapInfo->region_[0]);
if (imageBuffer) {
size_t elemSize =
cmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
}
if (devMemory->mapMemory() != nullptr) {
roc::Memory *mapMemory = static_cast<roc::Memory *>(
devMemory->mapMemory()->getDeviceMemory(dev(), false));
result = blitMgr().copyBuffer(
*mapMemory, *devMemory,
mapInfo->origin_,
mapInfo->origin_,
mapInfo->region_,
mapInfo->isEntire());
}
else {
result = blitMgr().writeBuffer(
cmd.mapPtr(), *devMemory, origin, size);
}
}
if (!result) {
LogError("submitMapMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
}
cmd.memory().signalWrite(&dev());
}
devMemory->clearUnmapInfo(cmd.mapPtr());
profilingEnd(cmd);
}
void VirtualGPU::submitFillMemory(amd::FillMemoryCommand &cmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(cmd);
device::Memory *devMemory = cmd.memory().getDeviceMemory(dev(), false);
//! @todo add multi-devices synchronization when supported.
cl_command_type type = cmd.type();
bool result = false;
bool imageBuffer = false;
float fillValue[4];
// Force fill buffer for IMAGE1D_BUFFER
if ((type == CL_COMMAND_FILL_IMAGE) &&
(cmd.memory().getType() == CL_MEM_OBJECT_IMAGE1D_BUFFER)) {
type = CL_COMMAND_FILL_BUFFER;
imageBuffer = true;
}
// Find the the right fill operation
switch (type) {
case CL_COMMAND_FILL_BUFFER: {
const void* pattern = cmd.pattern();
size_t patternSize = cmd.patternSize();
amd::Coord3D origin(cmd.origin()[0]);
amd::Coord3D size(cmd.size()[0]);
// Reprogram fill parameters if it's an IMAGE1D_BUFFER object
if (imageBuffer) {
size_t elemSize =
cmd.memory().asImage()->getImageFormat().getElementSize();
origin.c[0] *= elemSize;
size.c[0] *= elemSize;
memset(fillValue, 0, sizeof(fillValue));
cmd.memory().asImage()->getImageFormat().formatColor(pattern,
fillValue);
pattern = fillValue;
patternSize = elemSize;
}
result = blitMgr().fillBuffer(
*devMemory, pattern, patternSize, origin, size,
cmd.isEntireMemory());
break;
}
case CL_COMMAND_FILL_IMAGE: {
result = blitMgr().fillImage(
*devMemory, cmd.pattern(), cmd.origin(), cmd.size(),
cmd.isEntireMemory());
break;
}
default:
ShouldNotReachHere();
break;
}
if (!result) {
LogError("submitFillMemory failed!");
cmd.setStatus(CL_OUT_OF_RESOURCES);
}
cmd.memory().signalWrite(&dev());
profilingEnd(cmd);
}
void VirtualGPU::submitMigrateMemObjects(amd::MigrateMemObjectsCommand &vcmd)
{
// Wait on a kernel if one is outstanding
releaseGpuMemoryFence();
profilingBegin(vcmd);
std::vector<amd::Memory *>::const_iterator itr;
for (itr = vcmd.memObjects().begin();
itr != vcmd.memObjects().end();
itr++) {
// Find device memory
device::Memory *m = (*itr)->getDeviceMemory(dev());
roc::Memory *memory = static_cast<roc::Memory *>(m);
if (vcmd.migrationFlags() & CL_MIGRATE_MEM_OBJECT_HOST) {
//! @todo revisit this when multi devices is supported.
} else if (vcmd.migrationFlags() &
CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED) {
//! @todo revisit this when multi devices is supported.
} else {
LogWarning("Unknown operation for memory migration!");
}
}
profilingEnd(vcmd);
}
/*! \brief Writes to the buffer and incrememts the write pointer to the
* buffer. Also, ensures that the argument is written to an
* aligned memory as specified
*
* @param dst The write pointer to the buffer
* @param src The source pointer
* @param size The size in bytes to copy
* @param alignment The alignment to follow while writing to the buffer
*/
static void
addArg(unsigned char** dst, const void* src,
size_t size, uint32_t alignment)
{
*dst = amd::alignUp(*dst, alignment);
memcpy(*dst, src, size);
*dst += size;
}
static inline void
addArg(unsigned char** dst, const void* src, size_t size)
{
assert(size < UINT32_MAX);
addArg(dst, src, size, size);
}
//Over rides the workgroup size fields in the packet with runtime/compiler set sizes
void setRuntimeCompilerLocalSize(hsa_kernel_dispatch_packet_t& dispatchPacket,
amd::NDRangeContainer sizes,
const size_t* compile_size,
const roc::Device &dev){
//Todo (sramalin) need to check if compile_size is set to 0 if dimension is not valid
// else this error check is incorrect
if (compile_size[0] || compile_size[1] || compile_size[2]) {
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? compile_size[0] : 1;
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? compile_size[1] : 1;
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? compile_size[2] : 1;
}
else {
//Runtime must set the group size
dispatchPacket.workgroup_size_x = 1;
dispatchPacket.workgroup_size_y = 1;
dispatchPacket.workgroup_size_z = 1;
if (sizes.dimensions() == 1) {
dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize_;
}
else if (sizes.dimensions() == 2) {
dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize2DX_;
dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize2DY_;
}
else if (sizes.dimensions() == 3) {
dispatchPacket.workgroup_size_x = dev.settings().maxWorkGroupSize3DX_;
dispatchPacket.workgroup_size_y = dev.settings().maxWorkGroupSize3DY_;
dispatchPacket.workgroup_size_z = dev.settings().maxWorkGroupSize3DZ_;
}
}
}
static void
fillSampleDescriptor(
hsa_ext_sampler_descriptor_t& samplerDescriptor,
const amd::Sampler& sampler)
{
samplerDescriptor.filter_mode = sampler.filterMode() == CL_FILTER_NEAREST ?
HSA_EXT_SAMPLER_FILTER_MODE_NEAREST : HSA_EXT_SAMPLER_FILTER_MODE_LINEAR;
samplerDescriptor.coordinate_mode = sampler.normalizedCoords() ?
HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED :
HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED;
switch (sampler.addressingMode()) {
case CL_ADDRESS_CLAMP_TO_EDGE:
samplerDescriptor.address_mode =
HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE;
break;
case CL_ADDRESS_REPEAT:
samplerDescriptor.address_mode =
HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT;
break;
case CL_ADDRESS_CLAMP:
samplerDescriptor.address_mode =
HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER;
break;
case CL_ADDRESS_MIRRORED_REPEAT:
samplerDescriptor.address_mode =
HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT;
break;
case CL_ADDRESS_NONE:
samplerDescriptor.address_mode =
HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED;
break;
default:
return;
}
}
bool
VirtualGPU::submitKernelInternal(
const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel,
const_address parameters,
void *eventHandle)
{
if (tools_lib_) {
SetOclCorrelationHandle(tools_lib_, this->gpu_device_, eventHandle);
}
device::Kernel *devKernel = const_cast<device::Kernel *>
(kernel.getDeviceKernel(dev()));
Kernel &gpuKernel = static_cast<Kernel &>(*devKernel);
const size_t compilerLdsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
size_t ldsUsage = compilerLdsUsage;
// Check memory dependency and SVM objects
if (!processMemObjects(kernel, parameters)) {
LogError("Wrong memory objects!");
return false;
}
// Init PrintfDbg object if printf is enabled.
bool printfEnabled = (gpuKernel.printfInfo().size() > 0) ? true : false;
if (!printfDbg()->init(printfEnabled)){
LogError("\nPrintfDbg object initialization failed!");
return false;
}
// Allocate buffer to hold kernel arguments
address argBuffer =
(address)allocKernArg(gpuKernel.KernargSegmentByteSize(),
gpuKernel.KernargSegmentAlignment());
if (argBuffer == NULL) {
LogError("Out of memory");
return false;
}
address argPtr = argBuffer;
// The HLC generates Kernenv arguments, first 3 are global offsets.
const uint extraAargs = ((roc::Kernel*)devKernel)->extraArgumentsNum();
for (uint j = 0; j < extraAargs; ++j) {
// The 4th parameter is the pointer to print buffer
if (3 == j) {
address bufferPtr = printfDbg()->dbgBuffer();
addArg(&argPtr, &bufferPtr, sizeof(void*));
}else {
const size_t offset = j < sizes.dimensions() ? sizes.offset()[j] : 0;
addArg(&argPtr, &offset, sizeof(void*)); //Should be uint32_t for small model and uint64_t for large!
}
}
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Find all parameters for the current kernel
for (uint i = 0; i != signature.numParameters(); ++i) {
const HsailKernelArg* arg = gpuKernel.hsailArgAt(i);
const_address srcArgPtr = parameters + signature.at(i).offset_;
if (arg->type_ == HSAIL_ARGTYPE_POINTER ) {
const size_t size = sizeof(void*);
if (arg->addrQual_ == HSAIL_ADDRESS_LOCAL) {
ldsUsage = amd::alignUp(ldsUsage, arg->alignment_); //!< do we need this?
addArg(&argPtr, &ldsUsage, size);
ldsUsage += *reinterpret_cast<const size_t *>(srcArgPtr);
continue;
}
assert((arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) &&
"Unsupported address qualifier");
if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
addArg(&argPtr, srcArgPtr, size);
continue;
}
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
if (mem == NULL) {
addArg(&argPtr, srcArgPtr, size);
continue;
}
Memory *devMem = static_cast<Memory *>(mem->getDeviceMemory(dev()));
//! @todo add multi-devices synchronization when supported.
void* globalAddress = devMem->getDeviceMemory();
addArg(&argPtr, &globalAddress, size);
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
}
else if (arg->type_ == HSAIL_ARGTYPE_VALUE) {
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
void *mem = allocKernArg(arg->size_, arg->alignment_);
if (mem == NULL) {
LogError("Out of memory");
return false;
}
memcpy(mem, srcArgPtr, arg->size_);
addArg(&argPtr, &mem, sizeof(void*));
continue;
}
for (uint e = 0; e < arg->numElem_; ++e) {
addArg(&argPtr, srcArgPtr, arg->size_);
srcArgPtr += arg->size_;
}
}
else if (arg->type_ == HSAIL_ARGTYPE_IMAGE) {
amd::Memory* mem = *reinterpret_cast<amd::Memory* const*>(srcArgPtr);
Image* image = static_cast<Image *>(mem->getDeviceMemory(dev()));
if (image == NULL) {
LogError("Kernel image argument is not an image object");
return false;
}
if (dev().settings().enableImageHandle_) {
const uint64_t image_srd = image->getHsaImageObject().handle;
assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
addArg(&argPtr, &image_srd, sizeof(image_srd));
}
else {
// Image arguments are of size 48 bytes and are aligned to 16 bytes
addArg(&argPtr, (void *)image->getHsaImageObject().handle,
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
}
//! @todo Compiler has to return read/write attributes
const cl_mem_flags flags = mem->getMemFlags();
if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
mem->signalWrite(&dev());
}
}
else if (arg->type_ == HSAIL_ARGTYPE_SAMPLER) {
amd::Sampler* sampler = *reinterpret_cast<amd::Sampler* const*>(srcArgPtr);
if (sampler == NULL) {
LogError("Kernel sampler argument is not an sampler object");
return false;
}
hsa_ext_sampler_descriptor_t samplerDescriptor;
fillSampleDescriptor(samplerDescriptor, *sampler);
hsa_ext_sampler_t hsa_sampler;
hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
&samplerDescriptor, &hsa_sampler);
if (status != HSA_STATUS_SUCCESS) {
LogError("Error creating device sampler object!");
return false;
}
if (dev().settings().enableImageHandle_) {
uint64_t sampler_srd = hsa_sampler.handle;
addArg(&argPtr, &sampler_srd, sizeof(sampler_srd));
samplerList_.push_back(hsa_sampler);
// TODO: destroy sampler.
}
else {
argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
argPtr += HSA_SAMPLER_OBJECT_SIZE;
hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
}
}
}
// Check there is no arguments' buffer overflow
assert(argPtr <= argBuffer + gpuKernel.KernargSegmentByteSize());
// Check for group memory overflow
//! @todo Check should be in HSA - here we should have at most an assert
assert(roc_device_.info().localMemSizePerCU_ > 0);
if (ldsUsage > roc_device_.info().localMemSizePerCU_) {
LogError("No local memory available\n");
return false;
}
//Initialize the dispatch Packet
hsa_kernel_dispatch_packet_t dispatchPacket;
memset(&dispatchPacket, 0, sizeof(dispatchPacket));
dispatchPacket.kernel_object = gpuKernel.KernelCodeHandle();
dispatchPacket.header = aqlHeader_;
dispatchPacket.setup |= sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
dispatchPacket.grid_size_x = sizes.dimensions()>0 ? sizes.global()[0] : 1;
dispatchPacket.grid_size_y = sizes.dimensions()>1 ? sizes.global()[1] : 1;
dispatchPacket.grid_size_z = sizes.dimensions()>2 ? sizes.global()[2] : 1;
const size_t* compile_size = devKernel->workGroupInfo()->compileSize_;
if (sizes.local().product() != 0) {
dispatchPacket.workgroup_size_x = sizes.dimensions()>0 ? sizes.local()[0] : 1;
dispatchPacket.workgroup_size_y = sizes.dimensions()>1 ? sizes.local()[1] : 1;
dispatchPacket.workgroup_size_z = sizes.dimensions()>2 ? sizes.local()[2] : 1;
} else {
setRuntimeCompilerLocalSize(dispatchPacket, sizes, compile_size, dev());
}
dispatchPacket.kernarg_address = argBuffer;
dispatchPacket.group_segment_size = ldsUsage;
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
//Dispatch the packet
if (!dispatchAqlPacket(&dispatchPacket, false)){
return false;
}
// Mark the flag indicating if a dispatch is outstanding.
// We are not waiting after every dispatch.
hasPendingDispatch_ = true;
// Output printf buffer
if(!printfDbg()->output(*this, printfEnabled, gpuKernel.printfInfo())){
LogError("\nCould not print data from the printf buffer!");
return false;
}
return true;
}
/**
* @brief Api to dispatch a kernel for execution. The implementation
* parses the input object, an instance of virtual command to obtain
* the parameters of global size, work group size, offsets of work
* items, enable/disable profiling, etc.
*
* It also parses the kernel arguments buffer to inject into Hsa Runtime
* the list of kernel parameters.
*/
void VirtualGPU::submitKernel(amd::NDRangeKernelCommand &vcmd) {
profilingBegin(vcmd);
// Submit kernel to HW
if (!submitKernelInternal(
vcmd.sizes(), vcmd.kernel(), vcmd.parameters(),
static_cast<void *>(as_cl(&vcmd.event())))) {
LogError("AQL dispatch failed!");
vcmd.setStatus(CL_INVALID_OPERATION);
}
profilingEnd(vcmd);
}
void VirtualGPU::submitNativeFn(amd::NativeFnCommand &cmd) {
// std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}
void VirtualGPU::submitMarker(amd::Marker &cmd) {
// std::cout<<__FUNCTION__<<" not implemented"<<"*********"<<std::endl;
}
void VirtualGPU::submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd)
{
profilingBegin(vcmd);
auto fence=kBarrierAcquirePacket;
dispatchAqlPacket(&fence, false);
profilingEnd(vcmd);
}
void VirtualGPU::submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd)
{
profilingBegin(vcmd);
auto fence=kBarrierReleasePacket;
dispatchAqlPacket(&fence, false);
profilingEnd(vcmd);
}
void VirtualGPU::flush(amd::Command *list, bool wait) {
releaseGpuMemoryFence();
updateCommandsState(list);
}
} // End of roc namespace