Files
rocm-systems/rocclr/runtime/device/gpu/gpukernel.cpp
T
foreman bf32cddc03 P4 to Git Change 1053379 by xcui@merged_opencl_jxcwin on 2014/07/08 19:38:01
EPR #304775 - fixed the bug 9838. The svm pointer in the arugment list needs to be tracked to make sure all operation of resource has been finished before we dispatch kernel.

	code review:
	http://ocltc.amd.com/reviews/r/5200/
	precheckin:
	http://ocltc.amd.com:8111/viewModification.html?modId=35125&personal=true&buildTypeId=&tab=vcsModificationTests

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#257 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#322 edit
2014-07-08 19:45:31 -04:00

4051 строка
137 KiB
C++

//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"
#include "acl.h"
#include "SCShadersR678XXCommon.h"
#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
namespace gpu {
const MetaDataConst ArgState[ArgStateTotal] =
{
// Note: the order is important
// Name Type Properties
// Kernel description (special properties)
{ "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } },
{ "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } },
{ "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } },
{ "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } },
{ "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } },
{ "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } },
{ "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } },
{ "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } },
{ "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } },
// Kernel basic types
{ "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } },
{ "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } },
{ "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } },
{ "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } },
{ "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } },
{ "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } },
{ "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } },
{ "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } },
{ "constarg:", KernelArg::ConstArg, { 0, 0, 0, 0, 0, 0, 0 } },
{ "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } },
{ "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } },
{ "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } },
};
const DataTypeConst DataType[] =
{
{ "i8:", KernelArg::Char, },
{ "i16:", KernelArg::Short, },
{ "i32:", KernelArg::Int, },
{ "i64:", KernelArg::Long, },
{ "u8:", KernelArg::UChar, },
{ "u16:", KernelArg::UShort, },
{ "u32:", KernelArg::UInt, },
{ "u64:", KernelArg::ULong, },
{ "float:", KernelArg::Float, },
{ "double:", KernelArg::Double, },
{ "struct:", KernelArg::Struct, },
{ "union:", KernelArg::Union, },
{ "1D:", KernelArg::Image1D, },
{ "2D:", KernelArg::Image2D, },
{ "3D:", KernelArg::Image3D, },
{ "1DB:", KernelArg::Image1DB, },
{ "1DA:", KernelArg::Image1DA, },
{ "2DA:", KernelArg::Image2DA, },
{ "opaque:", KernelArg::Opaque, },
{ "event:", KernelArg::Event, },
{ "sampler:", KernelArg::Sampler, },
{ "half:", KernelArg::Half, },
};
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
struct BufDataConst
{
const char* tagName_; //!< buffer's name
KernelArg::ArgumentType type_; //!< type of argument
struct
{
uint number_ : 1; //!< buffer's number
uint alignment_ : 1; //!< buffer's alignment
uint attribute_ : 1; //!< buffer's read/write attribute
uint reserved : 29; //!< reserved
};
};
static const BufDataConst BufType[] =
{
{ "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } },
{ "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } },
{ "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } },
{ "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } },
{ "c", KernelArg::PointerConst, { 1, 1, 1, 0 } },
{ "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } },
{ "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } },
{ "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } }
};
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
//! The mathlib constants for each kernel execution
static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
bool
expect(const std::string& str, size_t* pos, const std::string& sym)
{
bool result = true;
uint i;
if (*pos == std::string::npos) {
return false;
}
// Check if we have expected symbols
for (i = 0; i < sym.size(); ++i) {
char deb = str[*pos + i];
if (deb != sym[i]) {
result = false;
break;
}
}
if (result) *pos += i;
return result;
}
bool
getword(const std::string& str, size_t* pos, char* sym)
{
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;", *pos);
size_t count = posEnd - *pos;
if (count != 0) {
if (!str.copy(sym, count, *pos)) {
return false;
}
}
sym[count] = 0;
*pos = posEnd + 1;
return true;
}
bool
getstring(const std::string& str, size_t* pos, std::string* out)
{
if (*pos == std::string::npos) {
return false;
}
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(":\n\r;", *pos);
size_t count = posEnd - *pos;
char* sym = new char[count + 1];
if (count != 0) {
if (!str.copy(sym, count, *pos)) {
return false;
}
}
sym[count] = 0;
*out = sym;
delete [] sym;
*pos = posEnd + 1;
return true;
}
bool
getuint(const std::string& str, size_t* pos, uint* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
*val = (*val * 10) + (sym[i] - 0x30);
}
*pos = posEnd + 1;
return true;
}
bool
getuintHex(const std::string& str, size_t* pos, uint* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
}
else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
}
else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
bool
getuint64Hex(const std::string& str, size_t* pos, uint64_t* val)
{
if (*pos == std::string::npos) {
return false;
}
char sym[16];
*pos = str.find_first_not_of(" \n\r", *pos);
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
if (!str.copy(sym, posEnd - *pos, *pos)) {
return false;
}
*val = 0;
for (size_t i = 0; i < (posEnd - *pos); ++i) {
if (sym[i] >= '0' && sym[i] <= 'F') {
*val = (*val * 16) + (sym[i] - '0');
}
else if (sym[i] >= 'a' && sym[i] <= 'f') {
*val = (*val * 16) + (sym[i] - 'a' + 10);
}
else {
return false;
}
}
*pos = posEnd + 1;
return true;
}
void
intToStr(size_t value, char* str, size_t size)
{
static const uint MaxDigits32bit = 10;
char result[MaxDigits32bit];
uint idx = MaxDigits32bit;
do {
idx--;
result[idx] = static_cast<char>((value % 10) + '0');
value /= 10;
} while ((value != 0) && (idx > 0));
size_t len = MaxDigits32bit - idx;
size_t n = std::min<size_t>(len, size-1);
memcpy(str, &result[idx], n);
str[n] = '\0';
}
//! Default destructor
CalImageReference::~CalImageReference()
{
// Free CAL image
free(image_);
}
KernelArg::KernelArg()
: type_(KernelArg::None)
, size_(0)
, cbIdx_(0)
, cbPos_(0)
, index_(0)
, alignment_(1)
, dataType_(KernelArg::None)
{
name_ = "";
buf_ = "";
memory_.value_ = 0;
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
}
KernelArg::KernelArg(const KernelArg& data)
{
// Fill the new object
*this = data;
}
KernelArg&
KernelArg::operator=(const KernelArg& data)
{
// Fill the fields of the current object
name_ = data.name_;
typeName_ = data.typeName_;
typeQualifier_ = data.typeQualifier_;
type_ = data.type_;
size_ = data.size_;
cbIdx_ = data.cbIdx_;
cbPos_ = data.cbPos_;
buf_ = data.buf_;
index_ = data.index_;
alignment_ = data.alignment_;
dataType_ = data.dataType_;
memory_.value_ = data.memory_.value_;
return *this;
}
bool
KernelArg::isCbNeeded() const
{
//! \note not a safe way
bool result = ((type_ > None) && (type_ < Sampler)) ? true : false;
if ((type_ == Sampler) && (location_ == 0)) {
// Sampler is defined outside the kernel
result = true;
}
return result;
}
size_t
KernelArg::size(bool gpuLayer)const
{
switch (type_) {
case None:
return 0;
case PointerConst:
case PointerHwConst:
case PointerGlobal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
case Sampler:
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
case Counter:
return (gpuLayer) ? 0 : sizeof(cl_mem);
case PointerLocal:
case PointerHwLocal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
case PointerPrivate:
case PointerHwPrivate:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
case Float:
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
case Double:
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
case Char:
case UChar:
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
case Short:
case UShort:
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
case Int:
case UInt:
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
case Long:
case ULong:
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
case Struct:
case Union:
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
default:
return 0;
}
}
cl_kernel_arg_address_qualifier
KernelArg::addressQualifier() const
{
switch (type_) {
case PointerGlobal:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PointerLocal:
case PointerHwLocal:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
case PointerConst:
case PointerHwConst:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
}
cl_kernel_arg_access_qualifier
KernelArg::accessQualifier() const
{
switch (type_) {
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
if (memory_.readOnly_) {
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
}
else if (memory_.writeOnly_) {
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
}
else if (memory_.readWrite_) {
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
}
// Fall through ...
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
//! temporary solution for the vectors handling in compiler
size_t
KernelArg::specialVector() const
{
if (size_ > VectorSizeLimit) {
switch (type_) {
case Char:
case UChar:
return sizeof(cl_char);
case Short:
case UShort:
return sizeof(cl_short);
default:
return 0;
}
}
return 0;
}
clk_value_type_t
KernelArg::type()const
{
switch (type_) {
case PointerGlobal:
case PointerLocal:
case PointerHwLocal:
case PointerConst:
case PointerHwConst:
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
case Counter:
return T_POINTER;
case Float:
return T_FLOAT;
case Double:
return T_DOUBLE;
case Char:
case UChar:
return T_CHAR;
case Short:
case UShort:
return T_SHORT;
case Int:
return T_INT;
case UInt:
//! \note No UINT type
return T_INT;
case Long:
return T_LONG;
case ULong:
//! \note No ULONG type
return T_LONG;
case Struct:
case Union:
//! @todo What should we report?
return T_CHAR;
case Sampler:
return T_SAMPLER;
case PointerPrivate:
case PointerHwPrivate:
case None:
default:
return T_VOID;
}
}
NullKernel::NullKernel(
const std::string& name,
const NullDevice& gpuNullDev,
const NullProgram& nullprog)
: device::Kernel(name)
, buildError_(CL_BUILD_PROGRAM_FAILURE)
, gpuDev_(gpuNullDev)
, prog_(nullprog)
, calRef_(NULL)
, internal_(false)
, flags_(0)
, cbSizes_(NULL)
, numCb_(0)
, rwAttributes_(false)
, instructionCnt_(4)
{
// UAV raw index will be detected
uavRaw_ = UavIdUndefined;
// Initialize UAV arena index(should be 8)
uavArena_ = VirtualGPU::UavArena;
// CB index will be detected
cbId_ = UavIdUndefined;
// Printf index will be detected
printfId_ = UavIdUndefined;
}
NullKernel::~NullKernel()
{
uint idx;
if (calRef_ == NULL) {
return;
}
calRef_->release();
// Destroy all kernel arguments
for (idx = 0; idx < arguments_.size(); ++idx) {
delete arguments_[idx];
}
arguments_.clear();
// Destroy all sampler kernel arguments
for (idx = 0; idx < intSamplers_.size(); ++idx) {
delete intSamplers_[idx];
}
intSamplers_.clear();
}
static int
scComponentToArrayIndex(E_SC_COMPONENT dstComp)
{
switch (dstComp) {
case SC_COMPONENT_X:
return 0;
case SC_COMPONENT_Y:
return 1;
case SC_COMPONENT_Z:
return 2;
case SC_COMPONENT_W:
return 3;
}
return 0;
}
static void
addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding)
{
uint count = shader->dep.NumIntrlIConstants;
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
// only suppport loop consts (int consts)
if (count) {
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
uint usedConsts = 0;
for (uint i = 0; i < count; ++i) {
uint currentConst;
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
if (allocatedconsts[currentConst].addr ==
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
break;
}
}
if (currentConst == usedConsts) {
usedConsts++;
assert(usedConsts <= count);
}
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
allocatedconsts[currentConst].type = AMU_ABI_INT32;
allocatedconsts[currentConst].value.
int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
}
encoding.litConstsCount = usedConsts;
}
}
bool
NullKernel::create(
const std::string& code,
const std::string& metadata,
const void* binaryCode,
size_t binarySize)
{
std::auto_ptr<uint> uavRefCount (new uint[MaxUavArguments]);
if (NULL == uavRefCount.get()) {
return false;
}
// Set all ref counts to 0
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
// parse the metadata fields
if (!parseArguments(metadata, uavRefCount.get())) {
return false;
}
CALimage calImage;
// Save source if DEBUG build
#if DEBUG
ilSource_ = code;
#endif // DEBUG
amd::option::Options *options = nullProg().getCompilerOptions();
internal_ = options->oVariables->clInternalKernel;
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
acl_error err;
std::string arch = GPU_TARGET_INFO_ARCH;
if (nullDev().settings().use64BitPtr_) {
arch += "64";
}
aclTargetInfo info = aclGetTargetInfo(
arch.c_str(), nullDev().hwInfo()->targetName_, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclGetTargetInfo failed");
return false;
}
aclBinaryOptions binOpts = {0};
binOpts.struct_size = sizeof(binOpts);
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
binOpts.bitness = ELFDATA2LSB;
binOpts.alloc = &::malloc;
binOpts.dealloc = &::free;
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclBinaryInit failed");
return false;
}
if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin,
code.data(), code.size(), aclSOURCE)) {
LogWarning("aclInsertSection failed");
aclBinaryFini(bin);
return false;
}
amd::option::Options* Opts = (amd::option::Options*)bin->options;
// temporary solution to synchronize buildNo between runtime and complib
// until we move runtime inside complib
Opts->setBuildNo(options->getBuildNo());
// pass kernel name to compiler
Opts->setCurrKernelName(name().c_str());
err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(),
ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL);
buildLog_ += aclGetCompilerLog(nullDev().compiler());
if (err != ACL_SUCCESS) {
LogWarning("aclCompile failed");
aclBinaryFini(bin);
return false;
}
if (!options->oVariables->BinEXE) {
// Early exit if binary doesn't contain EXE
aclBinaryFini(bin);
return true;
}
size_t len;
const void* isa = aclExtractSection(nullDev().compiler(), bin,
&len, aclTEXT, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclExtractSection failed");
aclBinaryFini(bin);
return false;
}
uint calImageSize;
if (!createMultiBinary(
&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
LogWarning("initSrcEncoding failed");
aclBinaryFini(bin);
return false;
}
aclBinaryFini(bin);
}
else if ((binaryCode != NULL) && (binarySize != 0)) {
uint size = 0;
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode))
|| size > binarySize) {
buildLog_ += "Invalid binary image";
LogError("amuABIMultiBinaryGetSize failed!");
return false;
}
calImage = static_cast<CALimage>(malloc(size));
::memcpy(calImage, binaryCode, size);
}
else {
LogError("Incorrect initialization parameters!");
return false;
}
calRef_ = new CalImageReference(calImage);
if (calRef_ == NULL) {
LogError("Memory allocation failure!");
// Free CAL image
free(calImage);
return false;
}
CALfuncInfo calFuncInfo;
// Get kernel compiled information
getFuncInfoFromImage(calImage, &calFuncInfo);
if (calFuncInfo.maxScratchRegsNeeded > 0) {
LogPrintfInfo("%s kernel has register spilling."
"Lower performance is expected.", name().c_str());
}
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
if (!createSignature(params)) {
return false;
}
return true;
}
size_t
NullKernel::getCalBinarySize() const
{
CALuint imageSize;
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
LogError("Failed to get the image size!");
return 0;
}
return static_cast<size_t>(imageSize);
}
bool
NullKernel::getCalBinary(void* binary, size_t size) const
{
uint calImageSize = 0;
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage())
|| size < calImageSize) {
LogError("CAL failed to save the kernel binary!");
return false;
}
::memcpy(binary, calImage(), calImageSize);
return true;
}
bool
Kernel::create(
const std::string& code,
const std::string& metadata,
const void* binaryCode,
size_t binarySize)
{
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
return false;
}
// initialize constant buffer sizes
if (!initConstBuffers()) {
return false;
}
// Initialize the kernel parameters
bool result = initParameters();
if (!dev().heap()->isVirtual()) {
amd::option::Options *options = nullProg().getCompilerOptions();
// @todo Remove this. This is a hack for no VM mode
if (!options->oVariables->EnableDumpKernel) {
if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
!name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
blitKernelHack_ = true;
}
}
}
if (result) {
buildError_ = CL_SUCCESS;
}
else {
result = false;
}
return result;
}
Kernel::Kernel(
const std::string& name,
const Device& gpuDev,
const Program& prog,
const InitData* initData)
: NullKernel(name, gpuDev, prog)
, blitKernelHack_(false)
{
hwPrivateSize_ = 0;
if (NULL != initData) {
flags_ = initData->flags_;
hwPrivateSize_ = initData->hwPrivateSize_;
hwLocalSize_ = initData->hwLocalSize_;
}
// Workgroup info private memory size
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
hsa_ = false;
}
Kernel::~Kernel()
{
if (calRef_ == NULL) {
return;
}
{
Device::ScopedLockVgpus lock(dev());
// Release all virtual image objects on all virtual GPUs
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
dev().vgpus()[idx]->releaseKernel(calImage());
}
}
if (0 != numCb_) {
delete [] cbSizes_;
}
}
const Device&
Kernel::dev() const
{
return reinterpret_cast<const Device&>(gpuDev_);
}
const Program&
Kernel::prog() const
{
return reinterpret_cast<const Program&>(prog_);
}
bool
NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa)
{
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
bool result = false;
AMUabiAddEncoding encoding;
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
size_t allocSize =
sizeof(uint) * MaxReadImage +
sizeof(CALUavEntry) * MaxUavArguments +
sizeof(CALSamplerMapEntry) * MaxSamplers +
sizeof(CALConstantBufferMask) * MaxConstBuffers +
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
char* tmpMem = new char[allocSize];
if (tmpMem == NULL) {
LogError("Error allocating memory");
return false;
}
CalcPtr(encoding.inputs, tmpMem, 0, 0);
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers);
if (shader->dep.NumIntrlIConstants != 0) {
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers);
}
AMUabiMultiBinary amuBinary;
amuABIMultiBinaryCreate(&amuBinary);
if (nullDev().settings().siPlus_) {
result = siCreateHwInfo(shader, encoding);
}
else {
result = r800CreateHwInfo(shader, encoding);
}
if (!result) {
delete [] tmpMem;
LogWarning("Error Creating program info");
return false;
}
addLoopConst(shader, encoding);
unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0;
unsigned int symbolCount=0;
CALOutputEntry* outputs=0;
unsigned int* globalBuffers=0;
unsigned int* persistentBuffers=0;
AMUabiUserSymbol* symbols=0;
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
CALConstantBufferMask* constBuffers = encoding.constBuffers;
uint* inputResources = encoding.inputs;
CALUavEntry* uav = encoding.uav;
uint inputSamplerCount = samplerSize();
for (uint i = 0; i < inputSamplerCount; ++i) {
inputSamplers[i].resource = 0;
inputSamplers[i].sampler = sampler(i)->index_;
}
uint constBufferCount = 2;
constBuffers[0].index = 0;
constBuffers[1].index = 1;
uint inputResourceCount = 0;
uint uavCount = 0;
bool globalBound = false;
bool cbBound = false;
bool printfBound = false;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
constBuffers[constBufferCount++].index = arg->index_;
break;
case KernelArg::PointerGlobal:
if (nullDev().settings().useAliases_) {
if (!globalBound) {
uav[uavCount].offset = uavRaw_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
if (uavArena_ != 0) {
uav[uavCount].offset = uavArena_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
}
globalBound = true;
}
else {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
break;
case KernelArg::ConstBufId:
if (!cbBound) {
uav[uavCount].offset = cbId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
cbBound = true;
break;
case KernelArg::PrintfBufId:
if (!printfBound) {
uav[uavCount].offset = printfId_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
printfBound = true;
break;
case KernelArg::UavId:
if (!nullDev().settings().useAliases_ &&
(UavIdUndefined != uavRaw_) &&
!(flags() & PrintfOutput)) {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
else {
if (UavIdUndefined != uavRaw_) {
uav[uavCount].offset = uavRaw_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
if (uavArena_ != 0) {
uav[uavCount].offset = uavArena_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
}
break;
case KernelArg::Sampler:
inputSamplers[inputSamplerCount].resource = 0;
inputSamplers[inputSamplerCount].sampler = arg->index_;
inputSamplerCount++;
break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA:
if (arg->memory_.readOnly_) {
inputResources[inputResourceCount++] = arg->index_;
}
else {
uav[uavCount].offset = arg->index_;
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
uav[uavCount].dimension = AMU_ABI_DIM_2D;
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
uavCount++;
}
break;
default:
break;
}
}
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
}
encoding.machine = nullDev().hwInfo()->machine_;
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
encoding.inputCount = inputResourceCount;
encoding.outputCount = outputCount;
encoding.outputs = outputs;
encoding.condOut = condOut;
encoding.earlyExit = earlyExit;
encoding.globalBuffersCount = globalCount;
encoding.globalBuffers = globalBuffers;
encoding.persistentBuffersCount = persistentCount;
encoding.persistentBuffers = persistentBuffers;
encoding.constBuffersCount = constBufferCount;
encoding.inputSamplerMapCount = inputSamplerCount;
encoding.symbolsCount = symbolCount;
encoding.symbols = symbols;
encoding.uavCount = uavCount;
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
amuABIMultiBinaryDestroy(amuBinary);
delete [] tmpMem;
delete [] encoding.progInfos;
return (success == 0) ? false : true;
}
void
Kernel::findLocalWorkSize(
size_t workDim,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const
{
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
size_t thrPerGrp;
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if ((flags() & ImageEnable) &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) &&
((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) &&
((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if ((flags() & ImageWrite) ||
(thrPerGrp != nullDev().info().maxWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--);
lclWorkSize[d] = div;
tmp /= div;
}
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else
{
assert(0 && "Invalid workDim!");
}
}
}
}
else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
void
Kernel::setupProgramGrid(
VirtualGPU& gpu,
size_t workDim,
const amd::NDRange& glbWorkOffset,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize,
const amd::NDRange& groupOffset,
const amd::NDRange& glbWorkOffsetOrg,
const amd::NDRange& glbWorkSizeOrg
) const
{
// ABI is always in CB0
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* pGlobalSize = reinterpret_cast<uint*>
(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
uint* pLocalSize = reinterpret_cast<uint*>
(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
uint* pNumGroups = reinterpret_cast<uint*>
(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
uint* pGlobalOffset = reinterpret_cast<uint*>
(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
uint* pGroupOffset = reinterpret_cast<uint*>
(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
uint32_t* debugInfo = reinterpret_cast<uint*>
(cbBuf + DebugOffset * ConstBuffer::VectorSize);
uint* pNDRangeGlobalOffset = reinterpret_cast<uint*>
(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
// Check for 64-bit metadata
uint glbABIShift = (abi64Bit()) ? 1 : 0;
ProgramGrid* progGrid = &gpu.cal_.progGrid_;
// Finds local workgroup size
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
// Initialize the execution grid block and size/offset
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
pGlobalSize[3] = static_cast<uint>(workDim);
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
pLocalSize[3] = 0;
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
pNumGroups[3] = 0;
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
progGrid->gridBlock.width =
progGrid->gridBlock.height =
progGrid->gridBlock.depth = 1;
progGrid->gridSize.width =
progGrid->gridSize.height =
progGrid->gridSize.depth = 1;
progGrid->partialGridBlock.width =
progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 1;
bool partialGrid = false;
// Fill the right values, based on the application request
switch (workDim) {
case 3:
pLocalSize[2] =
progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
pGlobalOffset[2] = glbWorkOffset[2];
pGroupOffset[2] = groupOffset[2];
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
if (progGrid->partialGridBlock.depth != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.depth++;
pNumGroups[2]++;
}
else {
progGrid->partialGridBlock.depth = lclWorkSize[2];
}
}
// Fall through to fill 2D and 1D dimensions...
case 2:
pLocalSize[1] =
progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
progGrid->gridSize.height /= progGrid->gridBlock.height;
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
pGlobalOffset[1] = glbWorkOffset[1];
pGroupOffset[1] = groupOffset[1];
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
if (progGrid->partialGridBlock.height != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.height++;
pNumGroups[1]++;
}
else {
progGrid->partialGridBlock.height = lclWorkSize[1];
}
}
// Fall through to fill 1D dimension...
case 1:
pLocalSize[0] =
progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
progGrid->gridSize.width /= progGrid->gridBlock.width;
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
pGlobalOffset[0] = glbWorkOffset[0];
pGroupOffset[0] = groupOffset[0];
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
if (dev().settings().partialDispatch_) {
// Check if partial workgroup dispatch is required
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
if (progGrid->partialGridBlock.width != 0) {
partialGrid = true;
// Increment the number of groups
progGrid->gridSize.width++;
pNumGroups[0]++;
}
else {
progGrid->partialGridBlock.width = lclWorkSize[0];
}
}
break;
default:
LogWarning("Wrong dimensions. Force to 1x1x1!");
break;
}
if (!partialGrid) {
progGrid->partialGridBlock.width =
progGrid->partialGridBlock.height =
progGrid->partialGridBlock.depth = 0;
}
// Calculate the total number of workitems and workgroups
pGlobalOffset[3] = pGroupOffset[3] = 1;
for (uint i = 0; i < workDim; ++i) {
pGlobalOffset[3] *= pGlobalOffset[i];
pGroupOffset[3] *= pGroupOffset[i];
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
if (abi64Bit()) {
// Setup the debug info in constant buffer
reinterpret_cast<uint64_t*>(debugInfo)[1] =
gpu.printfDbg().bufOffset();
// Size in DWORDs
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[4] /= sizeof(uint32_t);
}
else {
// Setup the debug info in constant buffer
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
// Size in DWORDs
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
debugInfo[2] /= sizeof(uint32_t);
}
}
}
bool
Kernel::initParameters()
{
size_t offset = 0;
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
// Initialize the arguments for the abstraction layer
if (arg->isCbNeeded()) {
desc.name_ = arg->name_.data();
desc.type_ = arg->type();
desc.size_ = arg->size(false);
desc.addressQualifier_ = arg->addressQualifier();
desc.accessQualifier_ = arg->accessQualifier();
desc.typeName_ = arg->typeName();
desc.typeQualifier_ = arg->typeQualifier();
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
}
// Report the allocated local memory size (emulated and hw)
if (hwLocalSize_ != 0) {
CondLog((dev().info().localMemSize_ < hwLocalSize_),
"Requested local size is bigger than reported");
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
if (!createSignature(params)) {
return false;
}
return true;
}
bool
Kernel::bindGlobalHwCb(
VirtualGPU& gpu,
VirtualGPU::GslKernelDesc* desc) const
{
bool result = true;
// Bind HW constant buffers used for the global data store
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (Program::HwConstBuffers::const_iterator it = gds.begin();
(it != gds.end() && result); ++it) {
uint idx = it->first;
result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
}
return result;
}
bool
Kernel::bindConstantBuffers(VirtualGPU& gpu) const
{
bool result = true;
assert((numCb_ <= MaxConstBuffersArguments) &&
"Runtime doesn't support more CBs for arguments!");
// Upload the parameters to HW and bind all constant buffers
for (uint i = 0; i < numCb_; i++) {
ConstBuffer* cb = gpu.constBufs_[i];
result &= cb->uploadDataToHw(cbSizes_[i]) &&
bindResource(gpu, *cb, i, ConstantBuffer, i, NULL, cb->wrtOffset());
}
return result;
}
bool
Kernel::processMemObjects(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const_address params,
bool nativeMem) const
{
bool aliases = false;
VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency();
bool readCache = !internal_ && rwAttributes_ && !dev().settings().assumeAliases_;
// Mark the tracker with a new kernel,
// so we can avoid checks of the aliased objects
gpu.memoryDependency().newKernel();
// Check all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
for (size_t i = 0; i < signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
const KernelArg* arg = argument(i);
Memory* memory = NULL;
bool readOnly = false;
// Find if current argument is a buffer
if ((desc.type_ == T_POINTER) &&
(arg->type_ != KernelArg::PointerLocal) &&
(arg->type_ != KernelArg::PointerHwLocal)) {
if (nativeMem) {
memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
}
else if (*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_) != NULL) {
memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
(params + desc.offset_));
// Synchronize data with other memory instances if necessary
memory->syncCacheFromHost(gpu);
}
if (memory != NULL) {
readOnly = arg->memory_.readOnly_;
// Check if read cache optimization is possible
if (readCache) {
// Find if the same buffer was sent to other arguments (aliases)
for (size_t j = i + 1; (j < signature.numParameters()); ++j) {
const amd::KernelParameterDescriptor& descJ = signature.at(j);
const KernelArg* argJ = argument(j);
if (argJ->type_ == KernelArg::PointerGlobal) {
bool readOnlyJ = argJ->memory_.readOnly_;
Memory* memory2 = NULL;
if (nativeMem) {
memory2 = *reinterpret_cast<Memory* const*>
(params + descJ.offset_);
}
else if (*reinterpret_cast<amd::Memory* const*>
(params + descJ.offset_) != NULL) {
memory2 = dev().getGpuMemory(
*reinterpret_cast<amd::Memory* const*>
(params + descJ.offset_));
}
if (memory == memory2) {
if (!readOnly || !readOnlyJ) {
aliases = true;
break;
}
}
}
}
}
// Validate memory for a dependency in the queue
gpu.memoryDependency().validate(gpu, memory, readOnly);
}
}
}
return aliases & readCache;
}
bool
Kernel::loadParameters(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const_address params,
bool nativeMem) const
{
bool result = true;
uint i;
// Initialize local private ranges
if (!initLocalPrivateRanges(gpu)) {
return false;
}
if (!dev().settings().useAliases_ &&
(UavIdUndefined != uavRaw_) &&
(!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
// Bind a buffer for a dummy read
result = bindResource(gpu, *gpuMemory, 0,
ArgumentUavID, uavRaw_);
}
// Find all parameters for the current kernel
const amd::KernelSignature& signature = kernel.signature();
for (i = 0; i != signature.numParameters(); ++i) {
const amd::KernelParameterDescriptor& desc = signature.at(i);
// Set current argument
if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) {
result = false;
break;
}
}
if (result) {
// Update the ring ranges and math constant
setLocalPrivateRanges(gpu);
result = bindConstantBuffers(gpu);
if (dev().settings().useAliases_) {
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBufferArena, uavArena_);
}
else if (flags() & PrivateFixed) {
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
}
// Setup debug output buffer (if printf is active)
if (flags() & PrintfOutput) {
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
}
}
return result;
}
bool
Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const
{
// 8xx workaround for the number of groups limit in HW
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
const ProgramGrid* programGrid = &gpu.cal()->progGrid_;
ConstBuffer* cb = gpu.numGrpCb();
assert((cb != NULL) && "Runtime must have the constant buffer");
uint32_t* memPtr = reinterpret_cast<uint32_t*>(cb->sysMemCopy());
memPtr[0] = programGrid->gridSize.width;
memPtr[1] = programGrid->gridSize.height;
memPtr[2] = programGrid->gridSize.depth;
memPtr[3] = 0;
memPtr[4] = programGrid->gridBlock.width;
memPtr[5] = programGrid->gridBlock.height;
memPtr[6] = programGrid->gridBlock.depth;
memPtr[7] = 0;
bool result = cb->uploadDataToHw(8 * sizeof(uint32_t));
if (result) {
gpu.setConstantBuffer(SC_INFO_CONSTANTBUFFER,
cb->gslResource(), static_cast<CALuint>(cb->wrtOffset()), cb->hbSize());
}
else {
assert(!"Runtime didn't upload data for NumGroup workaround");
return false;
}
}
if (!gpu.runProgramGrid(*calEvent,
const_cast<ProgramGrid*>(&gpu.cal()->progGrid_), gpu.vmMems(), gpu.cal_.memCount_)) {
LogError("Failed to execute the program!");
return false;
}
// Unbind all resources
unbindResources(gpu, *calEvent, lastRun);
return true;
}
static size_t counter = 0;
void
Kernel::debug(VirtualGPU& gpu) const
{
std::fstream stubWrite;
address src = NULL;
if (!dev().heap()->isVirtual()) {
src = reinterpret_cast<address>
(const_cast<Resource&>(dev().globalMem()).map(&gpu));
}
std::cerr << "--- " << name_ << " ---" << std::endl;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
Memory* gpuMem = gpu.slots_[i].memory_;
std::stringstream fileName;
bool bufferObj =
((arg->type_ == KernelArg::PointerGlobal) ||
(arg->type_ == KernelArg::PointerConst) ||
(arg->type_ == KernelArg::PointerHwConst));
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
std::cerr.setf(std::ios::hex);
uint* location = reinterpret_cast<uint*>
(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() <<
"_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(
reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
}
if (((arg->type_ >= KernelArg::Image1D) &&
(arg->type_ <= KernelArg::Image3D)) ||
((src == NULL) && bufferObj)) {
Memory* resource = gpu.slots_[i].memory_;
void* memory = resource->map(&gpu);
uint* location = reinterpret_cast<uint*>(memory);
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
// Dump the data
fileName << counter << "_kernel_" << name() <<
"_" << arg->name_ << "_" << location << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
// Write data to a file
if (stubWrite.is_open()) {
stubWrite.write(
reinterpret_cast<char*>(location), gpuMem->size());
stubWrite.close();
}
resource->unmap(&gpu);
}
}
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize());
gpu.constBufs_[i]->unmap(&gpu);
stubWrite.close();
}
}
const Program::HwConstBuffers& gds = prog().glbHwCb();
for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
uint idx = it->first;
std::stringstream fileName;
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
stubWrite.open(fileName.str().c_str(),
(std::fstream::out | std::fstream::binary));
if (stubWrite.is_open()) {
address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
// Check if we have OpenCL program
stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
(it->second)->unmap(&gpu);
stubWrite.close();
}
}
if (!dev().heap()->isVirtual()) {
const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
}
}
bool
Kernel::initConstBuffers()
{
bool result = true;
size_t i;
assert((numCb_ != 0) && "We have 0 constant buffers!");
// Allocate an array for CB sizes
cbSizes_ = new size_t[numCb_];
if (cbSizes_ == NULL) {
return false;
}
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
// CB0 is reserved for ABI data
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
// Find sizes of all constant buffers
for (i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
size_t size = arg->cbPos_ + arg->size(true);
size_t specVec = arg->specialVector();
if (specVec != 0) {
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) *
ConstBuffer::VectorSize;
}
// Do we need a CB?
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
cbSizes_[arg->cbIdx_] = size;
}
}
return result;
}
bool
Kernel::setInternalSamplers(VirtualGPU& gpu) const
{
for (uint i = 0; i < samplerSize(); ++i) {
const KernelArg* arg = sampler(i);
uint state = arg->cbPos_;
uint idx = arg->index_;
if (gpu.cal()->samplersState_[idx] != state) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
}
return true;
}
bool
Kernel::setArgument(
VirtualGPU& gpu,
uint idx,
const void* param,
size_t size,
bool nativeMem) const
{
bool result = true;
const KernelArg* arg;
address memory;
size_t argSize;
static const bool waitOnBusyEngine = true;
assert((idx < arguments_.size()) && "Param index is out of range!");
arg = argument(idx);
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
memory = gpu.cb(1)->sysMemCopy();
argSize = arg->size(true);
// Bind the global heap for emulation mode
switch (arg->type_) {
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
// Fall through ...
default:
break;
}
switch (arg->type_) {
case KernelArg::PointerConst:
case KernelArg::PointerHwConst:
case KernelArg::PointerGlobal:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
bool forceZeroOffset = false;
if (gpuMem == NULL) {
forceZeroOffset = true;
gpuMem = dev().getGpuMemory(dev().dummyPage());
}
uint64_t offset = gpuMem->pinOffset();
// Make sure the passed argument is a buffer object
if (!gpuMem->cal()->buffer_) {
LogError("The kernel buffer argument isn't a buffer object!");
return false;
}
if (arg->type_ == KernelArg::PointerHwConst) {
// Bind current memory object with the kernel
if (!bindResource(gpu, *gpuMem, idx,
ArgumentConstBuffer, arg->index_, gpuMem)) {
return false;
}
assert((offset == 0) && "No offset for HW CB");
// Add a fake offset to make sure (ptr != NULL) is TRUE
offset = 1;
}
else {
ResourceType type = ArgumentHeapBuffer;
// Check if kernel expects UAV binding
if (arg->memory_.uavBuf_) {
type = ArgumentBuffer;
}
else {
if (blitKernelHack_) {
// Bind global buffer to UAV this buffer is bound to
if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
else {
// Bind global buffer to UAV this buffer is bound to
if (!bindResource(gpu, dev().globalMem(), 0,
GlobalBuffer, uavRaw_)) {
return false;
}
}
}
// Bind current memory object with the kernel
// Note: it's a fake binding, if the buffer is part of
// the global heap
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_, gpuMem)) {
return false;
}
// Update offset only if we bind HeapBuffer or
// it's global address space in UAV setup on SI+
if ((type == ArgumentHeapBuffer) || dev().settings().siPlus_) {
if (!blitKernelHack_) {
offset += gpuMem->hbOffset();
if (!forceZeroOffset) {
assert((offset != 0) && "Offset 0 with a real allocation!");
}
}
gpu.addVmMemory(gpuMem);
}
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
if (forceZeroOffset) {
offset = 0;
}
// Copy memory offset into the constant buffer
if (abi64Bit()) {
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
}
else {
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) =
static_cast<uint>(offset);
}
}
break;
case KernelArg::Image1D:
case KernelArg::Image2D:
case KernelArg::Image3D:
case KernelArg::Image1DB:
case KernelArg::Image1DA:
case KernelArg::Image2DA:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
if (gpuMem == NULL) {
return false;
}
// Make sure the passed argument is an image object
if (gpuMem->cal()->buffer_) {
LogError("The kernel image argument isn't an image object!");
return false;
}
ResourceType resType = arg->memory_.readOnly_ ?
ArgumentImageRead : ArgumentImageWrite;
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx,
resType, arg->index_, gpuMem)) {
return false;
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Copy image constants into the constant buffer
if (gpuMem->owner() != NULL) {
copyImageConstants(gpuMem->owner()->asImage(),
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
}
}
break;
case KernelArg::Sampler:
{
amd::Sampler* amdSampler =
*reinterpret_cast<amd::Sampler* const*>(param);
uint idx = arg->index_;
uint32_t state = amdSampler->state();
if (state != gpu.cal()->samplersState_[idx]) {
setSampler(gpu, state, idx);
gpu.cal_.samplersState_[idx] = state;
}
// Copy sampler state into the constant buffer
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
}
break;
case KernelArg::Counter:
{
gpu::Memory* gpuMem = NULL;
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(param);
}
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, waitOnBusyEngine);
// Bind current memory object with the shader.
if (!bindResource(gpu, *gpuMem, idx,
ArgumentCounter, idx, gpuMem)) {
return false;
}
}
break;
case KernelArg::PointerHwLocal:
{
// Calculate current offset in the local ring
uint offset = gpu.cal_.progGrid_.localSize;
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
offset = amd::alignUp(offset, arg->alignment_);
size_t memSize = *static_cast<const uintptr_t*>(param);
// Allocate new memory from the local ring
gpu.cal_.progGrid_.localSize += static_cast<uint>(memSize) + extra;
// Copy current local argument's offset into the CB
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
CondLog((gpu.cal_.progGrid_.localSize > dev().info().localMemSize_),
"Requested local size is bigger than reported!");
}
break;
case KernelArg::Float:
case KernelArg::Double:
case KernelArg::Char:
case KernelArg::UChar:
case KernelArg::Short:
case KernelArg::UShort:
case KernelArg::Int:
case KernelArg::UInt:
case KernelArg::Long:
case KernelArg::ULong:
if (size != argSize) {
LogWarning("Parameter's sizes are unmatched!");
}
// Fall through ...
case KernelArg::Struct:
case KernelArg::Union: {
size_t specVec = arg->specialVector();
if (specVec != 0) {
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
for (uint i = 0; i < iter; ++i) {
amd::Os::fastMemcpy((memory + arg->cbPos_ +
i * ConstBuffer::VectorSize),
reinterpret_cast<const char*>(param) +
i * KernelArg::VectorSizeLimit * specVec,
specVec * KernelArg::VectorSizeLimit);
}
}
else {
// Copy data into the CB
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
}
}
break;
default:
LogError("Unhandled argument's type!");
break;
}
return result;
}
bool
Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const
{
// Initialize HW local
gpu.cal_.progGrid_.localSize = hwLocalSize_;
// Bind the global buffer if emulated local or private memory
// was allocated by the kernel
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) &&
(uavRaw_ != UavIdUndefined)) {
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
return false;
}
}
// Bind the global buffer if emulated constant buffers are enabled
if (cbId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
return false;
}
}
// Bind the printf buffer
if (printfId_ != UavIdUndefined) {
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
return false;
}
}
// Initialize the iterations count
gpu.cal_.iterations_ = 1;
return true;
}
void
Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const
{
address cbBuf = gpu.cb(0)->sysMemCopy();
uint* data;
uint gridSize =
gpu.cal()->progGrid_.gridSize.width *
gpu.cal()->progGrid_.gridSize.height *
gpu.cal()->progGrid_.gridSize.depth;
uint blockSize =
gpu.cal()->progGrid_.gridBlock.width *
gpu.cal()->progGrid_.gridBlock.height *
gpu.cal()->progGrid_.gridBlock.depth;
//! \todo validate if the compiler still generates PrivateFixed
if (flags() & PrivateFixed) {
// Update private ring
data = reinterpret_cast<uint*>
(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
if (abi64Bit()) {
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
data[2] = 0;
data[3] = 0;
}
else {
data[0] = static_cast<uint>(gpuMemory->hbOffset());
data[1] = 0;
data[2] = data[3] = 0;
}
gpu.addVmMemory(gpuMemory);
}
// Copy the math lib constants
amd::Os::fastMemcpy(
(cbBuf + MathLibOffset * ConstBuffer::VectorSize),
MathLibConst, sizeof(MathLibConst));
// Update the offset to the global data
if (prog().glbData() != NULL) {
gpu.addVmMemory(prog().glbData());
uint64_t glbDataOffset = prog().glbData()->hbOffset();
if (abi64Bit()) {
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset *
ConstBuffer::VectorSize) = glbDataOffset;
}
else {
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset *
ConstBuffer::VectorSize) = static_cast<uint>(glbDataOffset);
}
}
// Split workload if it was requested
if ((gpu.cal_.iterations_ < 2) &&
gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
uint totalSize = gridSize * blockSize;
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_,
(totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
}
}
// Initialize the number of iterations to the grid size
if (flags() & PrintfOutput) {
gpu.cal_.iterations_ = gridSize;
}
}
void
Kernel::setSampler(
VirtualGPU& gpu,
uint32_t state,
uint physUnit
) const
{
// All CAL sampler's parameters are in floats
float gslAddress = GSL_CLAMP_TO_BORDER;
float gslMinFilter = GSL_MIN_NEAREST;
float gslMagFilter = GSL_MAG_NEAREST;
state &= ~amd::Sampler::StateNormalizedCoordsMask;
// Program the sampler address mode
switch (state & amd::Sampler::StateAddressMask) {
case amd::Sampler::StateAddressRepeat:
gslAddress = GSL_REPEAT;
break;
case amd::Sampler::StateAddressClampToEdge:
gslAddress = GSL_CLAMP_TO_EDGE;
break;
case amd::Sampler::StateAddressMirroredRepeat:
gslAddress = GSL_MIRRORED_REPEAT;
break;
case amd::Sampler::StateAddressClamp:
case amd::Sampler::StateAddressNone:
default:
break;
}
state &= ~amd::Sampler::StateAddressMask;
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
// Program texture filter mode
if (state == amd::Sampler::StateFilterLinear) {
gslMinFilter = GSL_MIN_LINEAR;
gslMagFilter = GSL_MAG_LINEAR;
}
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
}
bool
Kernel::bindResource(
VirtualGPU& gpu,
const Resource& resource,
uint paramIdx,
ResourceType type,
uint physUnit,
Memory* memory,
size_t offset) const
{
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
// Find the original resource name from the IL program
switch (type) {
case GlobalBuffer:
if (gpu.state_.boundGlobal_) {
return true;
}
gpu.state_.boundGlobal_ = true;
physUnit = uavRaw_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case GlobalBufferArena:
if (gpu.state_.boundGlobal_) {
return true;
}
gpu.state_.boundGlobal_ = true;
physUnit = uavArena_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentCbID:
if (gpu.state_.boundCb_) {
return true;
}
gpu.state_.boundCb_ = true;
physUnit = cbId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentPrintfID:
if (gpu.state_.boundPrintf_) {
return true;
}
gpu.state_.boundPrintf_ = true;
physUnit = printfId_;
uavType = GSL_UAV_TYPE_TYPELESS;
break;
case ArgumentHeapBuffer:
case ArgumentBuffer:
case ArgumentImageRead:
case ArgumentImageWrite:
case ArgumentConstBuffer:
case ArgumentCounter:
// Early exit if resource is bound already
if (gpu.slots_[paramIdx].state_.bound_) {
return true;
}
// Associate resource with the slot
gpu.slots_[paramIdx].memory_ = memory;
// Mark resource as bound
gpu.slots_[paramIdx].state_.bound_ = true;
if (type == ArgumentCounter) {
GpuEvent calEvent;
// Bind memory with atomic counter
gpu.bindAtomicCounter(argument(paramIdx)->index_,
memory->gslResource());
// Copy the counter value into GDS
gpu.syncAtomicCounter(calEvent, argument(paramIdx)->index_, false);
// Mark resource as busy
memory->setBusy(gpu, calEvent);
return true;
}
else if (type == ArgumentHeapBuffer) {
// We return here, since we just have to bind the global heap
return true;
}
else if (type == ArgumentConstBuffer) {
gpu.slots_[paramIdx].state_.constant_ = true;
}
break;
case ArgumentUavID:
case ConstantBuffer:
break;
default:
LogPrintfError("Unspecified argument type ()!", type);
return false;
}
gslMemObject gslMem = NULL;
// Use global address space on SI+ for UAV setup
if (dev().settings().siPlus_ &&
((type == ArgumentBuffer) || (type == ArgumentCbID) ||
(type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
!blitKernelHack_) {
gslMem = dev().heap()->resource().gslResource();
}
else {
gslMem = resource.gslResource();
}
// Associate memory with the physical unit, the actual binding
bool result = true;
switch (type) {
case GlobalBuffer:
case GlobalBufferArena:
case ArgumentBuffer:
case ArgumentImageWrite:
case ArgumentUavID:
case ArgumentCbID:
case ArgumentPrintfID:
if (type == ArgumentImageWrite) {
uavType = GSL_UAV_TYPE_TYPED;
}
else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
uavType = GSL_UAV_TYPE_TYPELESS;
}
if (gpu.cal_.uavs_[physUnit] != gslMem) {
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
gpu.setUAVChannelOrder(physUnit, gslMem);
gpu.cal_.uavs_[physUnit] = gslMem;
}
else if (!dev().settings().siPlus_)
{
gpu.setUAVChannelOrder(physUnit, gslMem);
}
break;
case ConstantBuffer:
case ArgumentConstBuffer:
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
result = gpu.setConstantBuffer(physUnit,
gslMem, offset, resource.hbSize());
gpu.cal_.constBuffers_[physUnit] = gslMem;
}
break;
case ArgumentImageRead:
if (gpu.cal_.readImages_[physUnit] != gslMem) {
result = gpu.setInput(physUnit, gslMem);
gpu.cal_.readImages_[physUnit] = gslMem;
}
break;
default:
result = false;
assert(false);
break;
}
if (!result) {
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
return false;
}
if ((type == GlobalBuffer) && dev().settings().useAliases_) {
if (uavArena_ != 0) {
if (!setupArenaAliases(gpu, resource)) {
return false;
}
if ((uavArena_ != physUnit) &&
(gpu.cal_.uavs_[uavArena_] != gslMem)) {
gpu.cal_.uavs_[uavArena_] = gslMem;
// Associate memory with the name
if (!gpu.setUAVBuffer(uavArena_, gslMem,
GSL_UAV_TYPE_TYPELESS)) {
LogError("calCtxSetMem failed!");
return false;
}
gpu.setUAVChannelOrder(uavArena_, gslMem);
}
}
}
return true;
}
void
Kernel::unbindResources(
VirtualGPU& gpu,
GpuEvent calEvent,
bool lastRun) const
{
// Make sure unbind occurs on the last run, in case the execution had a split
if (lastRun) {
for (uint i = 0; i < arguments_.size(); ++i) {
if (gpu.slots_[i].state_.bound_) {
GpuEvent calEventTmp = calEvent;
if (KernelArg::Counter == argument(i)->type_) {
// Copy the counter value from GDS
gpu.syncAtomicCounter(calEventTmp, argument(i)->index_, true);
}
else if (!(gpu.slots_[i].state_.constant_ ||
argument(i)->memory_.readOnly_)) {
// Signal the abstraction layer that GPU memory is dirty
if (gpu.slots_[i].memory_->owner() != NULL) {
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
}
}
// Mark resource as busy
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
gpu.slots_[i].state_.value_ = 0;
}
}
// Unbind the global buffer
gpu.state_.boundGlobal_ = false;
// Unbind the constant buffer
gpu.state_.boundCb_ = false;
// Unbind the pritnf buffer
gpu.state_.boundPrintf_ = false;
}
// Mark CB busy
for (uint i = 0; i < numCb_; ++i) {
gpu.constBufs_[i]->setBusy(gpu, calEvent);
}
// 8xx workaround for the number of groups limit in HW
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
ConstBuffer* cb = gpu.numGrpCb();
assert((cb != NULL) && "Runtime must have the constant buffer");
cb->setBusy(gpu, calEvent);
}
// Set the event object for the scratch buffer
if (workGroupInfo()->scratchRegs_ > 0) {
for (uint i = 0; i < dev().scratch(gpu.hwRing())->memObjs_.size(); ++i) {
dev().scratch(gpu.hwRing())->memObjs_[i]->setBusy(gpu, calEvent);
}
}
}
bool
Kernel::setupArenaAliases(VirtualGPU& gpu, const Resource& resource) const
{
const static uint ScArenaUavShortId = 9;
const static uint ScArenaUavByteId = 10;
Resource* buf = &(const_cast<Resource&>(resource));
Resource* alias;
gslMemObject gslMem = NULL;
//
// byte view
//
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R8I);
if (NULL == alias) {
return false;
}
gslMem = alias->gslResource();
if (gpu.cal_.uavs_[ScArenaUavByteId] != gslMem) {
if (!gpu.setUAVBuffer(ScArenaUavByteId,
gslMem, GSL_UAV_TYPE_TYPELESS)) {
return false;
}
gpu.cal_.uavs_[ScArenaUavByteId] = gslMem;
}
//
// short view
//
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R16I);
if (NULL == alias) {
return false;
}
bool result = true;
gslMem = alias->gslResource();
if (gpu.cal_.uavs_[ScArenaUavShortId] != gslMem) {
result = gpu.setUAVBuffer(ScArenaUavShortId,
gslMem, GSL_UAV_TYPE_TYPELESS);
gpu.cal_.uavs_[ScArenaUavShortId] = gslMem;
}
return result;
}
void
Kernel::copyImageConstants(
const amd::Image* amdImage,
ImageConstants* imageData
) const
{
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
imageData->dataType_ =
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
imageData->channelOrder_ =
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
}
union MetadataVersion {
struct {
uint64_t revision_: 16; //!< LLVM metadata revision
uint64_t minorVersion_: 16; //!< LLVM metadata minor verison
uint64_t majorVersion_: 16; //!< LLVM metadata major version
};
uint64_t value_;
MetadataVersion(uint mj, uint mi, uint rev): value_(0)
{
revision_ = rev;
minorVersion_ = mi;
majorVersion_ = mj;
}
MetadataVersion(): value_(0) {}
};
//! Version of metadata with buffer attributes
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
//! Version of metadata with type qualifiers
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
bool
NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
{
// Initialize workgroup info
workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_;
MetadataVersion mdVersion;
// Find first tag
size_t pos = metaData.find(";");
// Loop through all provided program arguments
while (pos != std::string::npos) {
KernelArg arg;
if (!expect(metaData, &pos, ";")) {
break;
}
arg.type_ = KernelArg::None;
// Loop through all available metadata types
for (uint i = 0; i < ArgStateTotal; ++i) {
uint tmpValue;
// Find the name tag
if (expect(metaData, &pos, ArgState[i].typeName_)) {
switch (ArgState[i].type_) {
case KernelArg::None:
// Process next ...
continue;
case KernelArg::Reflection: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
LogWarning("Couldn't get the argument type!");
return false;
}
}
continue;
case KernelArg::ConstArg: {
uint argIdx;
// Read the argument's index
if (!getuint(metaData, &pos, &argIdx)) {
LogWarning("Couldn't get the argument index!");
return false;
}
KernelArg* tmpArg = arguments_[argIdx];
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
}
continue;
case KernelArg::Grouping:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size!");
return false;
}
workGroupInfo_.compileSize_[j] = temp;
}
// Process next ...
continue;
case KernelArg::WrkgrpSize: {
uint temp;
// Read the workgroup size
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the workgroup size!");
return false;
}
workGroupInfo_.size_ = temp;
}
// Process next ...
continue;
case KernelArg::Wavefront:
// Process next ...
continue;
case KernelArg::UavId:
// Read index
if (!getuint(metaData, &pos, &arg.index_)) {
return false;
}
break;
case KernelArg::ConstBufId:
// Read index
if (!getuint(metaData, &pos, &cbId_)) {
return false;
}
continue;
case KernelArg::PrintfBufId:
// Read index
if (!getuint(metaData, &pos, &printfId_)) {
return false;
}
continue;
case KernelArg::MetadataVersion:
// Read metadata version
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.majorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.minorVersion_ = tmpValue;
if (!getuint(metaData, &pos, &tmpValue)) {
return false;
}
mdVersion.revision_ = tmpValue;
// Process next ...
continue;
case KernelArg::GroupingHint:
for (uint j = 0; j < 3; ++j) {
uint temp;
// Read the compile workgroup size hint
if (!getuint(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile workgroup size hint!");
return false;
}
workGroupInfo_.compileSizeHint_[j] = temp;
}
// Process next ...
continue;
case KernelArg::VecTypeHint:
{
std::string temp;
// Read the compile vector type hint
if (!getstring(metaData, &pos, &temp)) {
LogWarning("Couldn't get the compile vector type hint!");
return false;
}
workGroupInfo_.compileVecTypeHint_ = temp;
}
// Process next ...
continue;
default:
break;
}
char argName[256];
// Save the argument type
arg.type_ = ArgState[i].type_;
// Check if we should expect the name
if (ArgState[i].name_) {
// Read the parameter's name
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.name_ = argName;
}
if (arg.type_ == KernelArg::Sampler) {
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.location_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the resource data type
if (ArgState[i].resType_) {
uint k;
// Search for the data type
for (k = 0; k < DataTypeTotal; k++) {
if (expect(metaData, &pos, DataType[k].tagName_)) {
arg.dataType_ = DataType[k].type_;
if (arg.type_ == KernelArg::Image) {
flags_ |= ImageEnable;
if (expect(metaData, &pos, "RO:")) {
arg.memory_.readOnly_ = 1;
}
else if (expect(metaData, &pos, "RW:")) {
arg.memory_.readWrite_ = 1;
flags_ |= ImageWrite;
}
else if (expect(metaData, &pos, "WO:")) {
arg.memory_.writeOnly_ = 1;
flags_ |= ImageWrite;
}
}
else if (arg.type_ == KernelArg::Value) {
arg.type_ = DataType[k].type_;
}
break;
}
}
if (k == DataTypeTotal) {
LogWarning("We couldn't find the argument's type.");
if ((arg.type_ == KernelArg::Value) ||
!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
//! @todo temporary condition
if ((arg.type_ == KernelArg::Opaque) ||
(arg.type_ == KernelArg::Sampler)) {
assert(false);
continue;
}
}
// Check if we should expect the data size
if (ArgState[i].size_) {
uint tmpData;
// Read the data size
if (!getuint(metaData, &pos, &tmpData)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.type_ == KernelArg::Image) {
arg.type_ = arg.dataType_;
arg.index_ = tmpData;
}
else {
arg.size_ = tmpData;
}
}
if (arg.type_ == KernelArg::Counter) {
// Read a counter index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a counter index!");
return false;
}
}
// Check if we should expect a resource index
if (ArgState[i].cbIdx_) {
// Read resource index
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
numCb_ = arg.cbIdx_;
}
}
// Check if we should expect the CB offset
if (ArgState[i].cbPos_) {
// Read position in the constant buffer
if (!getuint(metaData, &pos, &arg.cbPos_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check if we should expect the buffer type
if (ArgState[i].buf_) {
// Read the buffer type
if (!getword(metaData, &pos, argName)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
arg.buf_ = argName;
for (uint k = 0; k < BufTypeTotal; ++k) {
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
// Update the parameter type
arg.type_ = BufType[k].type_;
// Check if we should expect a buffer index
if (BufType[k].number_) {
// Read a buffer index
if (!getuint(metaData, &pos, &arg.index_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the required alignment
if (BufType[k].alignment_) {
// Read data alignment
if (!getuint(metaData, &pos, &arg.alignment_)) {
LogWarning("Couldn't get a kernel argument!");
return false;
}
}
// Check for the buffer's attribute
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) &&
BufType[k].attribute_) {
if (expect(metaData, &pos, "RO")) {
arg.memory_.readOnly_ = 1;
}
else if (expect(metaData, &pos, "RW")) {
arg.memory_.readWrite_ = 1;
}
else if (expect(metaData, &pos, "WO")) {
arg.memory_.writeOnly_ = 1;
}
}
// Check for the type qualifier
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) &&
BufType[k].attribute_) {
uint tmp;
pos += 1;
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get volatile type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (!getuint(metaData, &pos, &tmp)) {
LogWarning("Couldn't get restrict type!");
return false;
}
if (tmp == 1) {
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
}
}
}
}
// Find multiple UAV references
switch (arg.type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
case KernelArg::UavId:
uavRefCount[arg.index_]++;
break;
default:
break;
}
// Check if this argument will be passed in constant buffer
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
if (arg.type_ == KernelArg::Sampler) {
// Serach for the passed by value sampler
for (uint i = 0; i < argSize(); ++i) {
KernelArg* value = arguments_[i];
if (0 == value->name_.compare(arg.name_)) {
value->type_ = arg.type_;
value->index_ = arg.index_;
value->location_ = 0;
break;
}
}
}
else {
KernelArg* argument = new KernelArg(arg);
if (argument != NULL) {
addArgument(argument);
}
else {
LogError("Couldn't allocate memory!");
return false;
}
}
}
// Check if we have a pre-defined sampler
else if (arg.type_ == KernelArg::Sampler) {
KernelArg* sampler = new KernelArg(arg);
if (sampler != NULL) {
addSampler(sampler);
}
else {
LogError("Couldn't allocate memory!");
return false;
}
}
break;
}
}
// Next argument
pos = metaData.find(";", pos);
}
// Find arguments that will require a reallocation
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
// Check if can't use a dedicated UAV,
// so realloc memory in the heap
arg->memory_.realloc_ = isRealloc();
if (nullDev().settings().useAliases_) {
if (uavRefCount[arg->index_] > 1) {
// Multiple accesses, assume this is the heap
uavRaw_ = arg->index_;
}
// Mark argument as an UAV buffer if no aliases or it's not arena
else if (arg->index_ != VirtualGPU::UavArena) {
arg->memory_.uavBuf_ = true;
}
}
else {
arg->memory_.uavBuf_ = true;
}
break;
case KernelArg::PointerHwConst:
arg->memory_.realloc_ = true;
break;
case KernelArg::UavId:
if (!nullDev().settings().useAliases_ ||
(arg->index_ != VirtualGPU::UavArena)) {
uavRaw_ = arg->index_;
}
break;
default:
break;
}
// If argument marked with the const qualifier, then overwrite
// Read-Write attributes, since compiler doesn't mark it properly
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
arg->memory_.readOnly_ = 1;
arg->memory_.readWrite_ = 0;
arg->memory_.writeOnly_ = 0;
}
}
if (!nullDev().settings().useAliases_ &&
(uavRaw_ != UavIdUndefined) &&
!(flags() & PrintfOutput)) {
// Find if default UAV is already assigned to an argument
for (uint i = 0; i < arguments_.size(); ++i) {
KernelArg* arg = arguments_[i];
switch (arg->type_) {
case KernelArg::PointerGlobal:
case KernelArg::PointerConst:
case KernelArg::PointerLocal:
case KernelArg::PointerPrivate:
if (uavRaw_ == arg->index_) {
uavRaw_ = UavIdUndefined;
}
break;
default:
break;
}
}
}
// There is always 1 constant buffer, associated with the kernel
numCb_++;
assert((numCb_ <= MaxConstBuffersArguments) &&
"Runtime doesn't support more than max CBs for arguments!");
// Limit workgroup size if requested
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
size_t temp = 1;
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
for (uint j = 0; j < 3; ++j) {
if (workGroupInfo()->compileSize_[j] != 0) {
temp *= workGroupInfo_.compileSize_[j];
}
}
// Report a compilation error if requested compile size doesn't
// match the required workgroup size
if (workGroupInfo()->size_ < temp) {
char str[8];
intToStr(workGroupInfo_.size_, str, 8);
buildError_ = CL_OUT_OF_RESOURCES;
buildLog_ +=
"Error: Requested compile size is bigger than the required workgroup size of ";
buildLog_ += str;
buildLog_ += " elements\n";
LogError(buildLog().c_str());
return false;
}
}
// Read/Write attributes are provided in metadata
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
rwAttributes_ = true;
}
return true;
}
bool
Kernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
// Check if memory doesn't require reallocation
bool noRealloc = (!argument(idx)->memory_.realloc_ ||
amdMem->reallocedDeviceMemory(&dev()));
return noRealloc;
}
inline static HSAIL_ARG_TYPE
GetHSAILArgType(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER:
if (argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU) {
return HSAIL_ARGTYPE_QUEUE;
}
return HSAIL_ARGTYPE_POINTER;
case ARG_TYPE_VALUE:
return HSAIL_ARGTYPE_VALUE;
case ARG_TYPE_IMAGE:
return HSAIL_ARGTYPE_IMAGE;
case ARG_TYPE_SAMPLER:
return HSAIL_ARGTYPE_SAMPLER;
case ARG_TYPE_ERROR:
default:
return HSAIL_ARGTYPE_ERROR;
}
}
inline static size_t
GetHSAILArgAlignment(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER:
return argInfo->arg.pointer.align;
default:
return 1;
}
}
inline static HSAIL_ADDRESS_QUALIFIER
GetHSAILAddrQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT_EMU:
case PTR_MT_CONSTANT:
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return HSAIL_ADDRESS_LOCAL;
case PTR_MT_SCRATCH_EMU:
return HSAIL_ADDRESS_GLOBAL;
case PTR_MT_ERROR:
default:
LogError("Unsupported address type");
return HSAIL_ADDRESS_ERROR;
}
}
else if ((argInfo->type == ARG_TYPE_IMAGE) ||
(argInfo->type == ARG_TYPE_SAMPLER)) {
return HSAIL_ADDRESS_GLOBAL;
}
return HSAIL_ADDRESS_ERROR;
}
/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE
GetHSAILDataType(const aclArgData* argInfo)
{
aclArgDataType dataType;
if (argInfo->type == ARG_TYPE_POINTER) {
dataType = argInfo->arg.pointer.data;
}
else if (argInfo->type == ARG_TYPE_VALUE) {
dataType = argInfo->arg.value.data;
}
else {
return HSAIL_DATATYPE_ERROR;
}
switch (dataType) {
case DATATYPE_i1:
return HSAIL_DATATYPE_B1;
case DATATYPE_i8:
return HSAIL_DATATYPE_S8;
case DATATYPE_i16:
return HSAIL_DATATYPE_S16;
case DATATYPE_i32:
return HSAIL_DATATYPE_S32;
case DATATYPE_i64:
return HSAIL_DATATYPE_S64;
case DATATYPE_u8:
return HSAIL_DATATYPE_U8;
case DATATYPE_u16:
return HSAIL_DATATYPE_U16;
case DATATYPE_u32:
return HSAIL_DATATYPE_U32;
case DATATYPE_u64:
return HSAIL_DATATYPE_U64;
case DATATYPE_f16:
return HSAIL_DATATYPE_F32;
case DATATYPE_f32:
return HSAIL_DATATYPE_F32;
case DATATYPE_f64:
return HSAIL_DATATYPE_F64;
case DATATYPE_struct:
return HSAIL_DATATYPE_STRUCT;
case DATATYPE_opaque:
return HSAIL_DATATYPE_OPAQUE;
case DATATYPE_ERROR:
default:
return HSAIL_DATATYPE_ERROR;
}
}
inline static int
GetHSAILArgSize(const aclArgData *argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_VALUE:
switch (GetHSAILDataType(argInfo)) {
case HSAIL_DATATYPE_B1:
return 1;
case HSAIL_DATATYPE_B8:
case HSAIL_DATATYPE_S8:
case HSAIL_DATATYPE_U8:
return 1;
case HSAIL_DATATYPE_B16:
case HSAIL_DATATYPE_U16:
case HSAIL_DATATYPE_S16:
case HSAIL_DATATYPE_F16:
return 2;
case HSAIL_DATATYPE_B32:
case HSAIL_DATATYPE_U32:
case HSAIL_DATATYPE_S32:
case HSAIL_DATATYPE_F32:
return 4;
case HSAIL_DATATYPE_B64:
case HSAIL_DATATYPE_U64:
case HSAIL_DATATYPE_S64:
case HSAIL_DATATYPE_F64:
return 8;
case HSAIL_DATATYPE_STRUCT:
return argInfo->arg.value.numElements;
default:
return -1;
}
case ARG_TYPE_POINTER:
case ARG_TYPE_IMAGE:
case ARG_TYPE_SAMPLER:
return sizeof(void*);
default:
return -1;
}
}
inline static clk_value_type_t
GetOclType(const aclArgData* argInfo)
{
static const clk_value_type_t ClkValueMapType[6][6] = {
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
{ T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
{ T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
{ T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
};
uint sizeType;
if ((argInfo->type == ARG_TYPE_POINTER) &&
(argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU)) {
return T_QUEUE;
}
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
return T_POINTER;
}
else if (argInfo->type == ARG_TYPE_VALUE) {
switch (argInfo->arg.value.data) {
case DATATYPE_i8:
case DATATYPE_u8:
sizeType = 0;
break;
case DATATYPE_i16:
case DATATYPE_u16:
sizeType = 1;
break;
case DATATYPE_i32:
case DATATYPE_u32:
sizeType = 2;
break;
case DATATYPE_i64:
case DATATYPE_u64:
sizeType = 3;
break;
case DATATYPE_f16:
case DATATYPE_f32:
sizeType = 4;
break;
case DATATYPE_f64:
sizeType = 5;
break;
default:
return T_VOID;
}
switch (argInfo->arg.value.numElements) {
case 1: return ClkValueMapType[sizeType][0];
case 2: return ClkValueMapType[sizeType][1];
case 3: return ClkValueMapType[sizeType][2];
case 4: return ClkValueMapType[sizeType][3];
case 8: return ClkValueMapType[sizeType][4];
case 16: return ClkValueMapType[sizeType][5];
default: return T_VOID;
}
}
else if (argInfo->type == ARG_TYPE_SAMPLER) {
return T_SAMPLER;
}
else {
return T_VOID;
}
}
inline static cl_kernel_arg_address_qualifier
GetOclAddrQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_POINTER) {
switch (argInfo->arg.pointer.memory) {
case PTR_MT_UAV:
case PTR_MT_GLOBAL:
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
case PTR_MT_LDS_EMU:
case PTR_MT_LDS:
return CL_KERNEL_ARG_ADDRESS_LOCAL;
default:
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
}
else if (argInfo->type == ARG_TYPE_IMAGE) {
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
}
//default for all other cases
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
inline static cl_kernel_arg_access_qualifier
GetOclAccessQual(const aclArgData* argInfo)
{
if (argInfo->type == ARG_TYPE_IMAGE) {
switch (argInfo->arg.image.type) {
case ACCESS_TYPE_RO:
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
case ACCESS_TYPE_WO:
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
case ACCESS_TYPE_RW:
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
default:
return CL_KERNEL_ARG_ACCESS_NONE;
}
}
return CL_KERNEL_ARG_ACCESS_NONE;
}
inline static cl_kernel_arg_type_qualifier
GetOclTypeQual(const aclArgData* argInfo)
{
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
if (argInfo->type == ARG_TYPE_POINTER) {
if (argInfo->arg.pointer.isVolatile) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
if (argInfo->arg.pointer.isRestrict) {
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
}
if (argInfo->arg.pointer.isPipe) {
rv |= CL_KERNEL_ARG_TYPE_PIPE;
}
if (argInfo->isConst) {
rv |= CL_KERNEL_ARG_TYPE_CONST;
}
switch (argInfo->arg.pointer.memory) {
case PTR_MT_CONSTANT:
case PTR_MT_UAV_CONSTANT:
case PTR_MT_CONSTANT_EMU:
rv |= CL_KERNEL_ARG_TYPE_CONST;
break;
default:
break;
}
}
return rv;
}
static int
GetOclSize(const aclArgData* argInfo)
{
switch (argInfo->type) {
case ARG_TYPE_POINTER: return sizeof(void *);
case ARG_TYPE_VALUE:
//! \note OCL 6.1.5. For 3-component vector data types,
//! the size of the data type is 4 * sizeof(component).
switch (argInfo->arg.value.data) {
case DATATYPE_struct:
return 1 * argInfo->arg.value.numElements;
case DATATYPE_i8:
case DATATYPE_u8:
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u16:
case DATATYPE_i16:
case DATATYPE_f16:
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_u32:
case DATATYPE_i32:
case DATATYPE_f32:
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_i64:
case DATATYPE_u64:
case DATATYPE_f64:
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
case DATATYPE_ERROR:
default: return -1;
}
case ARG_TYPE_IMAGE: return sizeof(cl_mem);
case ARG_TYPE_SAMPLER: return sizeof(cl_sampler);
default: return -1;
}
}
void
HSAILKernel::initArgList(const aclArgData* aclArg)
{
// Initialize the hsail argument list too
initHsailArgs(aclArg);
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
// Reserved arguments for HSAIL launch
aclArg += ExtraArguments;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(aclArg);
desc.addressQualifier_ = GetOclAddrQual(aclArg);
desc.accessQualifier_ = GetOclAccessQual(aclArg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = 0;
}
else {
desc.size_ = GetOclSize(aclArg);
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
flags_.imageEna_ = true;
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
flags_.imageWriteEna_ = true;
}
}
}
createSignature(params);
}
void
HSAILKernel::initHsailArgs(const aclArgData* aclArg)
{
int offset = 0;
// Reserved arguments for HSAIL launch
aclArg += ExtraArguments;
// Iterate through the each kernel argument
for (; aclArg->struct_size != 0; aclArg++) {
Argument* arg = new Argument;
// Initialize HSAIL kernel argument
arg->name_ = aclArg->argStr;
arg->typeName_ = aclArg->typeStr;
arg->size_ = GetHSAILArgSize(aclArg);
arg->offset_ = offset;
arg->type_ = GetHSAILArgType(aclArg);
arg->addrQual_ = GetHSAILAddrQual(aclArg);
arg->dataType_ = GetHSAILDataType(aclArg);
// If vector of args we add additional arguments to flatten it out
arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) &&
(aclArg->arg.value.data != DATATYPE_struct)) ?
aclArg->arg.value.numElements : 1;
arg->alignment_ = GetHSAILArgAlignment(aclArg);
offset += GetHSAILArgSize(aclArg);
arguments_.push_back(arg);
}
}
void
HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
{
PrintfInfo info;
uint index = 0;
for (; aclPrintf->struct_size != 0; aclPrintf++) {
index = aclPrintf->ID;
if (printf_.size() <= index) {
printf_.resize(index + 1);
}
info.fmtString_ = aclPrintf->fmtStr;
info.fmtString_ += "\n";
uint32_t *tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) {
info.arguments_.push_back(*tmp_ptr);
}
printf_[index] = info;
info.arguments_.clear();
}
}
HSAILKernel::HSAILKernel(std::string name,
HSAILProgram* prog,
std::string compileOptions)
: device::Kernel(name)
, compileOptions_(compileOptions)
, dev_(prog->dev())
, prog_(*prog)
, index_(0)
, code_(NULL)
, hwMetaData_(NULL)
{
hsa_ = true;
}
HSAILKernel::~HSAILKernel()
{
while (!arguments_.empty()) {
Argument* arg = arguments_.back();
delete arg;
arguments_.pop_back();
}
delete [] hwMetaData_;
delete code_;
}
bool
HSAILKernel::init()
{
acl_error error;
//compile kernel down to ISA
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
std::string options(compileOptions_.c_str());
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
// Get the ISA out
size_t size_isa;
void* shader_isa = NULL;
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(),
options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
if (error != ACL_SUCCESS) {
LogError("Failed to finalize");
return false;
}
shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
if (shader_isa == NULL) {
LogError("Failed find the ISA");
return false;
}
aqlCreateHWInfo(shader_isa, size_isa);
// Pull out metadata from the ELF
size_t sizeOfArgList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
char* aclArgList = new char[sizeOfArgList];
if (NULL == aclArgList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the argList
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
delete [] aclArgList;
size_t sizeOfWorkGroupSize;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
if (error != ACL_SUCCESS) {
return false;
}
// Copy wavefront size
workGroupInfo_.wavefrontSize_ = dev().getAttribs().wavefrontSize;
// Find total workgroup size
if (workGroupInfo_.compileSize_[0] != 0) {
workGroupInfo_.size_ =
workGroupInfo_.compileSize_[0] *
workGroupInfo_.compileSize_[1] *
workGroupInfo_.compileSize_[2];
}
else {
workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
}
// Pull out printf metadata from the ELF
size_t sizeOfPrintfList;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Make sure kernel has any printf info
if (0 != sizeOfPrintfList) {
char* aclPrintfList = new char[sizeOfPrintfList];
if (NULL == aclPrintfList) {
return false;
}
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
&sizeOfPrintfList);
if (error != ACL_SUCCESS) {
return false;
}
// Set the PrintfList
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
delete [] aclPrintfList;
}
size_t sizeOfDevice;
bool hasKernelEnqueue = false;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
&hasKernelEnqueue, &sizeOfDevice);
if (error != ACL_SUCCESS) {
return false;
}
flags_.dynamicParallelism_ = hasKernelEnqueue;
int index = -1;
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
RT_KERNEL_INDEX, openClKernelName.c_str(),
&index, &sizeOfDevice);
if (error != ACL_SUCCESS) {
return false;
}
index_ = static_cast<uint>(index);
return true;
}
bool
HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
{
// Check if memory doesn't require reallocation
bool noRealloc = true;
//amdMem->reallocedDeviceMemory(&dev()));
return noRealloc;
}
const Device&
HSAILKernel::dev() const
{
return reinterpret_cast<const Device&>(dev_);
}
const HSAILProgram&
HSAILKernel::prog() const
{
return reinterpret_cast<const HSAILProgram&>(prog_);
}
void
HSAILKernel::findLocalWorkSize(
size_t workDim,
const amd::NDRange& gblWorkSize,
amd::NDRange& lclWorkSize) const
{
// Initialize the default workgoup info
// Check if the kernel has the compiled sizes
if (workGroupInfo()->compileSize_[0] == 0) {
// Find the default local workgroup size, if it wasn't specified
if (lclWorkSize[0] == 0) {
size_t thrPerGrp;
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
((workDim == 2) && b2DOverrideSet) ||
((workDim == 3) && b3DOverrideSet);
if (!overrideSet) {
// Find threads per group
thrPerGrp = workGroupInfo()->size_;
// Check if kernel uses images
if (flags_.imageEna_ &&
// and thread group is a multiple value of wavefronts
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
// and it's 2 or 3-dimensional workload
(workDim > 1) &&
((dev().settings().partialDispatch_) ||
(((gblWorkSize[0] % 16) == 0) &&
((gblWorkSize[1] % 16) == 0)))) {
// Use 8x8 workgroup size if kernel has image writes
if (flags_.imageWriteEna_ ||
(thrPerGrp != dev().info().maxWorkGroupSize_)) {
lclWorkSize[0] = 8;
lclWorkSize[1] = 8;
}
else {
lclWorkSize[0] = 16;
lclWorkSize[1] = 16;
}
if (workDim == 3) {
lclWorkSize[2] = 1;
}
}
else {
size_t tmp = thrPerGrp;
// Split the local workgroup into the most efficient way
for (uint d = 0; d < workDim; ++d) {
size_t div = tmp;
for (; (gblWorkSize[d] % div) != 0; div--);
lclWorkSize[d] = div;
tmp /= div;
}
// Check if partial dispatch is enabled and
if (dev().settings().partialDispatch_ &&
// we couldn't find optimal workload
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
size_t maxSize = 0;
size_t maxDim = 0;
for (uint d = 0; d < workDim; ++d) {
if (maxSize < gblWorkSize[d]) {
maxSize = gblWorkSize[d];
maxDim = d;
}
}
// Check if a local workgroup has the most optimal size
if (thrPerGrp > maxSize) {
thrPerGrp = maxSize;
}
lclWorkSize[maxDim] = thrPerGrp;
for (uint d = 0; d < workDim; ++d) {
if (d != maxDim) {
lclWorkSize[d] = 1;
}
}
}
}
}
else {
// Use overrides when app doesn't provide workgroup dimensions
if (workDim == 1) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
}
else if (workDim == 2) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
}
else if (workDim == 3) {
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
}
else
{
assert(0 && "Invalid workDim!");
}
}
}
}
else {
for (uint d = 0; d < workDim; ++d) {
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
}
}
}
inline static void
WriteAqlArg(
unsigned char** dst,//!< The write pointer to the buffer
const void* src, //!< The source pointer
uint size, //!< The size in bytes to copy
uint alignment = 0 //!< The alignment to follow while writing to the buffer
)
{
if (alignment == 0) {
*dst = amd::alignUp(*dst, size);
}
else {
*dst = amd::alignUp(*dst, alignment);
}
memcpy(*dst, src, size);
*dst += size;
}
HsaAqlDispatchPacket*
HSAILKernel::loadArguments(
VirtualGPU& gpu,
const amd::Kernel& kernel,
const amd::NDRangeContainer& sizes,
const_address parameters,
bool nativeMem,
uint64_t vmDefQueue,
uint64_t* vmParentWrap,
std::vector<const Resource*>& memList) const
{
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
// The HLC generates 3 additional arguments for the global offsets
//and fourth argument is the printf_buffer pointer
size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
for (uint i = 0; i < sizes.dimensions(); ++i) {
offsetSize[i] = sizes.offset()[i];
}
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_BUSY;
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(sizeof(AmdAqlWrap));
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
offsetSize[4] = vmDefQueue;
offsetSize[5] = *vmParentWrap;
memList.push_back(cb);
}
// Check if the kernel may have printf output
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
offsetSize[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
}
WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
// Find all parameters for the current kernel
for (uint i = 0; i != signature.numParameters(); ++i) {
const HSAILKernel::Argument* arg = argument(i);
const amd::KernelParameterDescriptor& desc = signature.at(i);
const_address paramaddr = parameters + desc.offset_;
switch (arg->type_) {
case HSAIL_ARGTYPE_POINTER:
// If it is a global pointer
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
Memory* gpuMem = NULL;
amd::Memory* mem = NULL;
if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
if (mem != NULL) {
gpuMem = dev().getGpuMemory(mem);
gpuMem->wait(gpu, WaitOnBusyEngine);
memList.push_back(gpuMem);
}
else {
return NULL;
}
break;
}
if (nativeMem) {
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
}
else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem != NULL) {
gpuMem = dev().getGpuMemory(mem);
}
}
if (gpuMem == NULL) {
WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
break;
}
//! @todo 64 bit isn't supported with 32 bit binary
uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, WaitOnBusyEngine);
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) &&
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
}
// If it is a local pointer
else {
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
"Unsupported address type");
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
}
break;
case HSAIL_ARGTYPE_VALUE:
// Special case for structrues
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
// Copy the current structre into CB1
memcpy(aqlStruct, paramaddr, arg->size_);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(arg->size_);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
memList.push_back(cb);
}
else {
WriteAqlArg(&aqlArgBuf, paramaddr,
arg->numElem_ * arg->size_, arg->size_);
}
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = NULL;
amd::Memory* mem = NULL;
if (nativeMem) {
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
}
else {
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
if (mem == NULL) {
LogError( "The kernel image argument isn't an image object!");
return false;
}
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
if (dev().settings().hsailDirectSRD_) {
// Image arguments are of size 48 bytes and aligned to 16 bytes
WriteAqlArg(&aqlArgBuf, image->hwState(),
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
}
else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) &&
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(image);
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
const amd::Sampler* sampler =
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
const Sampler* gpuSampler = static_cast<Sampler*>
(sampler->getDeviceSampler(dev()));
if (dev().settings().hsailDirectSRD_) {
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
}
else {
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
break;
}
case HSAIL_ARGTYPE_QUEUE: {
const amd::DeviceQueue* queue =
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
default:
LogError(" Unsupported address type ");
return NULL;
}
}
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return NULL;
}
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
uint argBufSize = amd::alignUp(
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
gpu.cb(0)->sysMemCopy() + argBufSize);
amd::NDRange local(sizes.local());
amd::NDRange global(sizes.global());
// Initialize the Global, Local and Offset values
aqlPkt->dimensions = sizes.dimensions();
// Initialize the work grid parameter
for (uint idx = 0; idx < 3; idx++) {
aqlPkt->grid_size[idx] = 1;
aqlPkt->workgroup_size[idx] = 1;
}
// Check if runtime has to find local workgroup size
findLocalWorkSize(aqlPkt->dimensions, global, local);
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
aqlPkt->grid_size[idx] = global[idx];
aqlPkt->workgroup_size[idx] = local[idx];
}
// Initialize if dispatch should enable profiling
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
// Initialize kernel ISA and execution buffer requirements
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
aqlPkt->private_segment_size_bytes = spillSegSize();
// Initialize cache flush configuration for the dispatch
//! @todo Currently not used in emulation
aqlPkt->barrier = 1;
aqlPkt->release_fence_scope = 1;
aqlPkt->acquire_fence_scope = 2;
aqlPkt->invalidate_instruction_cache = 1;
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
aqlPkt->kernel_arg_address = argList;
memList.push_back(cb);
memList.push_back(gpuAqlCode());
if (NULL != prog().globalStore()) {
memList.push_back(prog().globalStore());
}
if (cpuAqlCode_->enable_sgpr_queue_ptr) {
memList.push_back(gpu.hsaQueueMem());
}
if (srdResource) {
dev().srds().fillResourceList(memList);
}
return aqlPkt;
}
} // namespace gpu