// Source file: rocm-systems/rocclr/runtime/device/gpu/gpukernel.cpp
// (C++, 3506 lines, 112 KiB)
//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/gpu/gpukernel.hpp"
#include "device/gpu/gpuprogram.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpuconstbuf.hpp"
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "shader/ComputeProgramObject.h"
2014-07-04 16:17:05 -04:00
#include "utils/options.hpp"
#include "acl.h"
#include "SCCommon.h"
2014-07-04 16:17:05 -04:00
#include <string>
#include <memory>
#include <fstream>
#include <sstream>
#include <iostream>
#include <ctime>
#include <algorithm>
2014-07-04 16:17:05 -04:00
namespace gpu {
//! Metadata tags recognized in the kernel description.
//! NOTE: the order of the entries is important - do not rearrange them.
const MetaDataConst ArgState[ArgStateTotal] = {
    // Columns: tag name, argument type, properties

    // ---- Kernel description (special properties) ----
    {"memory:compilerwrite", KernelArg::PrivateFixed, {0, 0, 0, 0, 0, 0, 0}},
    {"uniqueid:", KernelArg::NoType, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:private:", KernelArg::PrivateSize, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:local:", KernelArg::LocalSize, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:hwprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:uavprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:hwlocal:", KernelArg::HwLocalSize, {0, 0, 0, 0, 0, 0, 0}},
    {"memory:64bitABI", KernelArg::ABI64Bit, {0, 0, 0, 0, 0, 0, 0}},
    {"limitgroupsize", KernelArg::Wavefront, {0, 0, 0, 0, 0, 0, 0}},
    {"function:", KernelArg::NoType, {1, 1, 0, 0, 0, 0, 0}},
    {"intrinsic:", KernelArg::NoType, {1, 0, 0, 0, 0, 0, 0}},
    {"error:", KernelArg::ErrorMessage, {0, 0, 0, 0, 0, 0, 0}},
    {"warning:", KernelArg::WarningMessage, {0, 0, 0, 0, 0, 0, 0}},
    {"printf_fmt:", KernelArg::PrintfFormatStr, {0, 0, 0, 0, 0, 0, 0}},
    {"version:", KernelArg::MetadataVersion, {0, 0, 0, 0, 0, 0, 0}},

    // ---- Kernel basic types ----
    {"pointer:", KernelArg::PointerGlobal, {1, 1, 1, 1, 1, 1, 0}},
    {"value:", KernelArg::Value, {1, 1, 1, 1, 1, 0, 0}},
    {"image:", KernelArg::Image, {1, 1, 1, 1, 1, 0, 0}},
    {"sampler:", KernelArg::Sampler, {0, 1, 0, 0, 0, 0, 0}},
    {"counter:", KernelArg::Counter, {1, 1, 0, 1, 1, 0, 0}},
    {"cws:", KernelArg::Grouping, {0, 0, 0, 0, 0, 0, 0}},
    {"lws:", KernelArg::WrkgrpSize, {0, 0, 0, 0, 0, 0, 0}},
    {"uavid:", KernelArg::UavId, {0, 0, 0, 0, 0, 0, 0}},
    {"reflection:", KernelArg::Reflection, {0, 0, 0, 0, 0, 0, 0}},
    {"constarg:", KernelArg::ConstArg, {0, 0, 0, 0, 0, 0, 0}},
    {"cbid:", KernelArg::ConstBufId, {0, 0, 0, 0, 0, 0, 0}},
    {"printfid:", KernelArg::PrintfBufId, {0, 0, 0, 0, 0, 0, 0}},
    {"wsh:", KernelArg::GroupingHint, {0, 0, 0, 0, 0, 0, 0}},
    {"vth:", KernelArg::VecTypeHint, {0, 0, 0, 0, 0, 0, 0}},
    {"WavesPerSimdHint:", KernelArg::WavesPerSimdHint, {0, 0, 0, 0, 0, 0, 0}},
};
//! Maps the data type tags found in the metadata to argument types
const DataTypeConst DataType[] = {
    {"i8:", KernelArg::Char},
    {"i16:", KernelArg::Short},
    {"i32:", KernelArg::Int},
    {"i64:", KernelArg::Long},
    {"u8:", KernelArg::UChar},
    {"u16:", KernelArg::UShort},
    {"u32:", KernelArg::UInt},
    {"u64:", KernelArg::ULong},
    {"float:", KernelArg::Float},
    {"double:", KernelArg::Double},
    {"struct:", KernelArg::Struct},
    {"union:", KernelArg::Union},
    {"1D:", KernelArg::Image1D},
    {"2D:", KernelArg::Image2D},
    {"3D:", KernelArg::Image3D},
    {"1DB:", KernelArg::Image1DB},
    {"1DA:", KernelArg::Image1DA},
    {"2DA:", KernelArg::Image2DA},
    {"opaque:", KernelArg::Opaque},
    {"event:", KernelArg::Event},
    {"sampler:", KernelArg::Sampler},
    {"half:", KernelArg::Half},
};

//! Number of entries in the DataType table
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataType[0]);
//! One row of the buffer type table: a tag name, the argument type it maps to
//! and three single-bit flags
struct BufDataConst {
  const char* tagName_;           //!< buffer's name
  KernelArg::ArgumentType type_;  //!< type of argument
  struct {
    uint number_ : 1;     //!< buffer's number
    uint alignment_ : 1;  //!< buffer's alignment
    uint attribute_ : 1;  //!< buffer's read/write attribute
    uint reserved : 29;   //!< reserved
  };
};
static const BufDataConst BufType[] = {{"g", KernelArg::PointerGlobal, {1, 0, 0, 0}},
{"p", KernelArg::PointerPrivate, {1, 1, 1, 0}},
{"l", KernelArg::PointerLocal, {1, 1, 1, 0}},
{"uav", KernelArg::PointerGlobal, {1, 1, 1, 0}},
{"c", KernelArg::PointerConst, {1, 1, 1, 0}},
{"hl", KernelArg::PointerHwLocal, {1, 1, 1, 0}},
{"hp", KernelArg::PointerHwPrivate, {1, 1, 1, 0}},
{"hc", KernelArg::PointerHwConst, {1, 1, 1, 0}}};
2014-07-04 16:17:05 -04:00
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
//! The mathlib constants for each kernel execution
static const float MathLibConst[4] = {0.0f, 0.5f, 1.0f, 2.0f};
//! Checks that \p sym occurs in \p str exactly at offset \p *pos.
//!
//! \param str  text being parsed
//! \param pos  in/out offset; advanced past the symbol on a match,
//!             left untouched otherwise
//! \param sym  the expected symbol
//! \return true if the symbol was found at the current position
bool expect(const std::string& str, size_t* pos, const std::string& sym) {
  if (*pos == std::string::npos) {
    return false;
  }
  // An offset beyond the end can't match anything and
  // must never be used to index the string
  if (*pos > str.size()) {
    return false;
  }
  // compare() clamps the range to the end of the string, so a symbol
  // that runs past the end is simply reported as a mismatch
  if (str.compare(*pos, sym.size(), sym) != 0) {
    return false;
  }
  *pos += sym.size();
  return true;
}
//! Extracts the next word from \p str.
//!
//! Leading blanks and line breaks are skipped; the word ends at the first
//! ':', blank, line break or ';' (the delimiter itself is consumed).
//!
//! \param str  text being parsed
//! \param pos  in/out offset; set to the character after the delimiter, or to
//!             npos when the end of the text has been reached
//! \param sym  receives the word (empty if a delimiter follows immediately)
//! \return false if there is nothing left to parse
bool getword(const std::string& str, size_t* pos, std::string& sym) {
  if (*pos == std::string::npos) {
    return false;
  }

  const size_t start = str.find_first_not_of(" \n\r", *pos);
  if (start == std::string::npos) {
    // Only white space is left. Don't let the position wrap around to 0
    sym.clear();
    *pos = std::string::npos;
    return false;
  }

  const size_t end = str.find_first_of(": \n\r;", start);
  if (end == std::string::npos) {
    // No delimiter: the word extends to the end of the text
    sym = str.substr(start);
    *pos = std::string::npos;
  } else {
    // Always overwrite the output, so a stale value can't survive an empty word
    sym = str.substr(start, end - start);
    *pos = end + 1;
  }
  return true;
}
//! Extracts the next string token from \p str.
//!
//! Unlike getword() blanks don't terminate the token: it ends at the first
//! ':', line break or ';' (the delimiter itself is consumed).
//!
//! \param str  text being parsed
//! \param pos  in/out offset; set to the character after the delimiter, or to
//!             npos when the end of the text has been reached
//! \param out  receives the token (empty if a delimiter follows immediately)
//! \return false if there is nothing left to parse
bool getstring(const std::string& str, size_t* pos, std::string* out) {
  if (*pos == std::string::npos) {
    return false;
  }

  const size_t start = str.find_first_not_of(" \n\r", *pos);
  if (start == std::string::npos) {
    // Only white space is left. Don't let the position wrap around to 0
    out->clear();
    *pos = std::string::npos;
    return false;
  }

  const size_t end = str.find_first_of(":\n\r;", start);
  if (end == std::string::npos) {
    // No delimiter: take the rest of the text rather than
    // sizing a temporary buffer from npos
    *out = str.substr(start);
    *pos = std::string::npos;
  } else {
    // Copy straight from the source - no temporary heap buffer to leak
    out->assign(str, start, end - start);
    *pos = end + 1;
  }
  return true;
}
//! Parses the next unsigned decimal number from \p str.
//!
//! Leading blanks and line breaks are skipped; the number ends at the first
//! ':', blank, line break, ';' or ')' (the delimiter itself is consumed).
//! The characters are not validated, matching the historical behaviour.
//!
//! \param str  text being parsed
//! \param pos  in/out offset; set to the character after the delimiter, or to
//!             npos when the number runs to the end of the text
//! \param val  receives the parsed value; untouched on failure
//! \return false if there is no token at the current position
bool getuint(const std::string& str, size_t* pos, uint* val) {
  if (*pos == std::string::npos) {
    return false;
  }

  const size_t start = str.find_first_not_of(" \n\r", *pos);
  *pos = start;
  if (start == std::string::npos) {
    return false;
  }

  const size_t end = str.find_first_of(": \n\r;)", start);
  const size_t stop = (end == std::string::npos) ? str.size() : end;
  if (stop == start) {
    // Empty token
    return false;
  }

  // Read the digits straight from the source string. A fixed-size
  // scratch buffer would overflow on long tokens
  uint result = 0;
  for (size_t i = start; i < stop; ++i) {
    result = (result * 10) + (str[i] - 0x30);
  }
  *val = result;

  *pos = (end == std::string::npos) ? std::string::npos : end + 1;
  return true;
}
//! Parses the next unsigned hexadecimal number (no "0x" prefix) from \p str.
//!
//! Leading blanks and line breaks are skipped; the number ends at the first
//! ':', blank, line break, ';' or ')' (the delimiter itself is consumed).
//! Both upper and lower case digits are accepted.
//!
//! \param str  text being parsed
//! \param pos  in/out offset; set to the character after the delimiter, or to
//!             npos when the number runs to the end of the text
//! \param val  receives the parsed value; untouched on failure
//! \return false if there is no token or it contains a non-hex character
bool getuintHex(const std::string& str, size_t* pos, uint* val) {
  if (*pos == std::string::npos) {
    return false;
  }

  const size_t start = str.find_first_not_of(" \n\r", *pos);
  *pos = start;
  if (start == std::string::npos) {
    return false;
  }

  const size_t end = str.find_first_of(": \n\r;)", start);
  const size_t stop = (end == std::string::npos) ? str.size() : end;
  if (stop == start) {
    // Empty token
    return false;
  }

  // Read the digits straight from the source string. A fixed-size
  // scratch buffer would overflow on long tokens
  uint result = 0;
  for (size_t i = start; i < stop; ++i) {
    const char c = str[i];
    uint digit;
    if (c >= '0' && c <= '9') {
      digit = c - '0';
    } else if (c >= 'A' && c <= 'F') {
      digit = c - 'A' + 10;
    } else if (c >= 'a' && c <= 'f') {
      digit = c - 'a' + 10;
    } else {
      return false;
    }
    result = (result * 16) + digit;
  }
  *val = result;

  *pos = (end == std::string::npos) ? std::string::npos : end + 1;
  return true;
}
//! Parses the next 64-bit unsigned hexadecimal number (no "0x" prefix)
//! from \p str.
//!
//! Leading blanks and line breaks are skipped; the number ends at the first
//! ':', blank, line break, ';' or ')' (the delimiter itself is consumed).
//! Both upper and lower case digits are accepted.
//!
//! \param str  text being parsed
//! \param pos  in/out offset; set to the character after the delimiter, or to
//!             npos when the number runs to the end of the text
//! \param val  receives the parsed value; untouched on failure
//! \return false if there is no token or it contains a non-hex character
bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) {
  if (*pos == std::string::npos) {
    return false;
  }

  const size_t start = str.find_first_not_of(" \n\r", *pos);
  *pos = start;
  if (start == std::string::npos) {
    return false;
  }

  const size_t end = str.find_first_of(": \n\r;)", start);
  const size_t stop = (end == std::string::npos) ? str.size() : end;
  if (stop == start) {
    // Empty token
    return false;
  }

  // Read the digits straight from the source string. A fixed-size
  // scratch buffer would overflow on long tokens
  uint64_t result = 0;
  for (size_t i = start; i < stop; ++i) {
    const char c = str[i];
    uint64_t digit;
    if (c >= '0' && c <= '9') {
      digit = c - '0';
    } else if (c >= 'A' && c <= 'F') {
      digit = c - 'A' + 10;
    } else if (c >= 'a' && c <= 'f') {
      digit = c - 'a' + 10;
    } else {
      return false;
    }
    result = (result * 16) + digit;
  }
  *val = result;

  *pos = (end == std::string::npos) ? std::string::npos : end + 1;
  return true;
}
//! Converts \p value into a null terminated decimal string.
//!
//! \param value  the number to convert
//! \param str    destination buffer
//! \param size   capacity of \p str in bytes, including the terminator.
//!               If the number doesn't fit, only its leading digits are stored.
//!               Nothing is written if \p str is NULL or \p size is 0.
void intToStr(size_t value, char* str, size_t size) {
  if ((str == NULL) || (size == 0)) {
    return;
  }

  // 2^64 - 1 has 20 decimal digits, which covers a 64-bit size_t
  static const size_t MaxDigits = 20;
  char result[MaxDigits];
  size_t idx = MaxDigits;

  // Produce the digits from the least significant one, filling from the back
  do {
    idx--;
    result[idx] = static_cast<char>((value % 10) + '0');
    value /= 10;
  } while ((value != 0) && (idx > 0));

  const size_t len = MaxDigits - idx;
  const size_t n = std::min<size_t>(len, size - 1);
  memcpy(str, &result[idx], n);
  str[n] = '\0';
}
//! Destructor: hands the CAL image memory back with free()
CalImageReference::~CalImageReference() { free(image_); }
//! Default constructor: creates an argument descriptor with no type,
//! zero size and indices, an alignment of 1 and empty names
KernelArg::KernelArg()
    : type_(KernelArg::NoType),
      size_(0),
      cbIdx_(0),
      cbPos_(0),
      index_(0),
      alignment_(1),
      dataType_(KernelArg::NoType) {
  name_ = "";
  buf_ = "";
  // Reset the memory attributes as a whole
  memory_.value_ = 0;
  typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
}
//! Copy constructor: delegates the field-by-field copy to the assignment operator
KernelArg::KernelArg(const KernelArg& data) { operator=(data); }
//! Assignment operator: copies the descriptor fields from \p data
KernelArg& KernelArg::operator=(const KernelArg& data) {
  // Names
  name_ = data.name_;
  typeName_ = data.typeName_;
  buf_ = data.buf_;

  // Type information
  type_ = data.type_;
  dataType_ = data.dataType_;
  typeQualifier_ = data.typeQualifier_;

  // Size and placement
  size_ = data.size_;
  alignment_ = data.alignment_;
  index_ = data.index_;
  cbIdx_ = data.cbIdx_;
  cbPos_ = data.cbPos_;

  // Memory attributes
  memory_.value_ = data.memory_.value_;

  return *this;
}
//! Returns true if the argument requires a constant buffer
bool KernelArg::isCbNeeded() const {
  // Sampler is defined outside the kernel
  if ((type_ == Sampler) && (location_ == 0)) {
    return true;
  }
  //! \note not a safe way: relies on the numeric order of the argument types
  return (type_ > NoType) && (type_ < Sampler);
}
size_t KernelArg::size(bool gpuLayer) const {
switch (type_) {
case NoType:
return 0;
2014-07-04 16:17:05 -04:00
case PointerConst:
case PointerHwConst:
case PointerGlobal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
2014-07-04 16:17:05 -04:00
case Image1D:
case Image2D:
case Image3D:
case Image1DB:
case Image1DA:
case Image2DA:
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
2014-07-04 16:17:05 -04:00
case Sampler:
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
2014-07-04 16:17:05 -04:00
case Counter:
return (gpuLayer) ? 0 : sizeof(cl_mem);
2014-07-04 16:17:05 -04:00
case PointerLocal:
case PointerHwLocal:
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
2014-07-04 16:17:05 -04:00
case PointerPrivate:
case PointerHwPrivate:
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
2014-07-04 16:17:05 -04:00
case Float:
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Double:
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Char:
case UChar:
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Short:
case UShort:
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Int:
case UInt:
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Long:
case ULong:
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
2014-07-04 16:17:05 -04:00
case Struct:
case Union:
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
2014-07-04 16:17:05 -04:00
default:
return 0;
}
2014-07-04 16:17:05 -04:00
}
//! Returns the OpenCL address qualifier that matches the argument type
cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const {
  // Anything that isn't listed below is reported as private
  cl_kernel_arg_address_qualifier qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;

  switch (type_) {
    case PointerGlobal:
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
      qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
      break;

    case PointerLocal:
    case PointerHwLocal:
      qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
      break;

    case PointerConst:
    case PointerHwConst:
      qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
      break;

    default:
      break;
  }

  return qualifier;
}
//! Returns the OpenCL access qualifier of the argument.
//! Only images can report anything other than CL_KERNEL_ARG_ACCESS_NONE
cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const {
  bool image = false;
  switch (type_) {
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
      image = true;
      break;
    default:
      break;
  }

  if (image) {
    // The attributes are checked in priority order
    if (memory_.readOnly_) {
      return CL_KERNEL_ARG_ACCESS_READ_ONLY;
    }
    if (memory_.writeOnly_) {
      return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
    }
    if (memory_.readWrite_) {
      return CL_KERNEL_ARG_ACCESS_READ_WRITE;
    }
  }

  // Not an image, or an image without any access attribute
  return CL_KERNEL_ARG_ACCESS_NONE;
}
//! temporary solution for the vectors handling in compiler:
//! returns the element size for char/short vectors that have more than
//! VectorSizeLimit elements and 0 for everything else
size_t KernelArg::specialVector() const {
  if (size_ <= VectorSizeLimit) {
    return 0;
  }

  size_t elementSize = 0;
  switch (type_) {
    case Char:
    case UChar:
      elementSize = sizeof(cl_char);
      break;
    case Short:
    case UShort:
      elementSize = sizeof(cl_short);
      break;
    default:
      break;
  }
  return elementSize;
}
//! Translates the argument type into the clk_value_type_t value
clk_value_type_t KernelArg::type() const {
  // Private pointers, NoType and all unlisted types are reported as void
  clk_value_type_t valueType = T_VOID;

  switch (type_) {
    case PointerGlobal:
    case PointerLocal:
    case PointerHwLocal:
    case PointerConst:
    case PointerHwConst:
    case Image1D:
    case Image2D:
    case Image3D:
    case Image1DB:
    case Image1DA:
    case Image2DA:
    case Counter:
      valueType = T_POINTER;
      break;

    case Float:
      valueType = T_FLOAT;
      break;

    case Double:
      valueType = T_DOUBLE;
      break;

    //! \note Struct and Union are reported as char.
    //! @todo What should we report?
    case Char:
    case UChar:
    case Struct:
    case Union:
      valueType = T_CHAR;
      break;

    case Short:
    case UShort:
      valueType = T_SHORT;
      break;

    //! \note No UINT type
    case Int:
    case UInt:
      valueType = T_INT;
      break;

    //! \note No ULONG type
    case Long:
    case ULong:
      valueType = T_LONG;
      break;

    case Sampler:
      valueType = T_SAMPLER;
      break;

    case PointerPrivate:
    case PointerHwPrivate:
    case NoType:
    default:
      break;
  }

  return valueType;
}
//! Constructs the kernel object.
//! The build error starts as CL_BUILD_PROGRAM_FAILURE, no CAL image
//! reference is attached yet and all resource indices are undefined.
//!
//! \param name        kernel name, forwarded to device::Kernel
//! \param gpuNullDev  device object the kernel belongs to
//! \param nullprog    program object, forwarded to device::Kernel
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
                       const NullProgram& nullprog)
    : device::Kernel(gpuNullDev, name, nullprog),
      buildError_(CL_BUILD_PROGRAM_FAILURE),
      gpuDev_(gpuNullDev),
      calRef_(NULL),
      internal_(false),
      flags_(0),
      cbSizes_(NULL),
      numCb_(0),
      rwAttributes_(false),
      instructionCnt_(4) {
  // UAV raw index will be detected
  uavRaw_ = UavIdUndefined;
  // CB index will be detected
  cbId_ = UavIdUndefined;
  // Printf index will be detected
  printfId_ = UavIdUndefined;
}
//! Destructor: drops the CAL image reference (if one was created)
//! and destroys the kernel arguments.
//!
//! The arguments are destroyed even if the CAL image was never created.
//! create() parses the arguments before it builds the image, so a kernel
//! whose build failed would otherwise leak every parsed argument.
NullKernel::~NullKernel() {
  if (calRef_ != NULL) {
    calRef_->release();
  }

  uint idx;

  // Destroy all kernel arguments
  for (idx = 0; idx < arguments_.size(); ++idx) {
    delete arguments_[idx];
  }
  arguments_.clear();

  // Destroy all sampler kernel arguments
  for (idx = 0; idx < intSamplers_.size(); ++idx) {
    delete intSamplers_[idx];
  }
  intSamplers_.clear();
}
//! Converts a shader compiler destination component into an array index
//! (X -> 0, Y -> 1, Z -> 2, W -> 3). Any other value maps to 0.
static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) {
  int index = 0;
  switch (dstComp) {
    case SC_COMPONENT_Y:
      index = 1;
      break;
    case SC_COMPONENT_Z:
      index = 2;
      break;
    case SC_COMPONENT_W:
      index = 3;
      break;
    default:
      // SC_COMPONENT_X and everything else
      break;
  }
  return index;
}
static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) {
uint count = shader->dep.NumIntrlIConstants;
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
// only suppport loop consts (int consts)
if (count) {
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
uint usedConsts = 0;
for (uint i = 0; i < count; ++i) {
uint currentConst;
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
if (allocatedconsts[currentConst].addr ==
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
break;
}
}
if (currentConst == usedConsts) {
usedConsts++;
assert(usedConsts <= count);
}
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
allocatedconsts[currentConst].type = AMU_ABI_INT32;
allocatedconsts[currentConst].value.int32[scComponentToArrayIndex(
HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
}
encoding.litConstsCount = usedConsts;
}
}
bool NullKernel::create(const std::string& code, const std::string& metadata,
const void* binaryCode, size_t binarySize) {
std::auto_ptr<uint> uavRefCount(new uint[MaxUavArguments]);
if (NULL == uavRefCount.get()) {
return false;
}
// Set all ref counts to 0
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
// parse the metadata fields
if (!parseArguments(metadata, uavRefCount.get())) {
return false;
}
CALimage calImage;
// Save source if DEBUG build
#if DEBUG
ilSource_ = code;
#endif // DEBUG
amd::option::Options* options = nullProg().getCompilerOptions();
internal_ = options->oVariables->clInternalKernel;
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
acl_error err;
std::string arch = "amdil";
if (nullDev().settings().use64BitPtr_) {
arch += "64";
}
aclTargetInfo info = aclGetTargetInfo(arch.c_str(), nullDev().hwInfo()->targetName_, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclGetTargetInfo failed");
return false;
2014-07-04 16:17:05 -04:00
}
aclBinaryOptions binOpts = {0};
binOpts.struct_size = sizeof(binOpts);
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
binOpts.bitness = ELFDATA2LSB;
binOpts.alloc = &::malloc;
binOpts.dealloc = &::free;
2014-07-04 16:17:05 -04:00
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclBinaryInit failed");
return false;
}
2014-07-04 16:17:05 -04:00
if (ACL_SUCCESS !=
aclInsertSection(nullDev().amdilCompiler(), bin, code.data(), code.size(), aclSOURCE)) {
LogWarning("aclInsertSection failed");
aclBinaryFini(bin);
return false;
2014-07-04 16:17:05 -04:00
}
amd::option::Options* Opts = (amd::option::Options*)bin->options;
// Append an option so that we can selectively enable a SCOption on CZ
// whenever IOMMUv2 is enabled.
if (nullDev().settings().svmFineGrainSystem_) {
options->origOptionStr.append(" -sc-xnack-iommu");
2014-07-04 16:17:05 -04:00
}
// temporary solution to synchronize buildNo between runtime and complib
// until we move runtime inside complib
Opts->setBuildNo(options->getBuildNo());
// pass kernel name to compiler
Opts->setCurrKernelName(name().c_str());
err = aclCompile(nullDev().amdilCompiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT,
ACL_TYPE_ISA, NULL);
2014-07-04 16:17:05 -04:00
buildLog_ += aclGetCompilerLog(nullDev().amdilCompiler());
if (err != ACL_SUCCESS) {
LogWarning("aclCompile failed");
aclBinaryFini(bin);
return false;
}
if (!options->oVariables->BinEXE) {
// Early exit if binary doesn't contain EXE
aclBinaryFini(bin);
return true;
}
size_t len;
const void* isa = aclExtractSection(nullDev().amdilCompiler(), bin, &len, aclTEXT, &err);
if (err != ACL_SUCCESS) {
LogWarning("aclExtractSection failed");
aclBinaryFini(bin);
return false;
2014-07-04 16:17:05 -04:00
}
uint calImageSize;
if (!createMultiBinary(&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
LogWarning("initSrcEncoding failed");
aclBinaryFini(bin);
return false;
}
2014-07-04 16:17:05 -04:00
aclBinaryFini(bin);
} else if ((binaryCode != NULL) && (binarySize != 0)) {
uint size = 0;
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode)) || size > binarySize) {
buildLog_ += "Invalid binary image";
LogError("amuABIMultiBinaryGetSize failed!");
return false;
2014-07-04 16:17:05 -04:00
}
calImage = static_cast<CALimage>(malloc(size));
::memcpy(calImage, binaryCode, size);
} else {
LogError("Incorrect initialization parameters!");
return false;
}
calRef_ = new CalImageReference(calImage);
if (calRef_ == NULL) {
LogError("Memory allocation failure!");
// Free CAL image
free(calImage);
return false;
}
CALfuncInfo calFuncInfo;
// Get kernel compiled information
getFuncInfoFromImage(calImage, &calFuncInfo);
if (calFuncInfo.maxScratchRegsNeeded > 0) {
LogPrintfInfo(
"%s kernel has register spilling."
"Lower performance is expected.",
name().c_str());
}
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
device::Kernel::parameters_t params;
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
return false;
}
return true;
}
//! Returns the size of the CAL binary image in bytes or 0 on failure
size_t NullKernel::getCalBinarySize() const {
  CALuint imageSize;
  if (amuABIMultiBinaryGetSize(&imageSize, calImage())) {
    return static_cast<size_t>(imageSize);
  }

  LogError("Failed to get the image size!");
  return 0;
}
//! Copies the CAL binary image into the provided buffer.
//! \param binary  destination buffer
//! \param size    capacity of \p binary in bytes
//! \return false if the image size can't be queried or the buffer is too small
bool NullKernel::getCalBinary(void* binary, size_t size) const {
  uint calImageSize = 0;

  if (amuABIMultiBinaryGetSize(&calImageSize, calImage()) && (size >= calImageSize)) {
    ::memcpy(binary, calImage(), calImageSize);
    return true;
  }

  LogError("CAL failed to save the kernel binary!");
  return false;
}
//! Creates the kernel for a real device: builds the base NullKernel part,
//! then initializes the constant buffers and the kernel parameters.
//! buildError_ becomes CL_SUCCESS only if every step succeeds.
bool Kernel::create(const std::string& code, const std::string& metadata, const void* binaryCode,
                    size_t binarySize) {
  setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);

  // Build the base kernel first, then initialize constant buffer sizes
  if (!NullKernel::create(code, metadata, binaryCode, binarySize) || !initConstBuffers()) {
    return false;
  }

  // Initialize the kernel parameters
  const bool initialized = initParameters();

  // Wave limiter needs to be initialized after kernel metadata is parsed
  // Since it depends on it.
  waveLimiter_.enable(dev().settings().ciPlus_);

  if (initialized) {
    buildError_ = CL_SUCCESS;
  }
  return initialized;
}
//! Constructs the kernel object.
//!
//! \param name      kernel name
//! \param gpuDev    device object the kernel belongs to
//! \param prog      program object that owns the kernel
//! \param initData  optional initialization data (flags and the HW private
//!                  and local sizes). May be NULL, in which case both sizes are 0.
Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
               const InitData* initData)
    : NullKernel(name, gpuDev, prog) {
  hwPrivateSize_ = 0;
  // Don't leave the local size uninitialized if there is no init data
  hwLocalSize_ = 0;

  if (NULL != initData) {
    flags_ = initData->flags_;
    hwPrivateSize_ = initData->hwPrivateSize_;
    hwLocalSize_ = initData->hwLocalSize_;
  }

  // Workgroup info private memory size
  workGroupInfo_.privateMemSize_ = hwPrivateSize_;

  // Default wavesPerSimdHint_
  workGroupInfo_.wavesPerSimdHint_ = ~0U;
}
//! Destructor: releases the kernel on all virtual GPUs and frees
//! the constant buffer sizes. Does nothing if the CAL image was never created.
Kernel::~Kernel() {
  if (calRef_ != NULL) {
    {
      // The lock is held only while the virtual GPUs are visited
      Device::ScopedLockVgpus lock(dev());

      // Release all virtual image objects on all virtual GPUs
      for (uint gpuIdx = 0; gpuIdx < dev().vgpus().size(); ++gpuIdx) {
        dev().vgpus()[gpuIdx]->releaseKernel(calImage());
      }
    }

    if (0 != numCb_) {
      delete[] cbSizes_;
    }
  }
}
//! Returns the device object of the kernel: gpuDev_ reinterpreted as a Device.
//! NOTE(review): reinterpret_cast doesn't adjust the address the way
//! static_cast does - confirm the Device class layout makes this safe
const Device& Kernel::dev() const { return reinterpret_cast<const Device&>(gpuDev_); }
//! Returns the program object of the kernel: prog_ reinterpreted as a Program.
//! NOTE(review): reinterpret_cast doesn't adjust the address the way
//! static_cast does - confirm the Program class layout makes this safe
const Program& Kernel::prog() const { return reinterpret_cast<const Program&>(prog_); }
//! Packs the compiled ISA and the resource bindings of the kernel
//! into an AMU ABI multi-binary image.
//!
//! \param imageSize  receives the size of the packed image
//! \param image      receives the pointer to the packed image
//! \param isa        the shader compiler output (SC_HWSHADER)
//! \return true if the image was packed successfully
bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) {
  const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
  bool result = false;
  AMUabiAddEncoding encoding;
  memset(&encoding, 0, sizeof(AMUabiAddEncoding));

  // One scratch allocation holds all tables of the encoding, in this order:
  // inputs, UAVs, sampler maps, constant buffers and literal constants
  size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments +
      sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers +
      sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
  char* tmpMem = new char[allocSize];
  if (tmpMem == NULL) {
    LogError("Error allocating memory");
    return false;
  }

  CalcPtr(encoding.inputs, tmpMem, 0, 0);
  CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
  CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
  CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry),
          MaxSamplers);
  if (shader->dep.NumIntrlIConstants != 0) {
    CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask),
            MaxConstBuffers);
  }

  AMUabiMultiBinary amuBinary;
  amuABIMultiBinaryCreate(&amuBinary);

  result = siCreateHwInfo(shader, encoding);
  if (!result) {
    // The multi-binary object was already created - don't leak it
    amuABIMultiBinaryDestroy(amuBinary);
    delete[] tmpMem;
    LogWarning("Error Creating program info");
    return false;
  }

  addLoopConst(shader, encoding);

  // These sections of the encoding are left empty
  unsigned int outputCount = 0, condOut = 0, earlyExit = 0, globalCount = 0, persistentCount = 0;
  unsigned int symbolCount = 0;
  CALOutputEntry* outputs = 0;
  unsigned int* globalBuffers = 0;
  unsigned int* persistentBuffers = 0;
  AMUabiUserSymbol* symbols = 0;

  CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
  CALConstantBufferMask* constBuffers = encoding.constBuffers;
  uint* inputResources = encoding.inputs;
  CALUavEntry* uav = encoding.uav;

  // The samplers returned by sampler() come first; sampler arguments are appended later
  uint inputSamplerCount = samplerSize();
  for (uint i = 0; i < inputSamplerCount; ++i) {
    inputSamplers[i].resource = 0;
    inputSamplers[i].sampler = sampler(i)->index_;
  }

  // Constant buffers 0 and 1 are always bound
  uint constBufferCount = 2;
  constBuffers[0].index = 0;
  constBuffers[1].index = 1;

  uint inputResourceCount = 0;
  uint uavCount = 0;
  bool cbBound = false;
  bool printfBound = false;

  // Bind the resources referenced by the kernel arguments
  for (uint i = 0; i < arguments_.size(); ++i) {
    const KernelArg* arg = argument(i);
    switch (arg->type_) {
      case KernelArg::PointerConst:
      case KernelArg::PointerHwConst:
        constBuffers[constBufferCount++].index = arg->index_;
        break;
      case KernelArg::PointerGlobal:
        uav[uavCount].offset = arg->index_;
        uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
        uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
        uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
        uavCount++;
        break;
      case KernelArg::ConstBufId:
        // The CB UAV is bound only once
        if (!cbBound) {
          uav[uavCount].offset = cbId_;
          uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
          uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
          uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
          uavCount++;
        }
        cbBound = true;
        break;
      case KernelArg::PrintfBufId:
        // The printf UAV is bound only once
        if (!printfBound) {
          uav[uavCount].offset = printfId_;
          uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
          uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
          uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
          uavCount++;
        }
        printfBound = true;
        break;
      case KernelArg::UavId:
        if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) {
          uav[uavCount].offset = arg->index_;
          uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
          uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
          uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
          uavCount++;
        } else {
          if (UavIdUndefined != uavRaw_) {
            uav[uavCount].offset = uavRaw_;
            uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
            uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
            uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
            uavCount++;
          }
        }
        break;
      case KernelArg::Sampler:
        inputSamplers[inputSamplerCount].resource = 0;
        inputSamplers[inputSamplerCount].sampler = arg->index_;
        inputSamplerCount++;
        break;
      case KernelArg::Image1D:
      case KernelArg::Image2D:
      case KernelArg::Image3D:
      case KernelArg::Image1DB:
      case KernelArg::Image1DA:
      case KernelArg::Image2DA:
        // Read only images are input resources, all other images are typed UAVs
        if (arg->memory_.readOnly_) {
          inputResources[inputResourceCount++] = arg->index_;
        } else {
          uav[uavCount].offset = arg->index_;
          uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
          // NOTE(review): the 2D dimension is used for every image type - confirm
          uav[uavCount].dimension = AMU_ABI_DIM_2D;
          uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
          uavCount++;
        }
        break;
      default:
        break;
    }
  }

  // Append the global constant buffers of the program
  for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
    constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
  }

  encoding.machine = nullDev().hwInfo()->machine_;
  encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
  encoding.inputCount = inputResourceCount;
  encoding.outputCount = outputCount;
  encoding.outputs = outputs;
  encoding.condOut = condOut;
  encoding.earlyExit = earlyExit;
  encoding.globalBuffersCount = globalCount;
  encoding.globalBuffers = globalBuffers;
  encoding.persistentBuffersCount = persistentCount;
  encoding.persistentBuffers = persistentBuffers;
  encoding.constBuffersCount = constBufferCount;
  encoding.inputSamplerMapCount = inputSamplerCount;
  encoding.symbolsCount = symbolCount;
  encoding.symbols = symbols;
  encoding.uavCount = uavCount;

  amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
  uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
  amuABIMultiBinaryDestroy(amuBinary);

  delete[] tmpMem;
  delete[] encoding.progInfos;

  return (success == 0) ? false : true;
}
//! Selects the local workgroup size for a dispatch.
//!
//! The size compiled into the kernel always wins. A size provided by
//! the application (non-zero \p lclWorkSize[0]) is left alone. Otherwise
//! the size comes either from the runtime overrides or from a heuristic
//! based on the global size.
//!
//! \param workDim      number of dimensions of the dispatch
//! \param gblWorkSize  global work size
//! \param lclWorkSize  in/out local work size
void Kernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
                               amd::NDRange& lclWorkSize) const {
  // The kernel has the compiled sizes - use them
  if (workGroupInfo()->compileSize_[0] != 0) {
    for (uint d = 0; d < workDim; ++d) {
      lclWorkSize[d] = workGroupInfo()->compileSize_[d];
    }
    return;
  }

  // The local workgroup size was specified - nothing to find
  if (lclWorkSize[0] != 0) {
    return;
  }

  // Use overrides when app doesn't provide workgroup dimensions
  if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) != 0) {
    switch (workDim) {
      case 1:
        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
        break;
      case 2:
        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
        break;
      case 3:
        lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
        lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
        lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
        break;
      default:
        assert(0 && "Invalid workDim!");
        break;
    }
    return;
  }

  // Find threads per group
  size_t thrPerGrp = workGroupInfo()->size_;

  // Square tiles are used if the kernel uses images, the thread group is
  // a multiple value of wavefronts and it's a 2 or 3-dimensional workload
  // with the first two global sizes being multiples of 16
  const bool useTiles = (flags() & ImageEnable) &&
      ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && (workDim > 1) &&
      ((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0);

  if (useTiles) {
    // Use 8x8 workgroup size if kernel has image writes
    // or the group size isn't the preferred one, 16x16 otherwise
    const bool smallTile =
        (flags() & ImageWrite) || (thrPerGrp != nullDev().info().preferredWorkGroupSize_);
    const size_t tile = smallTile ? 8 : 16;
    lclWorkSize[0] = tile;
    lclWorkSize[1] = tile;
    if (workDim == 3) {
      lclWorkSize[2] = 1;
    }
    return;
  }

  // Split the local workgroup into the most efficient way: every dimension
  // gets the biggest divisor of its global size that still fits
  size_t remaining = thrPerGrp;
  for (uint d = 0; d < workDim; ++d) {
    size_t divisor = remaining;
    while ((gblWorkSize[d] % divisor) != 0) {
      --divisor;
    }
    lclWorkSize[d] = divisor;
    remaining /= divisor;
  }

  // Assuming DWORD access
  const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;

  // The split is good if it consists of whole wavefronts
  // and the X size isn't too small for the cache line
  const bool wholeWaves = (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) == 0;
  if (wholeWaves && (lclWorkSize[0] >= cacheLineMatch)) {
    return;
  }

  // Couldn't find optimal workload: look for the biggest dimension
  size_t maxSize = 0;
  size_t maxDim = 0;
  for (uint d = 0; d < workDim; ++d) {
    if (maxSize < gblWorkSize[d]) {
      maxSize = gblWorkSize[d];
      maxDim = d;
    }
  }

  // Use X dimension as high priority. Runtime will assume that
  // X dimension is more important for the address calculation
  if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
    lclWorkSize[0] = cacheLineMatch;
    thrPerGrp /= cacheLineMatch;
    lclWorkSize[maxDim] = thrPerGrp;
    for (uint d = 1; d < workDim; ++d) {
      if (d != maxDim) {
        lclWorkSize[d] = 1;
      }
    }
    return;
  }

  // Check if a local workgroup has the most optimal size
  if (thrPerGrp > maxSize) {
    thrPerGrp = maxSize;
  }
  lclWorkSize[maxDim] = thrPerGrp;
  for (uint d = 0; d < workDim; ++d) {
    if (d != maxDim) {
      lclWorkSize[d] = 1;
    }
  }
}
2014-07-04 16:17:05 -04:00
//! Programs the execution grid for a single dispatch.
//!
//! Fills the ABI vectors that live in the system-memory copy of constant
//! buffer 0 (global size, local size, number of groups, global/group offsets,
//! NDRange global offset and, when printf is active, the debug buffer info)
//! and the CAL dispatch descriptor of \a gpu (gridBlock, gridSize and
//! partialGridBlock).
//!
//! \param gpu               Virtual GPU whose CB0 copy and cal_ descriptor are updated
//! \param workDim           Number of dimensions; values other than 1, 2 or 3 log a
//!                          warning and leave the 1x1x1 defaults in place
//! \param glbWorkOffset     Stored into the global-offset ABI vector
//! \param gblWorkSize       Global size used for the HW grid size and the
//!                          partial-workgroup check
//! \param lclWorkSize       Workgroup size; handed to findLocalWorkSize() before use
//! \param groupOffset       Stored into the group-offset ABI vector
//! \param glbWorkOffsetOrg  Stored into the NDRange global-offset ABI vector
//!                          (presumably the offset of the unsplit launch - TODO confirm)
//! \param glbWorkSizeOrg    Stored into the global-size ABI vector and used to derive
//!                          the number of groups reported to the kernel
void Kernel::setupProgramGrid(VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset,
                              const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize,
                              const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg,
                              const amd::NDRange& glbWorkSizeOrg) const {
  // ABI is always in CB0
  address cbBuf = gpu.cb(0)->sysMemCopy();
  uint* pGlobalSize =
      reinterpret_cast<uint*>(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
  uint* pLocalSize = reinterpret_cast<uint*>(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
  uint* pNumGroups = reinterpret_cast<uint*>(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
  uint* pGlobalOffset =
      reinterpret_cast<uint*>(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
  uint* pGroupOffset =
      reinterpret_cast<uint*>(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
  uint32_t* debugInfo = reinterpret_cast<uint32_t*>(cbBuf + DebugOffset * ConstBuffer::VectorSize);
  uint* pNDRangeGlobalOffset =
      reinterpret_cast<uint*>(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);

  // Check for 64-bit metadata: the NDRange offset entries are shifted by one DWORD
  uint glbABIShift = (abi64Bit()) ? 1 : 0;

  VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_;

  // Finds local workgroup size
  findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);

  // Initialize the execution grid block and size/offset
  pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
  pGlobalSize[3] = static_cast<uint>(workDim);

  pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
  pLocalSize[3] = 0;

  pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
  pNumGroups[3] = 0;

  pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
  pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;

  progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1;

  progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1;

  progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
      progGrid->partialGridBlock.depth = 1;

  bool partialGrid = false;

  // Fill the right values, based on the application request.
  // The cases intentionally fall through so a 3D launch also fills Y and X.
  switch (workDim) {
    case 3:
      pLocalSize[2] = progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
      pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
      progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
      progGrid->gridSize.depth /= progGrid->gridBlock.depth;
      pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
      pGlobalOffset[2] = glbWorkOffset[2];
      pGroupOffset[2] = groupOffset[2];
      pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
      // Check if partial workgroup dispatch is required
      progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
      if (progGrid->partialGridBlock.depth != 0) {
        partialGrid = true;
        // Increment the number of groups
        progGrid->gridSize.depth++;
        pNumGroups[2]++;
      } else {
        progGrid->partialGridBlock.depth = lclWorkSize[2];
      }
    // Fall through to fill 2D and 1D dimensions...
    case 2:
      pLocalSize[1] = progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
      pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
      progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
      progGrid->gridSize.height /= progGrid->gridBlock.height;
      pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
      pGlobalOffset[1] = glbWorkOffset[1];
      pGroupOffset[1] = groupOffset[1];
      pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
      // Check if partial workgroup dispatch is required
      progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
      if (progGrid->partialGridBlock.height != 0) {
        partialGrid = true;
        // Increment the number of groups
        progGrid->gridSize.height++;
        pNumGroups[1]++;
      } else {
        progGrid->partialGridBlock.height = lclWorkSize[1];
      }
    // Fall through to fill 1D dimension...
    case 1:
      pLocalSize[0] = progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
      pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
      progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
      progGrid->gridSize.width /= progGrid->gridBlock.width;
      pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
      pGlobalOffset[0] = glbWorkOffset[0];
      pGroupOffset[0] = groupOffset[0];
      pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
      // Check if partial workgroup dispatch is required
      progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
      if (progGrid->partialGridBlock.width != 0) {
        partialGrid = true;
        // Increment the number of groups
        progGrid->gridSize.width++;
        pNumGroups[0]++;
      } else {
        progGrid->partialGridBlock.width = lclWorkSize[0];
      }
      break;
    default:
      LogWarning("Wrong dimensions. Force to 1x1x1!");
      break;
  }

  // A zeroed partial block tells the dispatch that every workgroup is full
  if (!partialGrid) {
    progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
        progGrid->partialGridBlock.depth = 0;
  }

  // The 4th component of both offset vectors receives the product of the
  // per-dimension offsets.
  // NOTE(review): the original comment described this as "the total number of
  // workitems and workgroups", which does not match a product of offsets -
  // confirm what the kernel ABI expects here.
  pGlobalOffset[3] = pGroupOffset[3] = 1;
  for (uint i = 0; i < workDim; ++i) {
    pGlobalOffset[3] *= pGlobalOffset[i];
    pGroupOffset[3] *= pGroupOffset[i];
  }

  // Setup debug output buffer (if printf is active)
  if (flags() & PrintfOutput) {
    if (abi64Bit()) {
      // Setup the debug info in constant buffer (64-bit buffer offset)
      reinterpret_cast<uint64_t*>(debugInfo)[1] = gpu.printfDbg().bufOffset();
      // Size in DWORDs
      debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
      debugInfo[4] /= sizeof(uint32_t);
    } else {
      // Setup the debug info in constant buffer (32-bit buffer offset)
      debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
      // Size in DWORDs
      debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
      debugInfo[2] /= sizeof(uint32_t);
    }
  }
}
bool Kernel::initParameters() {
size_t offset = 0;
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
for (uint i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
// Initialize the arguments for the abstraction layer
if (arg->isCbNeeded()) {
desc.name_ = arg->name_.data();
desc.type_ = arg->type();
desc.size_ = arg->size(false);
desc.addressQualifier_ = arg->addressQualifier();
desc.accessQualifier_ = arg->accessQualifier();
desc.typeName_ = arg->typeName();
desc.typeQualifier_ = arg->typeQualifier();
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
if (size == 0) {
// Local memory for CPU
size = sizeof(cl_mem);
}
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
}
// Report the allocated local memory size (emulated and hw)
if (hwLocalSize_ != 0) {
CondLog((dev().info().localMemSize_ < hwLocalSize_),
"Requested local size is bigger than reported");
workGroupInfo_.localMemSize_ = hwLocalSize_;
}
2014-07-04 16:17:05 -04:00
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
return false;
}
2014-07-04 16:17:05 -04:00
return true;
}
2014-07-04 16:17:05 -04:00
//! Binds every HW constant buffer the program uses for its global data store.
//!
//! Each buffer is bound as a ConstantBuffer resource with its map key used
//! both as the parameter index and as the physical unit.
//!
//! \param gpu   Virtual GPU the buffers are bound on
//! \param desc  Not used by this implementation
//! \return true only if every binding succeeded
bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const {
  bool result = true;

  // Bind HW constant buffers used for the global data store
  const Program::HwConstBuffers& gds = prog().glbHwCb();
  for (const auto& it : gds) {
    uint idx = it.first;
    // Accumulate the status: a plain assignment would let a later success
    // hide an earlier failure
    result &= bindResource(gpu, *(it.second), idx, ConstantBuffer, idx);
  }

  return result;
}
2014-07-04 16:17:05 -04:00
//! Uploads the argument constant buffers to HW and binds them.
//!
//! For each of the numCb_ constant buffers the first cbSizes_[i] bytes are
//! uploaded and, if the upload succeeded, the buffer is bound to unit i at its
//! current write offset. All buffers are processed even after a failure.
//!
//! \param gpu  Virtual GPU that owns the constant buffers
//! \return true only if every upload and binding succeeded
bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const {
  bool result = true;

  assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!");

  // Upload the parameters to HW and bind all constant buffers
  for (uint i = 0; i < numCb_; i++) {
    ConstBuffer* cb = gpu.constBufs_[i];
    // wrtOffset() is read only after a successful upload (short-circuit)
    result &= cb->uploadDataToHw(cbSizes_[i]) &&
        bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset());
  }

  return result;
}
//! Validates the memory objects of a launch against the queue's dependency tracker.
//!
//! Walks the kernel signature and, for every pointer argument that is not
//! (HW) local memory, resolves the GPU memory object, synchronizes it from the
//! host when it comes from an amd::Memory, and registers it with
//! gpu.memoryDependency().
//!
//! \param gpu        Virtual GPU whose dependency tracker is updated
//! \param kernel     Abstraction layer kernel providing signature and parameters
//! \param params     Raw parameter block of the launch
//! \param nativeMem  true if the memory object array already holds gpu::Memory pointers
void Kernel::processMemObjects(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
                               bool nativeMem) const {
  // Mark the tracker with a new kernel,
  // so we can avoid checks of the aliased objects
  gpu.memoryDependency().newKernel();

  // Check all parameters for the current kernel
  const amd::KernelSignature& signature = kernel.signature();
  amd::Memory* const* memories =
      reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());

  for (size_t i = 0; i < signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    const KernelArg* arg = argument(i);
    Memory* memory = nullptr;

    // Find if current argument is a buffer
    if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) &&
        (arg->type_ != KernelArg::PointerHwLocal)) {
      uint32_t index = desc.info_.arrayIndex_;
      if (nativeMem) {
        memory = reinterpret_cast<Memory* const*>(memories)[index];
      } else if (*reinterpret_cast<amd::Memory* const*>(params + desc.offset_) != nullptr) {
        memory = dev().getGpuMemory(memories[index]);
        // Synchronize data with other memory instances if necessary.
        // The lookup result is checked first, matching the NULL test below.
        if (memory != nullptr) {
          memory->syncCacheFromHost(gpu);
        }
      }

      if (memory != nullptr) {
        // Validate memory for a dependency in the queue
        gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_);
      }
    }
  }
}
2014-07-04 16:17:05 -04:00
//! Loads all kernel arguments for a launch and binds the required resources.
//!
//! Order of operations: initialize the local/private ranges, optionally bind
//! the dummy page to the raw UAV, set every argument of the signature, then
//! update the ranges, bind the constant buffers and (if requested by the
//! kernel flags) the global buffer and the printf debug buffer.
//!
//! \param gpu        Virtual GPU used for the launch
//! \param kernel     Abstraction layer kernel providing the signature
//! \param params     Raw parameter block of the launch
//! \param nativeMem  Forwarded to setArgument()
//! \return true if every step succeeded
bool Kernel::loadParameters(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
                            bool nativeMem) const {
  bool result = true;
  uint i;

  // Initialize local private ranges
  if (!initLocalPrivateRanges(gpu)) {
    return false;
  }

  if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
    Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
    // Bind a buffer for a dummy read
    result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_);
  }

  // Find all parameters for the current kernel
  const amd::KernelSignature& signature = kernel.signature();
  for (i = 0; i != signature.numParameters(); ++i) {
    const amd::KernelParameterDescriptor& desc = signature.at(i);
    // Set current argument
    if (!setArgument(gpu, kernel, i, params, desc, nativeMem)) {
      result = false;
      break;
    }
  }

  if (result) {
    // Update the ring ranges and math constant
    setLocalPrivateRanges(gpu);

    result = bindConstantBuffers(gpu);

    if (flags() & PrivateFixed) {
      result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
    }

    // Setup debug output buffer (if printf is active)
    if (flags() & PrintfOutput) {
      gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
    }
  }

  return result;
}
//! Submits the dispatch described by gpu.cal() and releases the bound resources.
//!
//! \param gpu          Virtual GPU the kernel is dispatched on
//! \param calEvent     Receives the event of the dispatch; must not be NULL
//!                     because it is dereferenced unconditionally
//! \param lastRun      Forwarded to unbindResources()
//! \param lastDoppCmd  Forwarded to the dispatch call
//! \param pfpaDoppCmd  Forwarded to the dispatch call
//! \return always true
bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd,
                 bool pfpaDoppCmd) const {
  const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal();

  // Apply the current wave limit to the compute program before dispatching
  auto compProg = static_cast<gsl::ComputeProgramObject*>(gpu.gslKernelDesc()->func_);
  compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu));

  gpu.eventBegin(MainEngine);
  gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock,
                     &dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_,
                     lastDoppCmd, pfpaDoppCmd);
  gpu.flushCUCaches();
  gpu.eventEnd(MainEngine, *calEvent);

  // Unbind all resources
  unbindResources(gpu, *calEvent, lastRun);

  return true;
}
2014-07-04 16:17:05 -04:00
//! Running index used as a file-name prefix for the dumps written by Kernel::debug()
static size_t counter = 0;

//! Dumps the kernel's buffer/image arguments and constant buffers to files.
//!
//! For every global/constant buffer argument and every image argument in the
//! Image1D..Image3D range the bound memory object is mapped and written to
//! "<counter>_kernel_<name>_<arg>_<address>.bin". Afterwards every argument
//! constant buffer and every global HW constant buffer is written to
//! "<counter>_kernel_<name>_const<index>.bin"; the counter advances once per
//! constant buffer file. Progress is reported on std::cerr.
//!
//! \param gpu  Virtual GPU whose slots and constant buffers are dumped
void Kernel::debug(VirtualGPU& gpu) const {
  std::fstream stubWrite;

  std::cerr << "--- " << name_ << " ---" << std::endl;

  // Dump all buffer and image arguments
  for (uint i = 0; i < arguments_.size(); ++i) {
    const KernelArg* arg = argument(i);
    const bool bufferObj =
        ((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) ||
         (arg->type_ == KernelArg::PointerHwConst));
    const bool imageObj =
        ((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D));
    if (!bufferObj && !imageObj) {
      continue;
    }

    //@todo Replace the current map
    Memory* resource = const_cast<Memory*>(gpu.slots_[i].memory_);
    if (resource == NULL) {
      // No memory object is associated with this slot, nothing to map
      continue;
    }

    void* memory = resource->map(&gpu);
    uint* location = reinterpret_cast<uint*>(memory);
    std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;

    // Dump the data; a NULL mapping is skipped instead of being read
    if (location != NULL) {
      std::stringstream fileName;
      fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
      stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
      // Write data to a file
      if (stubWrite.is_open()) {
        stubWrite.write(reinterpret_cast<char*>(location), resource->size());
        stubWrite.close();
      }
    }
    resource->unmap(&gpu);
  }

  // Dump the constant buffers used for the arguments
  for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
    std::stringstream fileName;
    fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
    stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
    if (stubWrite.is_open()) {
      address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
      // Only the region of the last write is dumped
      if (memory != NULL) {
        stubWrite.write(reinterpret_cast<char*>(memory + gpu.cb(i)->wrtOffset()),
                        gpu.cb(i)->lastWrtSize());
      }
      gpu.constBufs_[i]->unmap(&gpu);
      stubWrite.close();
    }
  }

  // Dump the HW constant buffers of the global data store
  const Program::HwConstBuffers& gds = prog().glbHwCb();
  for (const auto& it : gds) {
    uint idx = it.first;
    std::stringstream fileName;
    fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
    stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
    if (stubWrite.is_open()) {
      address memory = reinterpret_cast<address>(it.second->map(&gpu, Resource::ReadOnly));
      if (memory != NULL) {
        stubWrite.write(reinterpret_cast<char*>(memory), it.second->size());
      }
      it.second->unmap(&gpu);
      stubWrite.close();
    }
  }
}
bool Kernel::initConstBuffers() {
bool result = true;
size_t i;
assert((numCb_ != 0) && "We have 0 constant buffers!");
// Allocate an array for CB sizes
cbSizes_ = new size_t[numCb_];
if (cbSizes_ == NULL) {
return false;
}
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
// CB0 is reserved for ABI data
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
// Find sizes of all constant buffers
for (i = 0; i < arguments_.size(); ++i) {
const KernelArg* arg = argument(i);
size_t size = arg->cbPos_ + arg->size(true);
size_t specVec = arg->specialVector();
if (specVec != 0) {
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize;
}
// Do we need a CB?
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
cbSizes_[arg->cbIdx_] = size;
}
}
return result;
}
//! Programs the samplers that are defined inside the kernel itself.
//!
//! A sampler unit is reprogrammed only when the state cached in the virtual
//! GPU differs from the state stored in the sampler argument (cbPos_).
//!
//! \param gpu  Virtual GPU whose sampler units are updated
//! \return always true
bool Kernel::setInternalSamplers(VirtualGPU& gpu) const {
  for (uint n = 0; n < samplerSize(); ++n) {
    const KernelArg* samplerArg = sampler(n);
    const uint wanted = samplerArg->cbPos_;
    const uint unit = samplerArg->index_;

    // Already programmed with the requested state
    if (gpu.cal()->samplersState_[unit] == wanted) {
      continue;
    }

    setSampler(gpu, wanted, unit);
    gpu.cal_.samplersState_[unit] = wanted;
  }

  return true;
}
//! Processes a single kernel argument of a launch.
//!
//! Depending on the argument type this binds the memory object / image /
//! counter to the HW, programs a sampler, reserves HW local memory, or copies
//! the by-value data, and stores the matching value (offset, image constants,
//! sampler state, raw data) at arg->cbPos_ in the system-memory copy of
//! constant buffer 1.
//!
//! \param gpu        Virtual GPU used for the launch
//! \param kernel     Abstraction layer kernel (provides the memory/sampler object arrays)
//! \param idx        Argument index; must be smaller than arguments_.size()
//! \param params     Raw parameter block of the launch
//! \param desc       Signature descriptor of the argument (size, offset, array index)
//! \param nativeMem  true if the memory object slots already hold gpu::Memory pointers
//! \return false if a binding failed or the passed object has the wrong kind;
//!         true otherwise (including for unhandled types, which only log an error)
bool Kernel::setArgument(VirtualGPU& gpu, const amd::Kernel& kernel, uint idx,
                         const_address params, const amd::KernelParameterDescriptor& desc,
                         bool nativeMem) const {
  size_t size = desc.size_;
  const void* param = params + desc.offset_;
  bool result = true;
  const KernelArg* arg;
  address memory;
  size_t argSize;
  static const bool waitOnBusyEngine = true;

  assert((idx < arguments_.size()) && "Param index is out of range!");
  arg = argument(idx);
  assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
  memory = gpu.cb(1)->sysMemCopy();
  argSize = arg->size(true);

  // Bind the global heap for emulation mode
  switch (arg->type_) {
    case KernelArg::PointerLocal:
    case KernelArg::PointerPrivate:
      if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
        return false;
      }
    // Fall through ...
    default:
      break;
  }

  switch (arg->type_) {
    case KernelArg::PointerConst:
    case KernelArg::PointerHwConst:
    case KernelArg::PointerGlobal: {
      gpu::Memory* gpuMem = NULL;
      amd::Memory* const* memories =
          reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
      uint32_t index = desc.info_.arrayIndex_;
      if (nativeMem) {
        gpuMem = reinterpret_cast<Memory*>(memories[index]);
      } else if (memories[index] != nullptr) {
        gpuMem = dev().getGpuMemory(memories[index]);
      }

      // A NULL buffer is replaced with the dummy page and a zero offset
      bool forceZeroOffset = false;
      if (gpuMem == NULL) {
        forceZeroOffset = true;
        gpuMem = dev().getGpuMemory(dev().dummyPage());
      }
      uint64_t offset = gpuMem->pinOffset();

      // Make sure the passed argument is a buffer object
      if (!gpuMem->cal()->buffer_) {
        LogError("The kernel buffer argument isn't a buffer object!");
        return false;
      }

      if (arg->type_ == KernelArg::PointerHwConst) {
        // Bind current memory object with the kernel
        if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) {
          return false;
        }
        assert((offset == 0) && "No offset for HW CB");
        // Add a fake offset to make sure (ptr != NULL) is TRUE
        offset = 1;
      } else {
        ResourceType type = ArgumentHeapBuffer;

        // Check if kernel expects UAV binding
        if (arg->memory_.uavBuf_) {
          type = ArgumentBuffer;
        } else {
          // Bind global buffer to UAV this buffer is bound to
          if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
            return false;
          }
        }

        // Bind current memory object with the kernel
        // Note: it's a fake binding, if the buffer is part of
        // the global heap
        if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) {
          return false;
        }

        // Update offset only if we bind HeapBuffer or
        // it's global address space in UAV setup on SI+
        offset += gpuMem->hbOffset();
        if (!forceZeroOffset) {
          assert((offset != 0) && "Offset 0 with a real allocation!");
        }
        gpu.addVmMemory(gpuMem);
      }

      // Wait for resource if it was used on an inactive engine
      //! \note syncCache may call DRM transfer
      gpuMem->wait(gpu, waitOnBusyEngine);

      if (forceZeroOffset) {
        offset = 0;
      }

      // Copy memory offset into the constant buffer
      if (abi64Bit()) {
        *(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
      } else {
        *(reinterpret_cast<uint*>(memory + arg->cbPos_)) = static_cast<uint>(offset);
      }
    } break;

    case KernelArg::Image1D:
    case KernelArg::Image2D:
    case KernelArg::Image3D:
    case KernelArg::Image1DB:
    case KernelArg::Image1DA:
    case KernelArg::Image2DA: {
      gpu::Memory* gpuMem = NULL;
      amd::Memory* const* memories =
          reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
      uint32_t index = desc.info_.arrayIndex_;
      if (nativeMem) {
        gpuMem = reinterpret_cast<Memory*>(memories[index]);
      } else if (memories[index] != nullptr) {
        gpuMem = dev().getGpuMemory(memories[index]);
      }
      if (gpuMem == NULL) {
        return false;
      }

      // Make sure the passed argument is an image object
      if (gpuMem->cal()->buffer_) {
        LogError("The kernel image argument isn't an image object!");
        return false;
      }

      ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite;

      // Bind current memory object with the shader.
      if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) {
        return false;
      }

      // Wait for resource if it was used on an inactive engine
      //! \note syncCache may call DRM transfer
      gpuMem->wait(gpu, waitOnBusyEngine);

      // Copy image constants into the constant buffer
      if (gpuMem->owner() != NULL) {
        copyImageConstants(gpuMem->owner()->asImage(),
                           reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
      }

      // Handle DOPP texture resource
      gslMemObject gslMem = gpuMem->gslResource();
      if (gslMem->getAttribs().isDOPPDesktopTexture) {
        gpu.addVmMemory(gpuMem);
      }
    } break;

    case KernelArg::Sampler: {
      uint32_t index = desc.info_.arrayIndex_;
      const amd::Sampler* amdSampler = reinterpret_cast<amd::Sampler* const*>(
          params + kernel.parameters().samplerObjOffset())[index];
      uint unit = arg->index_;
      uint32_t state = amdSampler->state();
      // Reprogram the sampler unit only if its cached state differs
      if (state != gpu.cal()->samplersState_[unit]) {
        setSampler(gpu, state, unit);
        gpu.cal_.samplersState_[unit] = state;
      }
      // Copy sampler state into the constant buffer
      *(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
    } break;

    case KernelArg::Counter: {
      gpu::Memory* gpuMem = NULL;
      if (nativeMem) {
        gpuMem = *reinterpret_cast<Memory* const*>(param);
      } else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
        gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
      }
      // Without a memory object there is nothing to wait on or to bind
      if (gpuMem == NULL) {
        LogError("The kernel counter argument isn't a valid memory object!");
        return false;
      }

      // Wait for resource if it was used on an inactive engine
      //! \note syncCache may call DRM transfer
      gpuMem->wait(gpu, waitOnBusyEngine);

      // Bind current memory object with the shader.
      if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) {
        return false;
      }
    } break;

    case KernelArg::PointerHwLocal: {
      // Calculate current offset in the local ring
      uint offset = gpu.cal_.localSize;
      uint extra = amd::alignUp(offset, arg->alignment_) - offset;
      offset = amd::alignUp(offset, arg->alignment_);
      // The parameter slot carries the requested size of the local allocation
      size_t memSize = *static_cast<const uintptr_t*>(param);

      // Allocate new memory from the local ring
      gpu.cal_.localSize += static_cast<uint>(memSize) + extra;

      // Copy current local argument's offset into the CB
      *(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;

      CondLog((gpu.cal_.localSize > dev().info().localMemSize_),
              "Requested local size is bigger than reported!");
    } break;

    case KernelArg::Float:
    case KernelArg::Double:
    case KernelArg::Char:
    case KernelArg::UChar:
    case KernelArg::Short:
    case KernelArg::UShort:
    case KernelArg::Int:
    case KernelArg::UInt:
    case KernelArg::Long:
    case KernelArg::ULong:
      if (size != argSize) {
        LogWarning("Parameter's sizes are unmatched!");
      }
    // Fall through ...

    case KernelArg::Struct:
    case KernelArg::Union: {
      size_t specVec = arg->specialVector();
      if (specVec != 0) {
        // Special vectors are spread over the CB: each chunk of
        // specVec * VectorSizeLimit bytes starts on its own CB vector
        uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
        for (uint i = 0; i < iter; ++i) {
          amd::Os::fastMemcpy(
              (memory + arg->cbPos_ + i * ConstBuffer::VectorSize),
              reinterpret_cast<const char*>(param) + i * KernelArg::VectorSizeLimit * specVec,
              specVec * KernelArg::VectorSizeLimit);
        }
      } else {
        // Copy data into the CB
        amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
      }
    } break;

    default:
      LogError("Unhandled argument's type!");
      break;
  }

  return result;
}
//! Resets the per-launch local memory state and binds the kernel-wide UAVs.
//!
//! The HW local size starts at the size the kernel itself requires and the
//! iteration count at 1. The global buffer is bound to the raw UAV when printf
//! is active without a dedicated printf UAV, to the emulated constant buffer
//! UAV when cbId_ is defined and to the printf UAV when printfId_ is defined.
//!
//! \param gpu  Virtual GPU used for the launch
//! \return false as soon as one of the bindings fails, true otherwise
bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const {
  // Initialize HW local
  gpu.cal_.localSize = hwLocalSize_;

  // Printf output without its own UAV goes through the raw UAV
  const bool printfOnRawUav = (flags() & PrintfOutput) && (printfId_ == UavIdUndefined);
  if (printfOnRawUav && (uavRaw_ != UavIdUndefined) &&
      !bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
    return false;
  }

  // Emulated constant buffers live in the global buffer
  if ((cbId_ != UavIdUndefined) &&
      !bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
    return false;
  }

  // Dedicated printf UAV
  if ((printfId_ != UavIdUndefined) &&
      !bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
    return false;
  }

  // Initialize the iterations count
  gpu.cal_.iterations_ = 1;

  return true;
}
//! Finalizes the ABI constant buffer (CB0) after all arguments were set.
//!
//! Writes the private ring vector (when the kernel has the PrivateFixed flag),
//! the math library constants and the offset of the program's global data,
//! then decides how many iterations the dispatch is split into.
//!
//! \param gpu  Virtual GPU used for the launch
void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const {
  address abiBase = gpu.cb(0)->sysMemCopy();

  // Number of workgroups and workitems per workgroup of the programmed grid
  const uint groupCount =
      gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth;
  const uint groupSize =
      gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * gpu.cal()->gridBlock.depth;

  //! \todo validate if the compiler still generates PrivateFixed
  if (flags() & PrivateFixed) {
    // Update private ring: it points at the dummy page
    uint* ring = reinterpret_cast<uint*>(abiBase + PrivateRingOffset * ConstBuffer::VectorSize);
    Memory* dummy = dev().getGpuMemory(dev().dummyPage());
    if (!abi64Bit()) {
      ring[0] = static_cast<uint>(dummy->hbOffset());
      ring[1] = 0;
      ring[3] = 0;
      ring[2] = 0;
    } else {
      // The 64-bit offset occupies the first two DWORDs
      reinterpret_cast<uint64_t*>(ring)[0] = dummy->hbOffset();
      ring[2] = 0;
      ring[3] = 0;
    }
    gpu.addVmMemory(dummy);
  }

  // Copy the math lib constants
  amd::Os::fastMemcpy((abiBase + MathLibOffset * ConstBuffer::VectorSize), MathLibConst,
                      sizeof(MathLibConst));

  // Update the offset to the global data
  if (prog().glbData() != NULL) {
    gpu.addVmMemory(prog().glbData());
    const uint64_t dataOffset = prog().glbData()->hbOffset();
    address slot = abiBase + GlobalDataStoreOffset * ConstBuffer::VectorSize;
    if (abi64Bit()) {
      *reinterpret_cast<uint64_t*>(slot) = dataOffset;
    } else {
      *reinterpret_cast<uint*>(slot) = static_cast<uint>(dataOffset);
    }
  }

  // Split workload if it was requested
  if (gpu.cal_.iterations_ < 2) {
    if (gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
      // NOTE(review): the product is computed in 32 bits, as before - confirm
      // that launches cannot exceed that range
      const uint workItems = groupCount * groupSize;
      if (workItems > gpu.dmaFlushMgmt().dispatchSplitSize()) {
        gpu.cal_.iterations_ =
            std::max(gpu.cal_.iterations_, (workItems / gpu.dmaFlushMgmt().dispatchSplitSize()));
      }
    }
  }

  // With printf active the number of iterations is the number of workgroups
  if (flags() & PrintfOutput) {
    gpu.cal_.iterations_ = groupCount;
  }
}
//! Programs the addressing and filtering mode of one sampler unit.
//!
//! The normalized-coordinates bit of \a state is ignored. The address bits
//! select the wrap mode applied to S, T and R (clamp-to-border unless repeat,
//! clamp-to-edge or mirrored repeat is requested); what remains afterwards
//! selects linear filtering when it equals StateFilterLinear, nearest otherwise.
//!
//! \param gpu       Virtual GPU that owns the sampler
//! \param state     amd::Sampler state bits
//! \param physUnit  Physical sampler unit to program
void Kernel::setSampler(VirtualGPU& gpu, uint32_t state, uint physUnit) const {
  // All CAL sampler's parameters are in floats
  float gslAddress = GSL_CLAMP_TO_BORDER;
  float gslMinFilter = GSL_MIN_NEAREST;
  float gslMagFilter = GSL_MAG_NEAREST;

  state &= ~amd::Sampler::StateNormalizedCoordsMask;

  // Program the sampler address mode
  switch (state & amd::Sampler::StateAddressMask) {
    case amd::Sampler::StateAddressRepeat:
      gslAddress = GSL_REPEAT;
      break;
    case amd::Sampler::StateAddressClampToEdge:
      gslAddress = GSL_CLAMP_TO_EDGE;
      break;
    case amd::Sampler::StateAddressMirroredRepeat:
      gslAddress = GSL_MIRRORED_REPEAT;
      break;
    case amd::Sampler::StateAddressClamp:
    case amd::Sampler::StateAddressNone:
    default:
      // Keep the clamp-to-border default
      break;
  }
  state &= ~amd::Sampler::StateAddressMask;

  gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
  gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
  gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);

  // Program texture filter mode
  if (state == amd::Sampler::StateFilterLinear) {
    gslMinFilter = GSL_MIN_LINEAR;
    gslMagFilter = GSL_MAG_LINEAR;
  }

  gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
  gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
}
//! Binds a memory object to a HW unit for the current launch.
//!
//! The first stage does the bookkeeping: the global, constant-buffer and printf
//! UAVs are bound once per launch (tracked in gpu.state_), argument resources
//! are recorded in gpu.slots_[paramIdx] and bound once per slot. Counters and
//! heap buffers are fully handled in this stage. The second stage programs the
//! physical unit (UAV, constant buffer or read image) when the object cached
//! for that unit differs.
//!
//! \param gpu       Virtual GPU the resource is bound on
//! \param memory    Memory object to bind
//! \param paramIdx  Argument index; selects the slot for argument resources
//! \param type      Kind of binding to perform
//! \param physUnit  Physical unit; replaced by uavRaw_/cbId_/printfId_ for the
//!                  GlobalBuffer/ArgumentCbID/ArgumentPrintfID types
//! \param offset    Offset inside the constant buffer (constant buffer types only)
//! \return true if the resource is bound (or was bound already), false on failure
bool Kernel::bindResource(VirtualGPU& gpu, const Memory& memory, uint paramIdx, ResourceType type,
                          uint physUnit, size_t offset) const {
  gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;

  // Find the original resource name from the IL program
  switch (type) {
    case GlobalBuffer:
      if (gpu.state_.boundGlobal_) {
        return true;
      }
      gpu.state_.boundGlobal_ = true;
      physUnit = uavRaw_;
      uavType = GSL_UAV_TYPE_TYPELESS;
      break;

    case ArgumentCbID:
      if (gpu.state_.boundCb_) {
        return true;
      }
      gpu.state_.boundCb_ = true;
      physUnit = cbId_;
      uavType = GSL_UAV_TYPE_TYPELESS;
      break;

    case ArgumentPrintfID:
      if (gpu.state_.boundPrintf_) {
        return true;
      }
      gpu.state_.boundPrintf_ = true;
      physUnit = printfId_;
      uavType = GSL_UAV_TYPE_TYPELESS;
      break;

    case ArgumentHeapBuffer:
    case ArgumentBuffer:
    case ArgumentImageRead:
    case ArgumentImageWrite:
    case ArgumentConstBuffer:
    case ArgumentCounter:
      // Early exit if resource is bound already
      if (gpu.slots_[paramIdx].state_.bound_) {
        return true;
      }

      // Associate resource with the slot
      gpu.slots_[paramIdx].memory_ = &memory;

      // Mark resource as bound
      gpu.slots_[paramIdx].state_.bound_ = true;

      if (type == ArgumentCounter) {
        GpuEvent calEvent;
        // Bind memory with atomic counter
        gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource());
        // Copy the counter value into GDS
        gpu.eventBegin(MainEngine);
        gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false);
        gpu.eventEnd(MainEngine, calEvent);
        // Mark resource as busy
        memory.setBusy(gpu, calEvent);
        return true;
      } else if (type == ArgumentHeapBuffer) {
        // We return here, since we just have to bind the global heap
        return true;
      } else if (type == ArgumentConstBuffer) {
        gpu.slots_[paramIdx].state_.constant_ = true;
      }
      break;

    case ArgumentUavID:
    case ConstantBuffer:
      break;

    default:
      LogPrintfError("Unspecified argument type (%d)!", static_cast<int>(type));
      return false;
  }

  gslMemObject gslMem = NULL;

  // Use global address space on SI+ for UAV setup
  if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) ||
      (type == ArgumentPrintfID)) {
    gslMem = dev().heap().resource().gslResource();
  } else {
    gslMem = memory.gslResource();
  }

  // Associate memory with the physical unit, the actual binding
  bool result = true;
  switch (type) {
    case GlobalBuffer:
    case ArgumentBuffer:
    case ArgumentImageWrite:
    case ArgumentUavID:
    case ArgumentCbID:
    case ArgumentPrintfID:
      if (type == ArgumentImageWrite) {
        uavType = GSL_UAV_TYPE_TYPED;
      } else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
        uavType = GSL_UAV_TYPE_TYPELESS;
      }
      // Skip the HW update if this object is already bound to the unit
      if (gpu.cal_.uavs_[physUnit] != gslMem) {
        result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
        gpu.setUAVChannelOrder(physUnit, gslMem);
        gpu.cal_.uavs_[physUnit] = gslMem;
      }
      break;

    case ConstantBuffer:
    case ArgumentConstBuffer:
      // A non-zero offset always forces a rebind
      if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
        result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize());
        gpu.cal_.constBuffers_[physUnit] = gslMem;
      }
      break;

    case ArgumentImageRead:
      if (gpu.cal_.readImages_[physUnit] != gslMem) {
        result = gpu.setInput(physUnit, gslMem);
        gpu.cal_.readImages_[physUnit] = gslMem;
      }
      break;

    default:
      result = false;
      assert(false);
      break;
  }

  if (!result) {
    // %p is used for the object: a pointer must not be passed to %x
    LogPrintfError("setMem failed unit:%u mem:%p!", physUnit, static_cast<const void*>(gslMem));
    return false;
  }

  return true;
}
//! Releases the resources bound for a launch and marks them busy.
//!
//! On the last run of a (possibly split) execution every bound argument slot
//! is processed: counters are synchronized back from GDS, writable memory is
//! reported as dirty to its owner, the memory is marked busy and the slot
//! state is cleared; the global, constant-buffer and printf bindings are
//! reset as well. On every run the argument constant buffers and, if the
//! kernel uses scratch registers, the scratch buffer are marked busy.
//!
//! \param gpu       Virtual GPU used for the launch
//! \param calEvent  Event of the dispatch the resources become busy with
//! \param lastRun   true if this was the last dispatch of the execution
void Kernel::unbindResources(VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const {
  // Make sure unbind occurs on the last run, in case the execution had a split
  if (lastRun) {
    for (uint i = 0; i < arguments_.size(); ++i) {
      if (gpu.slots_[i].state_.bound_) {
        GpuEvent calEventTmp = calEvent;
        if (KernelArg::Counter == argument(i)->type_) {
          // Copy the counter value from GDS
          gpu.eventBegin(MainEngine);
          gpu.cs()->syncAtomicCounter(argument(i)->index_, true);
          gpu.eventEnd(MainEngine, calEventTmp);
        } else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) {
          // Signal the abstraction layer that GPU memory is dirty
          if (gpu.slots_[i].memory_->owner() != NULL) {
            gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
          }
        }

        // Mark resource as busy
        gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);

        // Clear all state flags of the slot
        gpu.slots_[i].state_.value_ = 0;
      }
    }

    // Unbind the global buffer
    gpu.state_.boundGlobal_ = false;

    // Unbind the constant buffer
    gpu.state_.boundCb_ = false;

    // Unbind the printf buffer
    gpu.state_.boundPrintf_ = false;
  }

  // Mark CB busy
  for (uint i = 0; i < numCb_; ++i) {
    gpu.constBufs_[i]->setBusy(gpu, calEvent);
  }

  // Set the event object for the scratch buffer
  if (workGroupInfo()->scratchRegs_ > 0) {
    dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent);
  }
}
//! Fills the image constants the kernel reads from the constant buffer.
//!
//! Stores the image dimensions as integers, their reciprocals as floats, and
//! the channel data type and channel order of the image format.
//!
//! \param amdImage   Image the constants are taken from
//! \param imageData  Destination structure inside the constant buffer
void Kernel::copyImageConstants(const amd::Image* amdImage, ImageConstants* imageData) const {
  imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
  imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
  imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
  imageData->dataType_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);

  // Reciprocal dimensions
  imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
  imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
  imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
  imageData->channelOrder_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
}
//! LLVM metadata version, accessible as three 16-bit fields or as one 64-bit value
union MetadataVersion {
  struct {
    uint64_t revision_ : 16;      //!< LLVM metadata revision
    uint64_t minorVersion_ : 16;  //!< LLVM metadata minor version
    uint64_t majorVersion_ : 16;  //!< LLVM metadata major version
  };
  uint64_t value_;  //!< All fields as a single value

  //! Builds a version from its components; each component is truncated to 16 bits
  MetadataVersion(uint mj, uint mi, uint rev) : value_(0) {
    revision_ = rev;
    minorVersion_ = mi;
    majorVersion_ = mj;
  }

  //! Builds the zero version
  MetadataVersion() : value_(0) {}
};
//! Version of metadata with buffer attributes
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
//! Version of metadata with type qualifiers
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
//! Parses the compiler generated metadata string of the kernel.
//!
//! The metadata is a sequence of ';'-separated tags. Every tag is matched
//! against the ArgState table; "special" tags update kernel-wide state
//! (workgroup sizes, CB/printf IDs, metadata version, reflection info), all
//! other tags describe a kernel argument or a sampler which is appended to
//! the kernel's argument/sampler lists.
//!
//! \param metaData     Metadata text to parse
//! \param uavRefCount  Per-UAV reference counters; the entry selected by the
//!                     index of every pointer/UAV argument is incremented.
//!                     NOTE(review): the array size isn't visible here, so the
//!                     index coming from the metadata can't be validated in
//!                     this function - confirm the caller sizes it for the
//!                     largest possible UAV index.
//! \return true on success, false if the metadata is malformed, an allocation
//!         fails or the requested compile size exceeds the limited workgroup size
bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) {
  // Initialize workgroup info
  workGroupInfo_.size_ = nullDev().info().preferredWorkGroupSize_;
  MetadataVersion mdVersion;

  // Find first tag
  size_t pos = metaData.find(";");
  // Loop through all provided program arguments
  while (pos != std::string::npos) {
    KernelArg arg;
    if (!expect(metaData, &pos, ";")) {
      break;
    }
    arg.type_ = KernelArg::NoType;

    // Loop through all available metadata types.
    // Note: "continue" inside the switch below advances to the next ArgState
    // entry, while "break" leaves the switch and processes an argument.
    for (uint i = 0; i < ArgStateTotal; ++i) {
      uint tmpValue;
      // Find the name tag
      if (expect(metaData, &pos, ArgState[i].typeName_)) {
        switch (ArgState[i].type_) {
          case KernelArg::NoType:
            // Process next ...
            continue;
          case KernelArg::Reflection: {
            uint argIdx;
            // Read the argument's index
            if (!getuint(metaData, &pos, &argIdx)) {
              LogWarning("Couldn't get the argument index!");
              return false;
            }
            // The index comes from the metadata text, so validate it
            // before it's used to access the argument list
            if (argIdx >= arguments_.size()) {
              LogWarning("The argument index is out of range!");
              return false;
            }
            KernelArg* tmpArg = arguments_[argIdx];
            if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
              LogWarning("Couldn't get the argument type!");
              return false;
            }
          }
            continue;
          case KernelArg::ConstArg: {
            uint argIdx;
            // Read the argument's index
            if (!getuint(metaData, &pos, &argIdx)) {
              LogWarning("Couldn't get the argument index!");
              return false;
            }
            // The index comes from the metadata text, so validate it
            // before it's used to access the argument list
            if (argIdx >= arguments_.size()) {
              LogWarning("The argument index is out of range!");
              return false;
            }
            KernelArg* tmpArg = arguments_[argIdx];
            tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
          }
            continue;
          case KernelArg::Grouping:
            for (uint j = 0; j < 3; ++j) {
              uint temp;
              // Read the compile workgroup size
              if (!getuint(metaData, &pos, &temp)) {
                LogWarning("Couldn't get the compile workgroup size!");
                return false;
              }
              workGroupInfo_.compileSize_[j] = temp;
            }
            // Process next ...
            continue;
          case KernelArg::WrkgrpSize: {
            uint temp;
            // Read the workgroup size
            if (!getuint(metaData, &pos, &temp)) {
              LogWarning("Couldn't get the workgroup size!");
              return false;
            }
            workGroupInfo_.size_ = temp;
          }
            // Process next ...
            continue;
          case KernelArg::Wavefront:
            // Process next ...
            continue;
          case KernelArg::UavId:
            // Read index
            if (!getuint(metaData, &pos, &arg.index_)) {
              return false;
            }
            break;
          case KernelArg::ConstBufId:
            // Read index
            if (!getuint(metaData, &pos, &cbId_)) {
              return false;
            }
            continue;
          case KernelArg::PrintfBufId:
            // Read index
            if (!getuint(metaData, &pos, &printfId_)) {
              return false;
            }
            continue;
          case KernelArg::MetadataVersion:
            // Read metadata version: major, minor, revision
            if (!getuint(metaData, &pos, &tmpValue)) {
              return false;
            }
            mdVersion.majorVersion_ = tmpValue;
            if (!getuint(metaData, &pos, &tmpValue)) {
              return false;
            }
            mdVersion.minorVersion_ = tmpValue;
            if (!getuint(metaData, &pos, &tmpValue)) {
              return false;
            }
            mdVersion.revision_ = tmpValue;
            // Process next ...
            continue;
          case KernelArg::GroupingHint:
            for (uint j = 0; j < 3; ++j) {
              uint temp;
              // Read the compile workgroup size hint
              if (!getuint(metaData, &pos, &temp)) {
                LogWarning("Couldn't get the compile workgroup size hint!");
                return false;
              }
              workGroupInfo_.compileSizeHint_[j] = temp;
            }
            // Process next ...
            continue;
          case KernelArg::VecTypeHint: {
            std::string temp;
            // Read the compile vector type hint
            if (!getstring(metaData, &pos, &temp)) {
              LogWarning("Couldn't get the compile vector type hint!");
              return false;
            }
            workGroupInfo_.compileVecTypeHint_ = temp;
          }
            // Process next ...
            continue;
          case KernelArg::WavesPerSimdHint: {
            uint tmp;
            if (!getuint(metaData, &pos, &tmp)) {
              return false;
            }
            workGroupInfo_.wavesPerSimdHint_ = tmp;
          }
            continue;
          default:
            break;
        }

        std::string argName;
        // Save the argument type
        arg.type_ = ArgState[i].type_;

        // Check if we should expect the name
        if (ArgState[i].name_) {
          // Read the parameter's name
          if (!getword(metaData, &pos, argName)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          arg.name_ = argName;
        }

        // A sampler tag carries its index, location and CB position
        if (arg.type_ == KernelArg::Sampler) {
          if (!getuint(metaData, &pos, &arg.index_)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          if (!getuint(metaData, &pos, &arg.location_)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          if (!getuint(metaData, &pos, &arg.cbPos_)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
        }

        // Check if we should expect the resource data type
        if (ArgState[i].resType_) {
          uint k;
          // Search for the data type
          for (k = 0; k < DataTypeTotal; k++) {
            if (expect(metaData, &pos, DataType[k].tagName_)) {
              arg.dataType_ = DataType[k].type_;
              if (arg.type_ == KernelArg::Image) {
                flags_ |= ImageEnable;
                if (expect(metaData, &pos, "RO:")) {
                  arg.memory_.readOnly_ = 1;
                } else if (expect(metaData, &pos, "RW:")) {
                  arg.memory_.readWrite_ = 1;
                  flags_ |= ImageWrite;
                } else if (expect(metaData, &pos, "WO:")) {
                  arg.memory_.writeOnly_ = 1;
                  flags_ |= ImageWrite;
                }
              } else if (arg.type_ == KernelArg::Value) {
                arg.type_ = DataType[k].type_;
              }
              break;
            }
          }
          if (k == DataTypeTotal) {
            LogWarning("We couldn't find the argument's type.");
            // An unknown type is fatal for a value argument;
            // for everything else just skip the unknown word
            if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) {
              LogWarning("Couldn't get a kernel argument!");
              return false;
            }
          }
          //! @todo temporary condition
          if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) {
            assert(false);
            continue;
          }
        }

        // Check if we should expect the data size
        if (ArgState[i].size_) {
          uint tmpData;
          // Read the data size
          if (!getuint(metaData, &pos, &tmpData)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          if (arg.type_ == KernelArg::Image) {
            // For images the value is the resource index and
            // the argument type becomes the image data type
            arg.type_ = arg.dataType_;
            arg.index_ = tmpData;
          } else {
            arg.size_ = tmpData;
          }
        }

        if (arg.type_ == KernelArg::Counter) {
          // Read a counter index
          if (!getuint(metaData, &pos, &arg.index_)) {
            LogWarning("Couldn't get a counter index!");
            return false;
          }
        }

        // Check if we should expect a resource index
        if (ArgState[i].cbIdx_) {
          // Read resource index
          if (!getuint(metaData, &pos, &arg.cbIdx_)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          // Track the highest CB index used by the arguments
          if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
            numCb_ = arg.cbIdx_;
          }
        }

        // Check if we should expect the CB offset
        if (ArgState[i].cbPos_) {
          // Read position in the constant buffer
          if (!getuint(metaData, &pos, &arg.cbPos_)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
        }

        // Check if we should expect the buffer type
        if (ArgState[i].buf_) {
          // Read the buffer type
          if (!getword(metaData, &pos, argName)) {
            LogWarning("Couldn't get a kernel argument!");
            return false;
          }
          arg.buf_ = argName;
          for (uint k = 0; k < BufTypeTotal; ++k) {
            if (0 == arg.buf_.compare(BufType[k].tagName_)) {
              // Update the parameter type
              arg.type_ = BufType[k].type_;
              // Check if we should expect a buffer index
              if (BufType[k].number_) {
                // Read a buffer index
                if (!getuint(metaData, &pos, &arg.index_)) {
                  LogWarning("Couldn't get a kernel argument!");
                  return false;
                }
              }
              // Check for the required alignment
              if (BufType[k].alignment_) {
                // Read data alignment
                if (!getuint(metaData, &pos, &arg.alignment_)) {
                  LogWarning("Couldn't get a kernel argument!");
                  return false;
                }
              }
              // Check for the buffer's attribute
              if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) {
                if (expect(metaData, &pos, "RO")) {
                  arg.memory_.readOnly_ = 1;
                } else if (expect(metaData, &pos, "RW")) {
                  arg.memory_.readWrite_ = 1;
                } else if (expect(metaData, &pos, "WO")) {
                  arg.memory_.writeOnly_ = 1;
                }
              }
              // Check for the type qualifier
              if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) {
                uint tmp;
                // Skip one character before the volatile/restrict flags
                pos += 1;
                if (!getuint(metaData, &pos, &tmp)) {
                  LogWarning("Couldn't get volatile type!");
                  return false;
                }
                if (tmp == 1) {
                  arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
                }
                if (!getuint(metaData, &pos, &tmp)) {
                  LogWarning("Couldn't get restrict type!");
                  return false;
                }
                if (tmp == 1) {
                  arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
                }
              }
            }
          }
        }

        // Find multiple UAV references
        switch (arg.type_) {
          case KernelArg::PointerGlobal:
          case KernelArg::PointerConst:
          case KernelArg::PointerLocal:
          case KernelArg::PointerPrivate:
          case KernelArg::UavId:
            uavRefCount[arg.index_]++;
            break;
          default:
            break;
        }

        // Check if this argument will be passed in constant buffer
        if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
          if (arg.type_ == KernelArg::Sampler) {
            // Search for the passed by value sampler
            for (uint a = 0; a < argSize(); ++a) {
              KernelArg* value = arguments_[a];
              if (0 == value->name_.compare(arg.name_)) {
                value->type_ = arg.type_;
                value->index_ = arg.index_;
                value->location_ = 0;
                break;
              }
            }
          } else {
            KernelArg* argument = new KernelArg(arg);
            if (argument != NULL) {
              addArgument(argument);
            } else {
              LogError("Couldn't allocate memory!");
              return false;
            }
          }
        }
        // Check if we have a pre-defined sampler
        else if (arg.type_ == KernelArg::Sampler) {
          KernelArg* sampler = new KernelArg(arg);
          if (sampler != NULL) {
            addSampler(sampler);
          } else {
            LogError("Couldn't allocate memory!");
            return false;
          }
        }
        // The tag was fully processed, stop the search in the ArgState table
        break;
      }
    }
    // Next argument
    pos = metaData.find(";", pos);
  }

  // Set up the memory properties of the parsed arguments
  for (uint i = 0; i < arguments_.size(); ++i) {
    KernelArg* arg = arguments_[i];
    switch (arg->type_) {
      case KernelArg::PointerGlobal:
      case KernelArg::PointerConst:
      case KernelArg::PointerLocal:
      case KernelArg::PointerPrivate:
        // These pointers are accessed through a UAV buffer, no reallocation
        arg->memory_.realloc_ = false;
        arg->memory_.uavBuf_ = true;
        break;
      case KernelArg::PointerHwConst:
        arg->memory_.realloc_ = true;
        break;
      case KernelArg::UavId:
        uavRaw_ = arg->index_;
        break;
      default:
        break;
    }
    // If argument marked with the const qualifier, then overwrite
    // Read-Write attributes, since compiler doesn't mark it properly
    if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
      arg->memory_.readOnly_ = 1;
      arg->memory_.readWrite_ = 0;
      arg->memory_.writeOnly_ = 0;
    }
  }

  if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) {
    // Find if default UAV is already assigned to an argument
    for (uint i = 0; i < arguments_.size(); ++i) {
      KernelArg* arg = arguments_[i];
      switch (arg->type_) {
        case KernelArg::PointerGlobal:
        case KernelArg::PointerConst:
        case KernelArg::PointerLocal:
        case KernelArg::PointerPrivate:
          if (uavRaw_ == arg->index_) {
            uavRaw_ = UavIdUndefined;
          }
          break;
        default:
          break;
      }
    }
  }

  // There is always 1 constant buffer, associated with the kernel
  numCb_++;
  assert((numCb_ <= MaxConstBuffersArguments) &&
         "Runtime doesn't support more than max CBs for arguments!");

  // Limit workgroup size if requested
  if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
    size_t temp = 1;
    workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
    for (uint j = 0; j < 3; ++j) {
      if (workGroupInfo()->compileSize_[j] != 0) {
        temp *= workGroupInfo_.compileSize_[j];
      }
    }
    // Report a compilation error if requested compile size doesn't
    // match the required workgroup size
    if (workGroupInfo()->size_ < temp) {
      char str[8];
      intToStr(workGroupInfo_.size_, str, 8);
      buildError_ = CL_OUT_OF_RESOURCES;
      buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of ";
      buildLog_ += str;
      buildLog_ += " elements\n";
      LogError(buildLog().c_str());
      return false;
    }
  }

  // Read/Write attributes are provided in metadata
  if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
    rwAttributes_ = true;
  }
  return true;
}
//! Translates the compiler library argument type into the HSAIL argument type.
//!
//! \param argInfo  Argument description from the compiler library
//! \return the matching HSAIL_ARGTYPE_* value; HSAIL_ARGTYPE_ERROR for
//!         ARG_TYPE_ERROR and for any type without a case below
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
      return HSAIL_ARGTYPE_POINTER;
    case ARG_TYPE_QUEUE:
      return HSAIL_ARGTYPE_QUEUE;
    case ARG_TYPE_VALUE:
      return HSAIL_ARGTYPE_VALUE;
    case ARG_TYPE_IMAGE:
      return HSAIL_ARGTYPE_IMAGE;
    case ARG_TYPE_SAMPLER:
      return HSAIL_ARGTYPE_SAMPLER;
    case ARG_TYPE_ERROR:
    default:
      return HSAIL_ARGTYPE_ERROR;
  }
}

//! Returns the alignment of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return the alignment recorded for a pointer argument, 1 for everything else
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
  if (argInfo->type == ARG_TYPE_POINTER) {
    return argInfo->arg.pointer.align;
  }
  return 1;
}
//! Returns the access type (RO/WO/RW) of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return HSAIL_ACCESS_TYPE_NONE for non-pointer arguments; for pointers the
//!         matching access type, where anything that isn't RO or WO is
//!         reported as read-write
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
  // Only pointers carry an access type
  if (argInfo->type != ARG_TYPE_POINTER) {
    return HSAIL_ACCESS_TYPE_NONE;
  }

  switch (argInfo->arg.pointer.type) {
    case ACCESS_TYPE_RO:
      return HSAIL_ACCESS_TYPE_RO;
    case ACCESS_TYPE_WO:
      return HSAIL_ACCESS_TYPE_WO;
    case ACCESS_TYPE_RW:
    default:
      return HSAIL_ACCESS_TYPE_RW;
  }
}
//! Returns the HSAIL address space of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return HSAIL_ADDRESS_GLOBAL for global/constant/UAV/scratch-emulated
//!         pointers and for image, sampler and queue arguments;
//!         HSAIL_ADDRESS_LOCAL for LDS pointers; HSAIL_ADDRESS_ERROR otherwise
//!         (an error is logged for unsupported pointer memory types only)
//!
//! NOTE(review): PTR_MT_UAV_CONSTANT has no case here and therefore takes the
//! error path, while GetOclAddrQual() reports it as constant memory - confirm
//! this is intended.
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
  if (argInfo->type == ARG_TYPE_POINTER) {
    switch (argInfo->arg.pointer.memory) {
      case PTR_MT_CONSTANT_EMU:
      case PTR_MT_CONSTANT:
      case PTR_MT_UAV:
      case PTR_MT_GLOBAL:
      case PTR_MT_SCRATCH_EMU:
        return HSAIL_ADDRESS_GLOBAL;
      case PTR_MT_LDS_EMU:
      case PTR_MT_LDS:
        return HSAIL_ADDRESS_LOCAL;
      case PTR_MT_ERROR:
      default:
        LogError("Unsupported address type");
        return HSAIL_ADDRESS_ERROR;
    }
  }

  // Images, samplers and queues are addressed as global objects
  if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER) ||
      (argInfo->type == ARG_TYPE_QUEUE)) {
    return HSAIL_ADDRESS_GLOBAL;
  }
  return HSAIL_ADDRESS_ERROR;
}

//! Returns the HSAIL data type of a pointer or value argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return the matching HSAIL_DATATYPE_* value; HSAIL_DATATYPE_ERROR if the
//!         argument is neither a pointer nor a value, or the data type is unknown
//!
//! \note f16 is reported as f32 - workaround due to the compiler library
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
  // Pick the data type field that matches the argument kind
  aclArgDataType elementType;
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
      elementType = argInfo->arg.pointer.data;
      break;
    case ARG_TYPE_VALUE:
      elementType = argInfo->arg.value.data;
      break;
    default:
      return HSAIL_DATATYPE_ERROR;
  }

  switch (elementType) {
    case DATATYPE_i1:
      return HSAIL_DATATYPE_B1;
    case DATATYPE_i8:
      return HSAIL_DATATYPE_S8;
    case DATATYPE_i16:
      return HSAIL_DATATYPE_S16;
    case DATATYPE_i32:
      return HSAIL_DATATYPE_S32;
    case DATATYPE_i64:
      return HSAIL_DATATYPE_S64;
    case DATATYPE_u8:
      return HSAIL_DATATYPE_U8;
    case DATATYPE_u16:
      return HSAIL_DATATYPE_U16;
    case DATATYPE_u32:
      return HSAIL_DATATYPE_U32;
    case DATATYPE_u64:
      return HSAIL_DATATYPE_U64;
    case DATATYPE_f16:  // see the note above
    case DATATYPE_f32:
      return HSAIL_DATATYPE_F32;
    case DATATYPE_f64:
      return HSAIL_DATATYPE_F64;
    case DATATYPE_struct:
      return HSAIL_DATATYPE_STRUCT;
    case DATATYPE_opaque:
      return HSAIL_DATATYPE_OPAQUE;
    case DATATYPE_ERROR:
    default:
      return HSAIL_DATATYPE_ERROR;
  }
}
//! Returns the size in bytes of a single element of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return sizeof(void*) for pointer, image, sampler and queue arguments;
//!         for values the size of the HSAIL data type (for structures the
//!         recorded number of elements); -1 for unknown argument or data types
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
    case ARG_TYPE_IMAGE:
    case ARG_TYPE_SAMPLER:
    case ARG_TYPE_QUEUE:
      return sizeof(void*);
    case ARG_TYPE_VALUE:
      // Handled below
      break;
    default:
      return -1;
  }

  // Value argument: the size is defined by its data type
  switch (GetHSAILDataType(argInfo)) {
    case HSAIL_DATATYPE_B1:
    case HSAIL_DATATYPE_B8:
    case HSAIL_DATATYPE_S8:
    case HSAIL_DATATYPE_U8:
      return 1;
    case HSAIL_DATATYPE_B16:
    case HSAIL_DATATYPE_U16:
    case HSAIL_DATATYPE_S16:
    case HSAIL_DATATYPE_F16:
      return 2;
    case HSAIL_DATATYPE_B32:
    case HSAIL_DATATYPE_U32:
    case HSAIL_DATATYPE_S32:
    case HSAIL_DATATYPE_F32:
      return 4;
    case HSAIL_DATATYPE_B64:
    case HSAIL_DATATYPE_U64:
    case HSAIL_DATATYPE_S64:
    case HSAIL_DATATYPE_F64:
      return 8;
    case HSAIL_DATATYPE_STRUCT:
      return argInfo->arg.value.numElements;
    default:
      return -1;
  }
}
//! Returns the OpenCL value type of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return T_QUEUE / T_POINTER / T_SAMPLER for queue, pointer or image, and
//!         sampler arguments; for values the scalar or vector type selected by
//!         the element data type and the number of elements (1, 2, 3, 4, 8 or
//!         16); T_VOID for anything that can't be mapped
//!
//! \note Signed and unsigned integers share the same entry and f16 is
//!       reported with the float types
inline static clk_value_type_t GetOclType(const aclArgData* argInfo) {
  // Rows: element type (char, short, int, long, float, double)
  // Columns: vector width (1, 2, 3, 4, 8, 16)
  static const clk_value_type_t ClkValueMapType[6][6] = {
      {T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
      {T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
      {T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
      {T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
      {T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
      {T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
  };

  if (argInfo->type == ARG_TYPE_QUEUE) {
    return T_QUEUE;
  }
  if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
    return T_POINTER;
  }
  if (argInfo->type == ARG_TYPE_SAMPLER) {
    return T_SAMPLER;
  }
  if (argInfo->type != ARG_TYPE_VALUE) {
    return T_VOID;
  }

  // Select the table row from the element data type
  uint sizeType;
  switch (argInfo->arg.value.data) {
    case DATATYPE_i8:
    case DATATYPE_u8:
      sizeType = 0;
      break;
    case DATATYPE_i16:
    case DATATYPE_u16:
      sizeType = 1;
      break;
    case DATATYPE_i32:
    case DATATYPE_u32:
      sizeType = 2;
      break;
    case DATATYPE_i64:
    case DATATYPE_u64:
      sizeType = 3;
      break;
    case DATATYPE_f16:
    case DATATYPE_f32:
      sizeType = 4;
      break;
    case DATATYPE_f64:
      sizeType = 5;
      break;
    default:
      return T_VOID;
  }

  // Select the table column from the vector width
  uint widthIdx;
  switch (argInfo->arg.value.numElements) {
    case 1:
      widthIdx = 0;
      break;
    case 2:
      widthIdx = 1;
      break;
    case 3:
      widthIdx = 2;
      break;
    case 4:
      widthIdx = 3;
      break;
    case 8:
      widthIdx = 4;
      break;
    case 16:
      widthIdx = 5;
      break;
    default:
      return T_VOID;
  }
  return ClkValueMapType[sizeType][widthIdx];
}

//! Returns the OpenCL address qualifier of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return global/constant/local for the matching pointer memory types,
//!         global for images and private for everything else (including
//!         pointers with a memory type that has no case below)
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) {
  if (argInfo->type == ARG_TYPE_POINTER) {
    switch (argInfo->arg.pointer.memory) {
      case PTR_MT_UAV:
      case PTR_MT_GLOBAL:
        return CL_KERNEL_ARG_ADDRESS_GLOBAL;
      case PTR_MT_CONSTANT:
      case PTR_MT_UAV_CONSTANT:
      case PTR_MT_CONSTANT_EMU:
        return CL_KERNEL_ARG_ADDRESS_CONSTANT;
      case PTR_MT_LDS_EMU:
      case PTR_MT_LDS:
        return CL_KERNEL_ARG_ADDRESS_LOCAL;
      default:
        return CL_KERNEL_ARG_ADDRESS_PRIVATE;
    }
  } else if (argInfo->type == ARG_TYPE_IMAGE) {
    return CL_KERNEL_ARG_ADDRESS_GLOBAL;
  }
  // default for all other cases
  return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
//! Returns the OpenCL access qualifier of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return read-only/write-only/read-write for image arguments with the
//!         matching access type; CL_KERNEL_ARG_ACCESS_NONE for an image with
//!         any other access type and for all non-image arguments
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) {
  if (argInfo->type == ARG_TYPE_IMAGE) {
    switch (argInfo->arg.image.type) {
      case ACCESS_TYPE_RO:
        return CL_KERNEL_ARG_ACCESS_READ_ONLY;
      case ACCESS_TYPE_WO:
        return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
      case ACCESS_TYPE_RW:
        return CL_KERNEL_ARG_ACCESS_READ_WRITE;
      default:
        return CL_KERNEL_ARG_ACCESS_NONE;
    }
  }
  return CL_KERNEL_ARG_ACCESS_NONE;
}

//! Returns the OpenCL type qualifiers of a kernel argument.
//!
//! \param argInfo  Argument description from the compiler library
//! \return a combination of the volatile/restrict/pipe/const qualifier bits
//!         for pointer arguments; CL_KERNEL_ARG_TYPE_NONE for all other
//!         arguments. A pointer is reported as const if it's flagged as const
//!         or if it points to one of the constant memory types.
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
  cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
  if (argInfo->type == ARG_TYPE_POINTER) {
    if (argInfo->arg.pointer.isVolatile) {
      rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
    }
    if (argInfo->arg.pointer.isRestrict) {
      rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
    }
    if (argInfo->arg.pointer.isPipe) {
      rv |= CL_KERNEL_ARG_TYPE_PIPE;
    }
    if (argInfo->isConst) {
      rv |= CL_KERNEL_ARG_TYPE_CONST;
    }
    // Pointers to constant memory are always const
    switch (argInfo->arg.pointer.memory) {
      case PTR_MT_CONSTANT:
      case PTR_MT_UAV_CONSTANT:
      case PTR_MT_CONSTANT_EMU:
        rv |= CL_KERNEL_ARG_TYPE_CONST;
        break;
      default:
        break;
    }
  }
  return rv;
}

//! Returns the size in bytes of a kernel argument as seen by the OpenCL API.
//!
//! \param argInfo  Argument description from the compiler library
//! \return the size of the host side handle for pointer, image, sampler and
//!         queue arguments; for values the component size multiplied by the
//!         element count rounded up with amd::nextPowerOfTwo() (structures
//!         report the recorded number of elements as is); -1 for unknown
//!         argument or data types
static int GetOclSize(const aclArgData* argInfo) {
  switch (argInfo->type) {
    case ARG_TYPE_POINTER:
      return sizeof(void*);
    case ARG_TYPE_IMAGE:
      return sizeof(cl_mem);
    case ARG_TYPE_SAMPLER:
      return sizeof(cl_sampler);
    case ARG_TYPE_QUEUE:
      return sizeof(cl_command_queue);
    case ARG_TYPE_VALUE:
      // Handled below
      break;
    default:
      return -1;
  }

  // Size of a single vector component in bytes
  int componentBytes;
  switch (argInfo->arg.value.data) {
    case DATATYPE_struct:
      return 1 * argInfo->arg.value.numElements;
    case DATATYPE_i8:
    case DATATYPE_u8:
      componentBytes = 1;
      break;
    case DATATYPE_u16:
    case DATATYPE_i16:
    case DATATYPE_f16:
      componentBytes = 2;
      break;
    case DATATYPE_u32:
    case DATATYPE_i32:
    case DATATYPE_f32:
      componentBytes = 4;
      break;
    case DATATYPE_i64:
    case DATATYPE_u64:
    case DATATYPE_f64:
      componentBytes = 8;
      break;
    case DATATYPE_ERROR:
    default:
      return -1;
  }

  //! \note OCL 6.1.5. For 3-component vector data types,
  //! the size of the data type is 4 * sizeof(component).
  return componentBytes * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
}
void HSAILKernel::initArgList(const aclArgData* aclArg) {
// Initialize the hsail argument list too
initHsailArgs(aclArg);
// Iterate through the arguments and insert into parameterList
device::Kernel::parameters_t params;
amd::KernelParameterDescriptor desc;
size_t offset = 0;
// Reserved arguments for HSAIL launch
aclArg += MaxExtraArgumentsNum;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(aclArg);
desc.addressQualifier_ = GetOclAddrQual(aclArg);
desc.accessQualifier_ = GetOclAccessQual(aclArg);
desc.typeQualifier_ = GetOclTypeQual(aclArg);
desc.typeName_ = arguments_[i]->typeName_.c_str();
// Make a check if it is local or global
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
desc.size_ = sizeof(cl_mem);
} else {
desc.size_ = GetOclSize(aclArg);
}
// Make offset alignment to match CPU metadata, since
// in multidevice config abstraction layer has a single signature
// and CPU sends the paramaters as they are allocated in memory
size_t size = desc.size_;
offset = amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
flags_.imageEna_ = true;
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
flags_.imageWriteEna_ = true;
}
}
}
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
}
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
int offset = 0;
// Reserved arguments for HSAIL launch
aclArg += MaxExtraArgumentsNum;
// Iterate through the each kernel argument
for (; aclArg->struct_size != 0; aclArg++) {
Argument* arg = new Argument;
// Initialize HSAIL kernel argument
arg->name_ = aclArg->argStr;
arg->typeName_ = aclArg->typeStr;
arg->size_ = GetHSAILArgSize(aclArg);
arg->offset_ = offset;
arg->type_ = GetHSAILArgType(aclArg);
arg->addrQual_ = GetHSAILAddrQual(aclArg);
arg->dataType_ = GetHSAILDataType(aclArg);
// If vector of args we add additional arguments to flatten it out
arg->numElem_ =
((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct))
? aclArg->arg.value.numElements
: 1;
arg->alignment_ = GetHSAILArgAlignment(aclArg);
arg->access_ = GetHSAILArgAccessType(aclArg);
offset += GetHSAILArgSize(aclArg);
arguments_.push_back(arg);
}
}
//! Constructs an HSAIL kernel object without any code attached to it.
//!
//! \param name            Kernel name
//! \param prog            Program the kernel belongs to; its device becomes the kernel's device
//! \param compileOptions  Compile options, stored for later use
//! \param extraArgsNum    Number of extra arguments, stored for later use
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
                         uint extraArgsNum)
    : device::Kernel(prog->dev(), name, *prog)
    , compileOptions_(compileOptions)
    , index_(0)
    , code_(nullptr)
    , codeSize_(0)
    , hwMetaData_(nullptr)
    , extraArgumentsNum_(extraArgsNum) {
  // Mark the kernel as an HSA kernel
  flags_.hsa_ = true;
}
//! Destroys the kernel: releases all argument objects (last one first),
//! the HW metadata array and the code object
HSAILKernel::~HSAILKernel() {
  while (!arguments_.empty()) {
    delete arguments_.back();
    arguments_.pop_back();
  }

  delete[] hwMetaData_;
  delete code_;
}
//! Initializes the kernel from the program binary.
//!
//! Optionally finalizes (compiles to ISA) the kernel, creates the HW info
//! and then pulls the kernel metadata out of the ELF: argument list,
//! workgroup sizes and hints, printf info, device enqueue flag, kernel index
//! and the vector type hint.
//!
//! \param sym       Loader symbol of the kernel, passed on to aqlCreateHWInfo()
//! \param finalize  If true, the kernel is compiled down to ISA first
//! \return true on success, false if the number of extra arguments isn't
//!         supported, the finalization fails or any metadata query fails
//!
//! \note All temporary metadata buffers are owned by std::unique_ptr, so
//!       they are released on every exit path, including the early error returns.
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
  if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
    LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
    return false;
  }
  acl_error error = ACL_SUCCESS;
  std::string openClKernelName = openclMangledName(name());
  flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos);

  // compile kernel down to ISA
  if (finalize) {
    std::string options(compileOptions_.c_str());
    options.append(" -just-kernel=");
    options.append(openClKernelName.c_str());
    // Append an option so that we can selectively enable a SCOption on CZ
    // whenever IOMMUv2 is enabled.
    if (dev().settings().svmFineGrainSystem_) {
      options.append(" -sc-xnack-iommu");
    }
    error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
                       ACL_TYPE_ISA, NULL);
    buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
    if (error != ACL_SUCCESS) {
      LogError("Failed to finalize kernel");
      return false;
    }
  }

  aqlCreateHWInfo(sym);

  // Pull out metadata from the ELF: query the size first, then the data
  size_t sizeOfArgList;
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), NULL, &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
  }
  // The argument list is consumed at the very end of this method
  std::unique_ptr<char[]> aclArgList(new char[sizeOfArgList]);
  if (!aclArgList) {
    // Only reachable if operator new is replaced with a non-throwing variant
    return false;
  }
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
                       openClKernelName.c_str(), aclArgList.get(), &sizeOfArgList);
  if (error != ACL_SUCCESS) {
    return false;
  }

  size_t sizeOfWorkGroupSize;
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
                       openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Copy wavefront size
  workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : dev().getAttribs().wavefrontSize;
  // Find total workgroup size
  if (workGroupInfo_.compileSize_[0] != 0) {
    workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
        workGroupInfo_.compileSize_[2];
  } else {
    workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
  }

  // Pull out printf metadata from the ELF
  size_t sizeOfPrintfList;
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                       openClKernelName.c_str(), NULL, &sizeOfPrintfList);
  if (error != ACL_SUCCESS) {
    return false;
  }
  // Make sure kernel has any printf info
  if (0 != sizeOfPrintfList) {
    std::unique_ptr<char[]> aclPrintfList(new char[sizeOfPrintfList]);
    if (!aclPrintfList) {
      return false;
    }
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
                         openClKernelName.c_str(), aclPrintfList.get(), &sizeOfPrintfList);
    if (error != ACL_SUCCESS) {
      return false;
    }
    // Set the PrintfList
    InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList.get()));
  }

  // Device enqueue (dynamic parallelism) flag
  aclMetadata md;
  md.enqueue_kernel = false;
  size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
                       openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
  if (error != ACL_SUCCESS) {
    return false;
  }
  flags_.dynamicParallelism_ = md.enqueue_kernel;

  // Kernel index
  md.kernel_index = -1;
  size_t sizeOfIndex = sizeof(md.kernel_index);
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX,
                       openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
  if (error != ACL_SUCCESS) {
    return false;
  }
  index_ = md.kernel_index;

  // Waves per SIMD hint
  size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
                       openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
                       &sizeOfWavesPerSimdHint);
  if (error != ACL_SUCCESS) {
    return false;
  }
  waveLimiter_.enable(dev().settings().ciPlus_);

  // Workgroup size hint
  size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
                       openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
                       &sizeOfWorkGroupSizeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }

  // Vector type hint
  size_t sizeOfVecTypeHint;
  error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                       openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
  if (error != ACL_SUCCESS) {
    return false;
  }
  if (0 != sizeOfVecTypeHint) {
    // One extra byte for the terminating zero added below
    std::unique_ptr<char[]> vecTypeHint(new char[sizeOfVecTypeHint + 1]);
    if (!vecTypeHint) {
      return false;
    }
    error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
                         openClKernelName.c_str(), vecTypeHint.get(), &sizeOfVecTypeHint);
    if (error != ACL_SUCCESS) {
      return false;
    }
    vecTypeHint[sizeOfVecTypeHint] = '\0';
    workGroupInfo_.compileVecTypeHint_ = std::string(vecTypeHint.get());
  }

  // Set the argList
  initArgList(reinterpret_cast<const aclArgData*>(aclArgList.get()));
  return true;
}
//! Returns the device object of the kernel (the dev_ member) as a gpu Device
const Device& HSAILKernel::dev() const {
  const Device& gpuDevice = reinterpret_cast<const Device&>(dev_);
  return gpuDevice;
}
//! Returns the program object of the kernel (the prog_ member) as an HSAIL program
const HSAILProgram& HSAILKernel::prog() const {
  const HSAILProgram& hsailProgram = reinterpret_cast<const HSAILProgram&>(prog_);
  return hsailProgram;
}
inline static void WriteAqlArg(
unsigned char** dst, //!< The write pointer to the buffer
const void* src, //!< The source pointer
uint size, //!< The size in bytes to copy
uint alignment = 0 //!< The alignment to follow while writing to the buffer
) {
if (alignment == 0) {
*dst = amd::alignUp(*dst, size);
} else {
*dst = amd::alignUp(*dst, alignment);
}
memcpy(*dst, src, size);
*dst += size;
}
//! AQL packet header value for kernel dispatch packets.
//! Combines, each shifted to its header field position:
//!  - the kernel dispatch packet type,
//!  - the barrier bit set to 1,
//!  - system scope for the acquire fence,
//!  - agent scope for the release fence.
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap,
std::vector<const Memory*>& memList) const {
static const bool WaitOnBusyEngine = true;
uint64_t ldsAddress = ldsSize();
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
if (extraArgumentsNum_ > 0) {
assert(MaxExtraArgumentsNum >= 6 &&
"MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
size_t extraArgs[MaxExtraArgumentsNum] = {0, 0, 0, 0, 0, 0};
// The HLC generates up to 3 additional arguments for the global offsets
for (uint i = 0; i < sizes.dimensions(); ++i) {
extraArgs[i] = sizes.offset()[i];
}
// Check if the kernel may have printf output
if ((printfInfo().size() > 0) &&
// and printf buffer was allocated
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
// and set the fourth argument as the printf_buffer pointer
extraArgs[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
}
if (dynamicParallelism()) {
// Provide the host parent AQL wrap object to the kernel
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
memset(wrap, 0, sizeof(AmdAqlWrap));
wrap->state = AQL_WRAP_BUSY;
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(sizeof(AmdAqlWrap));
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
// and set 5th & 6th arguments
extraArgs[4] = vmDefQueue;
extraArgs[5] = *vmParentWrap;
memList.push_back(cb);
}
WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t) * extraArgumentsNum_, sizeof(size_t));
}
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
amd::Memory* const* memories =
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
// Find all parameters for the current kernel
for (uint i = 0; i != signature.numParameters(); ++i) {
const HSAILKernel::Argument* arg = argument(i);
const amd::KernelParameterDescriptor& desc = signature.at(i);
const_address paramaddr = parameters + desc.offset_;
switch (arg->type_) {
case HSAIL_ARGTYPE_POINTER:
// If it is a global pointer
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
Memory* gpuMem = NULL;
amd::Memory* mem = NULL;
uint32_t index = signature.at(i).info_.arrayIndex_;
if (nativeMem) {
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
if (nullptr != gpuMem) {
mem = gpuMem->owner();
2014-07-04 16:17:05 -04:00
}
} else {
mem = memories[index];
if (mem != nullptr) {
gpuMem = dev().getGpuMemory(mem);
2014-07-04 16:17:05 -04:00
}
}
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
if (gpuMem == nullptr) {
2014-07-04 16:17:05 -04:00
break;
}
2014-07-04 16:17:05 -04:00
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
gpuMem->wait(gpu, WaitOnBusyEngine);
2014-07-04 16:17:05 -04:00
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
// save the memory object pointer to allow global memory access
if (NULL != dev().hwDebugMgr()) {
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
}
2014-07-04 16:17:05 -04:00
}
// If it is a local pointer
else {
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type");
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
WriteAqlArg(&aqlArgBuf, &ldsAddress, desc.size_);
if (desc.size_ == 8) {
ldsAddress += *reinterpret_cast<const uint64_t*>(paramaddr);
} else {
ldsAddress += *reinterpret_cast<const uint32_t*>(paramaddr);
}
2014-07-04 16:17:05 -04:00
}
break;
case HSAIL_ARGTYPE_VALUE:
// Special case for structures
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
// Copy the current structure into CB1
memcpy(aqlStruct, paramaddr, arg->size_);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(arg->size_);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
memList.push_back(cb);
} else {
WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_);
2014-07-04 16:17:05 -04:00
}
break;
case HSAIL_ARGTYPE_IMAGE: {
Image* image = nullptr;
amd::Memory* mem = nullptr;
uint32_t index = signature.at(i).info_.arrayIndex_;
if (nativeMem) {
image = reinterpret_cast<Image* const*>(memories)[index];
if (nullptr != image) {
mem = image->owner();
}
} else {
mem = memories[index];
if (mem == NULL) {
LogError("The kernel image argument isn't an image object!");
return nullptr;
}
image = static_cast<Image*>(dev().getGpuMemory(mem));
}
// Wait for resource if it was used on an inactive engine
//! \note syncCache may call DRM transfer
image->wait(gpu, WaitOnBusyEngine);
//! \note Special case for the image views.
//! Copy SRD to CB1, so blit manager will be able to release
//! this view without a wait for SRD resource.
if (image->memoryType() == Resource::ImageView) {
// Copy the current structure into CB1
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
ConstBuffer* cb = gpu.constBufs_[1];
cb->uploadDataToHw(HsaImageObjectSize);
// Then use a pointer in aqlArgBuffer to CB1
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
memList.push_back(cb);
} else {
uint64_t srd = image->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
}
//! @todo Compiler has to return read/write attributes
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
mem->signalWrite(&dev());
2014-07-04 16:17:05 -04:00
}
memList.push_back(image);
break;
}
case HSAIL_ARGTYPE_SAMPLER: {
uint32_t index = signature.at(i).info_.arrayIndex_;
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
kernelParams.samplerObjOffset())[index];
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
uint64_t srd = gpuSampler->hwSrd();
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
srdResource = true;
break;
}
case HSAIL_ARGTYPE_QUEUE: {
uint32_t index = signature.at(i).info_.arrayIndex_;
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
parameters + kernelParams.queueObjOffset())[index];
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
uint64_t vmQueue;
if (dev().settings().useDeviceQueue_) {
vmQueue = gpuQueue->vQueue()->vmAddress();
} else {
if (!gpu.createVirtualQueue(queue->size())) {
LogError("Virtual queue creation failed!");
return nullptr;
}
vmQueue = gpu.vQueue()->vmAddress();
}
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
break;
}
default:
LogError(" Unsupported address type ");
2014-07-04 16:17:05 -04:00
return NULL;
}
}
2014-07-04 16:17:05 -04:00
if (ldsAddress > dev().info().localMemSize_) {
LogError("No local memory available\n");
return NULL;
}
2014-07-04 16:17:05 -04:00
// HSAIL kernarg segment size is rounded up to multiple of 16.
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
"Size and the number of arguments don't match!");
hsa_kernel_dispatch_packet_t* hsaDisp =
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlArgBuf);
2014-07-04 16:17:05 -04:00
amd::NDRange local(sizes.local());
const amd::NDRange& global = sizes.global();
2014-07-04 16:17:05 -04:00
// Check if runtime has to find local workgroup size
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
2014-07-04 16:17:05 -04:00
hsaDisp->header = kDispatchPacketHeader;
hsaDisp->setup = sizes.dimensions();
2014-07-04 16:17:05 -04:00
hsaDisp->workgroup_size_x = local[0];
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
hsaDisp->grid_size_x = global[0];
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
hsaDisp->reserved2 = 0;
2014-07-04 16:17:05 -04:00
// Initialize kernel ISA and execution buffer requirements
hsaDisp->private_segment_size = spillSegSize();
hsaDisp->group_segment_size = ldsAddress;
hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
ConstBuffer* cb = gpu.constBufs_[0];
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
hsaDisp->reserved2 = 0;
hsaDisp->completion_signal.handle = 0;
2014-07-04 16:17:05 -04:00
memList.push_back(cb);
memList.push_back(gpuAqlCode());
for (gpu::Memory* mem : prog().globalStores()) {
memList.push_back(mem);
}
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
memList.push_back(gpu.hsaQueueMem());
}
if (srdResource || prog().isStaticSampler()) {
dev().srds().fillResourceList(memList);
}
2014-07-04 16:17:05 -04:00
return hsaDisp;
2014-07-04 16:17:05 -04:00
}
} // namespace gpu