d71cf5ef8b
SWDEV-79445 - OCL generic changes and code clean-up - Don't add static LDS usage, since the value from AQL packet will already accumulate that. Also there is a plan to remove the static LDS field from the code object. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#334 edit
3505 строки
112 KiB
C++
3505 строки
112 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpublit.hpp"
|
|
#include "device/gpu/gpuconstbuf.hpp"
|
|
#include "device/gpu/gpusched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "shader/ComputeProgramObject.h"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
#include "SCShadersR678XXCommon.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
#include <algorithm>
|
|
|
|
namespace gpu {
|
|
|
|
const MetaDataConst ArgState[ArgStateTotal] = {
|
|
// Note: the order is important
|
|
// Name Type Properties
|
|
// Kernel description (special properties)
|
|
{"memory:compilerwrite", KernelArg::PrivateFixed, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"uniqueid:", KernelArg::NoType, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:private:", KernelArg::PrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:local:", KernelArg::LocalSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:hwprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:uavprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:hwlocal:", KernelArg::HwLocalSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:64bitABI", KernelArg::ABI64Bit, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"limitgroupsize", KernelArg::Wavefront, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"function:", KernelArg::NoType, {1, 1, 0, 0, 0, 0, 0}},
|
|
{"intrinsic:", KernelArg::NoType, {1, 0, 0, 0, 0, 0, 0}},
|
|
{"error:", KernelArg::ErrorMessage, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"warning:", KernelArg::WarningMessage, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"printf_fmt:", KernelArg::PrintfFormatStr, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"version:", KernelArg::MetadataVersion, {0, 0, 0, 0, 0, 0, 0}},
|
|
// Kernel basic types
|
|
{"pointer:", KernelArg::PointerGlobal, {1, 1, 1, 1, 1, 1, 0}},
|
|
{"value:", KernelArg::Value, {1, 1, 1, 1, 1, 0, 0}},
|
|
{"image:", KernelArg::Image, {1, 1, 1, 1, 1, 0, 0}},
|
|
{"sampler:", KernelArg::Sampler, {0, 1, 0, 0, 0, 0, 0}},
|
|
{"counter:", KernelArg::Counter, {1, 1, 0, 1, 1, 0, 0}},
|
|
{"cws:", KernelArg::Grouping, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"lws:", KernelArg::WrkgrpSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"uavid:", KernelArg::UavId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"reflection:", KernelArg::Reflection, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"constarg:", KernelArg::ConstArg, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"cbid:", KernelArg::ConstBufId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"printfid:", KernelArg::PrintfBufId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"wsh:", KernelArg::GroupingHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"vth:", KernelArg::VecTypeHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"WavesPerSimdHint:", KernelArg::WavesPerSimdHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
};
|
|
|
|
const DataTypeConst DataType[] = {
|
|
{
|
|
"i8:", KernelArg::Char,
|
|
},
|
|
{
|
|
"i16:", KernelArg::Short,
|
|
},
|
|
{
|
|
"i32:", KernelArg::Int,
|
|
},
|
|
{
|
|
"i64:", KernelArg::Long,
|
|
},
|
|
{
|
|
"u8:", KernelArg::UChar,
|
|
},
|
|
{
|
|
"u16:", KernelArg::UShort,
|
|
},
|
|
{
|
|
"u32:", KernelArg::UInt,
|
|
},
|
|
{
|
|
"u64:", KernelArg::ULong,
|
|
},
|
|
{
|
|
"float:", KernelArg::Float,
|
|
},
|
|
{
|
|
"double:", KernelArg::Double,
|
|
},
|
|
{
|
|
"struct:", KernelArg::Struct,
|
|
},
|
|
{
|
|
"union:", KernelArg::Union,
|
|
},
|
|
{
|
|
"1D:", KernelArg::Image1D,
|
|
},
|
|
{
|
|
"2D:", KernelArg::Image2D,
|
|
},
|
|
{
|
|
"3D:", KernelArg::Image3D,
|
|
},
|
|
{
|
|
"1DB:", KernelArg::Image1DB,
|
|
},
|
|
{
|
|
"1DA:", KernelArg::Image1DA,
|
|
},
|
|
{
|
|
"2DA:", KernelArg::Image2DA,
|
|
},
|
|
{
|
|
"opaque:", KernelArg::Opaque,
|
|
},
|
|
{
|
|
"event:", KernelArg::Event,
|
|
},
|
|
{
|
|
"sampler:", KernelArg::Sampler,
|
|
},
|
|
{
|
|
"half:", KernelArg::Half,
|
|
},
|
|
};
|
|
|
|
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
|
|
|
|
struct BufDataConst {
|
|
const char* tagName_; //!< buffer's name
|
|
KernelArg::ArgumentType type_; //!< type of argument
|
|
struct {
|
|
uint number_ : 1; //!< buffer's number
|
|
uint alignment_ : 1; //!< buffer's alignment
|
|
uint attribute_ : 1; //!< buffer's read/write attribute
|
|
uint reserved : 29; //!< reserved
|
|
};
|
|
};
|
|
|
|
static const BufDataConst BufType[] = {{"g", KernelArg::PointerGlobal, {1, 0, 0, 0}},
|
|
{"p", KernelArg::PointerPrivate, {1, 1, 1, 0}},
|
|
{"l", KernelArg::PointerLocal, {1, 1, 1, 0}},
|
|
{"uav", KernelArg::PointerGlobal, {1, 1, 1, 0}},
|
|
{"c", KernelArg::PointerConst, {1, 1, 1, 0}},
|
|
{"hl", KernelArg::PointerHwLocal, {1, 1, 1, 0}},
|
|
{"hp", KernelArg::PointerHwPrivate, {1, 1, 1, 0}},
|
|
{"hc", KernelArg::PointerHwConst, {1, 1, 1, 0}}};
|
|
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
|
|
|
|
//! The mathlib constants for each kernel execution
|
|
static const float MathLibConst[4] = {0.0f, 0.5f, 1.0f, 2.0f};
|
|
|
|
bool expect(const std::string& str, size_t* pos, const std::string& sym) {
|
|
bool result = true;
|
|
uint i;
|
|
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
// Check if we have expected symbols
|
|
for (i = 0; i < sym.size(); ++i) {
|
|
char deb = str[*pos + i];
|
|
if (deb != sym[i]) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) *pos += i;
|
|
|
|
return result;
|
|
}
|
|
|
|
bool getword(const std::string& str, size_t* pos, std::string& sym) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
if (count != 0) {
|
|
sym = str.substr(*pos, count);
|
|
}
|
|
sym[count] = 0;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getstring(const std::string& str, size_t* pos, std::string* out) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(":\n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
char* sym = new char[count + 1];
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*out = sym;
|
|
delete[] sym;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuint(const std::string& str, size_t* pos, uint* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
*val = (*val * 10) + (sym[i] - 0x30);
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuintHex(const std::string& str, size_t* pos, uint* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
void intToStr(size_t value, char* str, size_t size) {
|
|
static const uint MaxDigits32bit = 10;
|
|
char result[MaxDigits32bit];
|
|
uint idx = MaxDigits32bit;
|
|
|
|
do {
|
|
idx--;
|
|
result[idx] = static_cast<char>((value % 10) + '0');
|
|
value /= 10;
|
|
} while ((value != 0) && (idx > 0));
|
|
size_t len = MaxDigits32bit - idx;
|
|
size_t n = std::min<size_t>(len, size - 1);
|
|
memcpy(str, &result[idx], n);
|
|
str[n] = '\0';
|
|
}
|
|
|
|
//! Default destructor
|
|
CalImageReference::~CalImageReference() {
|
|
// Free CAL image
|
|
free(image_);
|
|
}
|
|
|
|
KernelArg::KernelArg()
|
|
: type_(KernelArg::NoType),
|
|
size_(0),
|
|
cbIdx_(0),
|
|
cbPos_(0),
|
|
index_(0),
|
|
alignment_(1),
|
|
dataType_(KernelArg::NoType) {
|
|
name_ = "";
|
|
buf_ = "";
|
|
memory_.value_ = 0;
|
|
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
|
|
}
|
|
|
|
KernelArg::KernelArg(const KernelArg& data) {
|
|
// Fill the new object
|
|
*this = data;
|
|
}
|
|
|
|
KernelArg& KernelArg::operator=(const KernelArg& data) {
|
|
// Fill the fields of the current object
|
|
name_ = data.name_;
|
|
typeName_ = data.typeName_;
|
|
typeQualifier_ = data.typeQualifier_;
|
|
type_ = data.type_;
|
|
size_ = data.size_;
|
|
cbIdx_ = data.cbIdx_;
|
|
cbPos_ = data.cbPos_;
|
|
buf_ = data.buf_;
|
|
index_ = data.index_;
|
|
alignment_ = data.alignment_;
|
|
dataType_ = data.dataType_;
|
|
memory_.value_ = data.memory_.value_;
|
|
return *this;
|
|
}
|
|
|
|
bool KernelArg::isCbNeeded() const {
|
|
//! \note not a safe way
|
|
bool result = ((type_ > NoType) && (type_ < Sampler)) ? true : false;
|
|
if ((type_ == Sampler) && (location_ == 0)) {
|
|
// Sampler is defined outside the kernel
|
|
result = true;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
size_t KernelArg::size(bool gpuLayer) const {
|
|
switch (type_) {
|
|
case NoType:
|
|
return 0;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case PointerGlobal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
|
|
case Sampler:
|
|
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
|
|
case Counter:
|
|
return (gpuLayer) ? 0 : sizeof(cl_mem);
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case Float:
|
|
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
|
|
case Double:
|
|
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
|
|
case Int:
|
|
case UInt:
|
|
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
|
|
case Long:
|
|
case ULong:
|
|
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
|
|
case Struct:
|
|
case Union:
|
|
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const {
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const {
|
|
switch (type_) {
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
if (memory_.readOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
} else if (memory_.writeOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
} else if (memory_.readWrite_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
|
|
//! temporary solution for the vectors handling in compiler
|
|
size_t KernelArg::specialVector() const {
|
|
if (size_ > VectorSizeLimit) {
|
|
switch (type_) {
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short);
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
clk_value_type_t KernelArg::type() const {
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
case Counter:
|
|
return T_POINTER;
|
|
case Float:
|
|
return T_FLOAT;
|
|
case Double:
|
|
return T_DOUBLE;
|
|
case Char:
|
|
case UChar:
|
|
return T_CHAR;
|
|
case Short:
|
|
case UShort:
|
|
return T_SHORT;
|
|
case Int:
|
|
return T_INT;
|
|
case UInt:
|
|
//! \note No UINT type
|
|
return T_INT;
|
|
case Long:
|
|
return T_LONG;
|
|
case ULong:
|
|
//! \note No ULONG type
|
|
return T_LONG;
|
|
case Struct:
|
|
case Union:
|
|
//! @todo What should we report?
|
|
return T_CHAR;
|
|
case Sampler:
|
|
return T_SAMPLER;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
case NoType:
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
|
|
const NullProgram& nullprog)
|
|
: device::Kernel(gpuNullDev, name),
|
|
buildError_(CL_BUILD_PROGRAM_FAILURE),
|
|
gpuDev_(gpuNullDev),
|
|
prog_(nullprog),
|
|
calRef_(NULL),
|
|
internal_(false),
|
|
flags_(0),
|
|
cbSizes_(NULL),
|
|
numCb_(0),
|
|
rwAttributes_(false),
|
|
instructionCnt_(4) {
|
|
// UAV raw index will be detected
|
|
uavRaw_ = UavIdUndefined;
|
|
// CB index will be detected
|
|
cbId_ = UavIdUndefined;
|
|
// Printf index will be detected
|
|
printfId_ = UavIdUndefined;
|
|
}
|
|
|
|
NullKernel::~NullKernel() {
|
|
uint idx;
|
|
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
calRef_->release();
|
|
|
|
// Destroy all kernel arguments
|
|
for (idx = 0; idx < arguments_.size(); ++idx) {
|
|
delete arguments_[idx];
|
|
}
|
|
arguments_.clear();
|
|
|
|
// Destroy all sampler kernel arguments
|
|
for (idx = 0; idx < intSamplers_.size(); ++idx) {
|
|
delete intSamplers_[idx];
|
|
}
|
|
intSamplers_.clear();
|
|
}
|
|
|
|
|
|
static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) {
|
|
switch (dstComp) {
|
|
case SC_COMPONENT_X:
|
|
return 0;
|
|
case SC_COMPONENT_Y:
|
|
return 1;
|
|
case SC_COMPONENT_Z:
|
|
return 2;
|
|
case SC_COMPONENT_W:
|
|
return 3;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) {
|
|
uint count = shader->dep.NumIntrlIConstants;
|
|
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
|
|
|
|
// only suppport loop consts (int consts)
|
|
if (count) {
|
|
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
|
|
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
|
|
uint usedConsts = 0;
|
|
for (uint i = 0; i < count; ++i) {
|
|
uint currentConst;
|
|
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
|
|
if (allocatedconsts[currentConst].addr ==
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
|
|
break;
|
|
}
|
|
}
|
|
if (currentConst == usedConsts) {
|
|
usedConsts++;
|
|
assert(usedConsts <= count);
|
|
}
|
|
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
|
|
allocatedconsts[currentConst].type = AMU_ABI_INT32;
|
|
allocatedconsts[currentConst].value.int32[scComponentToArrayIndex(
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
|
|
}
|
|
encoding.litConstsCount = usedConsts;
|
|
}
|
|
}
|
|
|
|
bool NullKernel::create(const std::string& code, const std::string& metadata,
|
|
const void* binaryCode, size_t binarySize) {
|
|
std::auto_ptr<uint> uavRefCount(new uint[MaxUavArguments]);
|
|
if (NULL == uavRefCount.get()) {
|
|
return false;
|
|
}
|
|
|
|
// Set all ref counts to 0
|
|
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
|
|
|
|
// parse the metadata fields
|
|
if (!parseArguments(metadata, uavRefCount.get())) {
|
|
return false;
|
|
}
|
|
|
|
CALimage calImage;
|
|
// Save source if DEBUG build
|
|
#if DEBUG
|
|
ilSource_ = code;
|
|
#endif // DEBUG
|
|
|
|
amd::option::Options* options = nullProg().getCompilerOptions();
|
|
internal_ = options->oVariables->clInternalKernel;
|
|
|
|
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
|
|
acl_error err;
|
|
std::string arch = "amdil";
|
|
if (nullDev().settings().use64BitPtr_) {
|
|
arch += "64";
|
|
}
|
|
aclTargetInfo info = aclGetTargetInfo(arch.c_str(), nullDev().hwInfo()->targetName_, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclGetTargetInfo failed");
|
|
return false;
|
|
}
|
|
|
|
aclBinaryOptions binOpts = {0};
|
|
binOpts.struct_size = sizeof(binOpts);
|
|
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
|
|
binOpts.bitness = ELFDATA2LSB;
|
|
binOpts.alloc = &::malloc;
|
|
binOpts.dealloc = &::free;
|
|
|
|
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclBinaryInit failed");
|
|
return false;
|
|
}
|
|
|
|
if (ACL_SUCCESS !=
|
|
aclInsertSection(nullDev().amdilCompiler(), bin, code.data(), code.size(), aclSOURCE)) {
|
|
LogWarning("aclInsertSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
amd::option::Options* Opts = (amd::option::Options*)bin->options;
|
|
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (nullDev().settings().svmFineGrainSystem_) {
|
|
options->origOptionStr.append(" -sc-xnack-iommu");
|
|
}
|
|
// temporary solution to synchronize buildNo between runtime and complib
|
|
// until we move runtime inside complib
|
|
Opts->setBuildNo(options->getBuildNo());
|
|
|
|
// pass kernel name to compiler
|
|
Opts->setCurrKernelName(name().c_str());
|
|
|
|
err = aclCompile(nullDev().amdilCompiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT,
|
|
ACL_TYPE_ISA, NULL);
|
|
|
|
buildLog_ += aclGetCompilerLog(nullDev().amdilCompiler());
|
|
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclCompile failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
if (!options->oVariables->BinEXE) {
|
|
// Early exit if binary doesn't contain EXE
|
|
aclBinaryFini(bin);
|
|
return true;
|
|
}
|
|
size_t len;
|
|
const void* isa = aclExtractSection(nullDev().amdilCompiler(), bin, &len, aclTEXT, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclExtractSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
uint calImageSize;
|
|
if (!createMultiBinary(&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
|
|
LogWarning("initSrcEncoding failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
aclBinaryFini(bin);
|
|
} else if ((binaryCode != NULL) && (binarySize != 0)) {
|
|
uint size = 0;
|
|
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode)) || size > binarySize) {
|
|
buildLog_ += "Invalid binary image";
|
|
LogError("amuABIMultiBinaryGetSize failed!");
|
|
return false;
|
|
}
|
|
|
|
calImage = static_cast<CALimage>(malloc(size));
|
|
::memcpy(calImage, binaryCode, size);
|
|
} else {
|
|
LogError("Incorrect initialization parameters!");
|
|
return false;
|
|
}
|
|
|
|
calRef_ = new CalImageReference(calImage);
|
|
if (calRef_ == NULL) {
|
|
LogError("Memory allocation failure!");
|
|
// Free CAL image
|
|
free(calImage);
|
|
return false;
|
|
}
|
|
|
|
CALfuncInfo calFuncInfo;
|
|
|
|
// Get kernel compiled information
|
|
getFuncInfoFromImage(calImage, &calFuncInfo);
|
|
if (calFuncInfo.maxScratchRegsNeeded > 0) {
|
|
LogPrintfInfo(
|
|
"%s kernel has register spilling."
|
|
"Lower performance is expected.",
|
|
name().c_str());
|
|
}
|
|
|
|
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
|
|
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
|
|
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
|
|
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
|
|
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
|
|
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
|
|
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
|
|
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
|
|
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
|
|
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
|
|
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
|
|
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
|
|
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
|
|
|
|
device::Kernel::parameters_t params;
|
|
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t NullKernel::getCalBinarySize() const {
|
|
CALuint imageSize;
|
|
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
|
|
LogError("Failed to get the image size!");
|
|
return 0;
|
|
}
|
|
return static_cast<size_t>(imageSize);
|
|
}
|
|
|
|
bool NullKernel::getCalBinary(void* binary, size_t size) const {
|
|
uint calImageSize = 0;
|
|
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) || size < calImageSize) {
|
|
LogError("CAL failed to save the kernel binary!");
|
|
return false;
|
|
}
|
|
::memcpy(binary, calImage(), calImageSize);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::create(const std::string& code, const std::string& metadata, const void* binaryCode,
|
|
size_t binarySize) {
|
|
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
|
|
|
|
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
|
|
return false;
|
|
}
|
|
|
|
// initialize constant buffer sizes
|
|
if (!initConstBuffers()) {
|
|
return false;
|
|
}
|
|
|
|
// Initialize the kernel parameters
|
|
bool result = initParameters();
|
|
|
|
// Wave limiter needs to be initialized after kernel metadata is parsed
|
|
// Since it depends on it.
|
|
waveLimiter_.enable(dev().settings().ciPlus_);
|
|
|
|
if (result) {
|
|
buildError_ = CL_SUCCESS;
|
|
} else {
|
|
result = false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
|
|
const InitData* initData)
|
|
: NullKernel(name, gpuDev, prog) {
|
|
hwPrivateSize_ = 0;
|
|
if (NULL != initData) {
|
|
flags_ = initData->flags_;
|
|
hwPrivateSize_ = initData->hwPrivateSize_;
|
|
hwLocalSize_ = initData->hwLocalSize_;
|
|
}
|
|
// Workgroup info private memory size
|
|
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
|
|
// Default wavesPerSimdHint_
|
|
workGroupInfo_.wavesPerSimdHint_ = ~0U;
|
|
}
|
|
|
|
Kernel::~Kernel() {
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
|
|
{
|
|
Device::ScopedLockVgpus lock(dev());
|
|
|
|
// Release all virtual image objects on all virtual GPUs
|
|
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
|
|
dev().vgpus()[idx]->releaseKernel(calImage());
|
|
}
|
|
}
|
|
|
|
if (0 != numCb_) {
|
|
delete[] cbSizes_;
|
|
}
|
|
}
|
|
|
|
const Device& Kernel::dev() const { return reinterpret_cast<const Device&>(gpuDev_); }
|
|
|
|
const Program& Kernel::prog() const { return reinterpret_cast<const Program&>(prog_); }
|
|
|
|
bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) {
|
|
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
|
|
|
|
bool result = false;
|
|
AMUabiAddEncoding encoding;
|
|
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
|
|
|
|
size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments +
|
|
sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers +
|
|
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
|
|
char* tmpMem = new char[allocSize];
|
|
if (tmpMem == NULL) {
|
|
LogError("Error allocating memory");
|
|
return false;
|
|
}
|
|
|
|
CalcPtr(encoding.inputs, tmpMem, 0, 0);
|
|
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
|
|
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
|
|
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry),
|
|
MaxSamplers);
|
|
if (shader->dep.NumIntrlIConstants != 0) {
|
|
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask),
|
|
MaxConstBuffers);
|
|
}
|
|
AMUabiMultiBinary amuBinary;
|
|
amuABIMultiBinaryCreate(&amuBinary);
|
|
|
|
result = siCreateHwInfo(shader, encoding);
|
|
if (!result) {
|
|
delete[] tmpMem;
|
|
LogWarning("Error Creating program info");
|
|
return false;
|
|
}
|
|
|
|
addLoopConst(shader, encoding);
|
|
|
|
unsigned int outputCount = 0, condOut = 0, earlyExit = 0, globalCount = 0, persistentCount = 0;
|
|
unsigned int symbolCount = 0;
|
|
CALOutputEntry* outputs = 0;
|
|
unsigned int* globalBuffers = 0;
|
|
unsigned int* persistentBuffers = 0;
|
|
AMUabiUserSymbol* symbols = 0;
|
|
|
|
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
|
|
CALConstantBufferMask* constBuffers = encoding.constBuffers;
|
|
uint* inputResources = encoding.inputs;
|
|
CALUavEntry* uav = encoding.uav;
|
|
|
|
uint inputSamplerCount = samplerSize();
|
|
for (uint i = 0; i < inputSamplerCount; ++i) {
|
|
inputSamplers[i].resource = 0;
|
|
inputSamplers[i].sampler = sampler(i)->index_;
|
|
}
|
|
|
|
uint constBufferCount = 2;
|
|
|
|
constBuffers[0].index = 0;
|
|
constBuffers[1].index = 1;
|
|
|
|
uint inputResourceCount = 0;
|
|
|
|
uint uavCount = 0;
|
|
bool cbBound = false;
|
|
bool printfBound = false;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
constBuffers[constBufferCount++].index = arg->index_;
|
|
break;
|
|
case KernelArg::PointerGlobal:
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
if (!cbBound) {
|
|
uav[uavCount].offset = cbId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
cbBound = true;
|
|
break;
|
|
case KernelArg::PrintfBufId:
|
|
if (!printfBound) {
|
|
uav[uavCount].offset = printfId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
printfBound = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
} else {
|
|
if (UavIdUndefined != uavRaw_) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
inputSamplers[inputSamplerCount].resource = 0;
|
|
inputSamplers[inputSamplerCount].sampler = arg->index_;
|
|
inputSamplerCount++;
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
if (arg->memory_.readOnly_) {
|
|
inputResources[inputResourceCount++] = arg->index_;
|
|
} else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_2D;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
|
|
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
|
|
}
|
|
|
|
encoding.machine = nullDev().hwInfo()->machine_;
|
|
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
|
|
encoding.inputCount = inputResourceCount;
|
|
encoding.outputCount = outputCount;
|
|
encoding.outputs = outputs;
|
|
encoding.condOut = condOut;
|
|
encoding.earlyExit = earlyExit;
|
|
encoding.globalBuffersCount = globalCount;
|
|
encoding.globalBuffers = globalBuffers;
|
|
encoding.persistentBuffersCount = persistentCount;
|
|
encoding.persistentBuffers = persistentBuffers;
|
|
encoding.constBuffersCount = constBufferCount;
|
|
encoding.inputSamplerMapCount = inputSamplerCount;
|
|
encoding.symbolsCount = symbolCount;
|
|
encoding.symbols = symbols;
|
|
encoding.uavCount = uavCount;
|
|
|
|
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
|
|
|
|
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
|
|
|
|
amuABIMultiBinaryDestroy(amuBinary);
|
|
|
|
delete[] tmpMem;
|
|
delete[] encoding.progInfos;
|
|
|
|
return (success == 0) ? false : true;
|
|
}
|
|
|
|
void Kernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const {
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
|
|
// Find threads per group
|
|
size_t thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if ((flags() & ImageEnable) &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) && ((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if ((flags() & ImageWrite) || (thrPerGrp != nullDev().info().preferredWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
} else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
} else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--)
|
|
;
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
// Assuming DWORD access
|
|
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
|
|
|
|
// Check if we couldn't find optimal workload
|
|
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
|
// or size is too small for the cache line
|
|
(lclWorkSize[0] < cacheLineMatch)) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Use X dimension as high priority. Runtime will assume that
|
|
// X dimension is more important for the address calculation
|
|
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
|
lclWorkSize[0] = cacheLineMatch;
|
|
thrPerGrp /= cacheLineMatch;
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 1; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
} else {
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
} else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
} else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
} else {
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
void Kernel::setupProgramGrid(VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset,
|
|
const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize,
|
|
const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg,
|
|
const amd::NDRange& glbWorkSizeOrg) const {
|
|
// ABI is always in CB0
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* pGlobalSize =
|
|
reinterpret_cast<uint*>(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pLocalSize = reinterpret_cast<uint*>(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pNumGroups = reinterpret_cast<uint*>(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
|
|
uint* pGlobalOffset =
|
|
reinterpret_cast<uint*>(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint* pGroupOffset =
|
|
reinterpret_cast<uint*>(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint32_t* debugInfo = reinterpret_cast<uint*>(cbBuf + DebugOffset * ConstBuffer::VectorSize);
|
|
uint* pNDRangeGlobalOffset =
|
|
reinterpret_cast<uint*>(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
|
|
// Check for 64-bit metadata
|
|
uint glbABIShift = (abi64Bit()) ? 1 : 0;
|
|
|
|
VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_;
|
|
|
|
// Finds local workgroup size
|
|
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
|
|
|
|
// Initialize the execution grid block and size/offset
|
|
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
|
|
pGlobalSize[3] = static_cast<uint>(workDim);
|
|
|
|
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
|
|
pLocalSize[3] = 0;
|
|
|
|
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
|
|
pNumGroups[3] = 0;
|
|
|
|
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
|
|
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
|
|
|
|
progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1;
|
|
|
|
progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1;
|
|
|
|
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 1;
|
|
|
|
bool partialGrid = false;
|
|
|
|
// Fill the right values, based on the application request
|
|
switch (workDim) {
|
|
case 3:
|
|
pLocalSize[2] = progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
|
|
|
|
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
|
|
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
|
|
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
|
|
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
|
|
|
|
pGlobalOffset[2] = glbWorkOffset[2];
|
|
pGroupOffset[2] = groupOffset[2];
|
|
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
|
|
if (progGrid->partialGridBlock.depth != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.depth++;
|
|
pNumGroups[2]++;
|
|
} else {
|
|
progGrid->partialGridBlock.depth = lclWorkSize[2];
|
|
}
|
|
// Fall through to fill 2D and 1D dimensions...
|
|
case 2:
|
|
pLocalSize[1] = progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
|
|
|
|
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
|
|
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
|
|
progGrid->gridSize.height /= progGrid->gridBlock.height;
|
|
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
|
|
|
|
pGlobalOffset[1] = glbWorkOffset[1];
|
|
pGroupOffset[1] = groupOffset[1];
|
|
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
|
|
if (progGrid->partialGridBlock.height != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.height++;
|
|
pNumGroups[1]++;
|
|
} else {
|
|
progGrid->partialGridBlock.height = lclWorkSize[1];
|
|
}
|
|
// Fall through to fill 1D dimension...
|
|
case 1:
|
|
pLocalSize[0] = progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
|
|
|
|
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
|
|
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
|
|
progGrid->gridSize.width /= progGrid->gridBlock.width;
|
|
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
|
|
|
|
pGlobalOffset[0] = glbWorkOffset[0];
|
|
pGroupOffset[0] = groupOffset[0];
|
|
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
|
|
if (progGrid->partialGridBlock.width != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.width++;
|
|
pNumGroups[0]++;
|
|
} else {
|
|
progGrid->partialGridBlock.width = lclWorkSize[0];
|
|
}
|
|
break;
|
|
default:
|
|
LogWarning("Wrong dimensions. Force to 1x1x1!");
|
|
break;
|
|
}
|
|
|
|
if (!partialGrid) {
|
|
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 0;
|
|
}
|
|
|
|
// Calculate the total number of workitems and workgroups
|
|
pGlobalOffset[3] = pGroupOffset[3] = 1;
|
|
for (uint i = 0; i < workDim; ++i) {
|
|
pGlobalOffset[3] *= pGlobalOffset[i];
|
|
pGroupOffset[3] *= pGroupOffset[i];
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
if (abi64Bit()) {
|
|
// Setup the debug info in constant buffer
|
|
reinterpret_cast<uint64_t*>(debugInfo)[1] = gpu.printfDbg().bufOffset();
|
|
// Size in DWORDs
|
|
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[4] /= sizeof(uint32_t);
|
|
} else {
|
|
// Setup the debug info in constant buffer
|
|
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
|
|
// Size in DWORDs
|
|
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[2] /= sizeof(uint32_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::initParameters() {
|
|
size_t offset = 0;
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
|
|
// Initialize the arguments for the abstraction layer
|
|
if (arg->isCbNeeded()) {
|
|
desc.name_ = arg->name_.data();
|
|
desc.type_ = arg->type();
|
|
desc.size_ = arg->size(false);
|
|
desc.addressQualifier_ = arg->addressQualifier();
|
|
desc.accessQualifier_ = arg->accessQualifier();
|
|
desc.typeName_ = arg->typeName();
|
|
desc.typeQualifier_ = arg->typeQualifier();
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
}
|
|
}
|
|
|
|
// Report the allocated local memory size (emulated and hw)
|
|
if (hwLocalSize_ != 0) {
|
|
CondLog((dev().info().localMemSize_ < hwLocalSize_),
|
|
"Requested local size is bigger than reported");
|
|
workGroupInfo_.localMemSize_ = hwLocalSize_;
|
|
}
|
|
|
|
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const {
|
|
bool result = true;
|
|
|
|
// Bind HW constant buffers used for the global data store
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (const auto& it : gds) {
|
|
uint idx = it.first;
|
|
result = bindResource(gpu, *(it.second), idx, ConstantBuffer, idx);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const {
|
|
bool result = true;
|
|
|
|
assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!");
|
|
|
|
// Upload the parameters to HW and bind all constant buffers
|
|
for (uint i = 0; i < numCb_; i++) {
|
|
ConstBuffer* cb = gpu.constBufs_[i];
|
|
result &= cb->uploadDataToHw(cbSizes_[i]) &&
|
|
bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset());
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
void Kernel::processMemObjects(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
|
|
bool nativeMem) const {
|
|
// Mark the tracker with a new kernel,
|
|
// so we can avoid checks of the aliased objects
|
|
gpu.memoryDependency().newKernel();
|
|
|
|
// Check all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
|
|
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const KernelArg* arg = argument(i);
|
|
Memory* memory = NULL;
|
|
|
|
// Find if current argument is a buffer
|
|
if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) &&
|
|
(arg->type_ != KernelArg::PointerHwLocal)) {
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
memory = reinterpret_cast<Memory* const*>(memories)[index];
|
|
} else if (*reinterpret_cast<amd::Memory* const*>(params + desc.offset_) != NULL) {
|
|
memory = dev().getGpuMemory(memories[index]);
|
|
// Synchronize data with other memory instances if necessary
|
|
memory->syncCacheFromHost(gpu);
|
|
}
|
|
|
|
if (memory != NULL) {
|
|
// Validate memory for a dependency in the queue
|
|
gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::loadParameters(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
|
|
bool nativeMem) const {
|
|
bool result = true;
|
|
uint i;
|
|
|
|
// Initialize local private ranges
|
|
if (!initLocalPrivateRanges(gpu)) {
|
|
return false;
|
|
}
|
|
|
|
if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
// Bind a buffer for a dummy read
|
|
result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_);
|
|
}
|
|
|
|
// Find all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (i = 0; i != signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
// Set current argument
|
|
if (!setArgument(gpu, kernel, i, params, desc, nativeMem)) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
// Update the ring ranges and math constant
|
|
setLocalPrivateRanges(gpu);
|
|
|
|
result = bindConstantBuffers(gpu);
|
|
|
|
if (flags() & PrivateFixed) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd,
|
|
bool pfpaDoppCmd) const {
|
|
const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal();
|
|
|
|
auto compProg = static_cast<gsl::ComputeProgramObject*>(gpu.gslKernelDesc()->func_);
|
|
compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu));
|
|
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock,
|
|
&dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_,
|
|
lastDoppCmd, pfpaDoppCmd);
|
|
gpu.eventEnd(MainEngine, *calEvent);
|
|
|
|
// Unbind all resources
|
|
unbindResources(gpu, *calEvent, lastRun);
|
|
|
|
return true;
|
|
}
|
|
|
|
static size_t counter = 0;
|
|
void Kernel::debug(VirtualGPU& gpu) const {
|
|
std::fstream stubWrite;
|
|
address src = NULL;
|
|
|
|
std::cerr << "--- " << name_ << " ---" << std::endl;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
const Memory* gpuMem = gpu.slots_[i].memory_;
|
|
std::stringstream fileName;
|
|
bool bufferObj =
|
|
((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) ||
|
|
(arg->type_ == KernelArg::PointerHwConst));
|
|
|
|
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
|
|
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
|
|
std::cerr.setf(std::ios::hex);
|
|
uint* location =
|
|
reinterpret_cast<uint*>(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
|
|
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
|
|
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D)) ||
|
|
((src == NULL) && bufferObj)) {
|
|
//@todo Replace the current map
|
|
Memory* resource = const_cast<Memory*>(gpu.slots_[i].memory_);
|
|
void* memory = resource->map(&gpu);
|
|
uint* location = reinterpret_cast<uint*>(memory);
|
|
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
resource->unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory + gpu.cb(i)->wrtOffset()),
|
|
gpu.cb(i)->lastWrtSize());
|
|
gpu.constBufs_[i]->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (const auto& it : gds) {
|
|
uint idx = it.first;
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(it.second->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory), it.second->size());
|
|
it.second->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::initConstBuffers() {
|
|
bool result = true;
|
|
size_t i;
|
|
|
|
assert((numCb_ != 0) && "We have 0 constant buffers!");
|
|
|
|
// Allocate an array for CB sizes
|
|
cbSizes_ = new size_t[numCb_];
|
|
if (cbSizes_ == NULL) {
|
|
return false;
|
|
}
|
|
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
|
|
|
|
// CB0 is reserved for ABI data
|
|
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
|
|
|
|
// Find sizes of all constant buffers
|
|
for (i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
size_t size = arg->cbPos_ + arg->size(true);
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize;
|
|
}
|
|
// Do we need a CB?
|
|
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
|
|
cbSizes_[arg->cbIdx_] = size;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::setInternalSamplers(VirtualGPU& gpu) const {
|
|
for (uint i = 0; i < samplerSize(); ++i) {
|
|
const KernelArg* arg = sampler(i);
|
|
uint state = arg->cbPos_;
|
|
uint idx = arg->index_;
|
|
|
|
if (gpu.cal()->samplersState_[idx] != state) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::setArgument(VirtualGPU& gpu, const amd::Kernel& kernel,
|
|
uint idx, const_address params,
|
|
const amd::KernelParameterDescriptor& desc,
|
|
bool nativeMem) const {
|
|
size_t size = desc.size_;
|
|
const void* param = params + desc.offset_;
|
|
bool result = true;
|
|
const KernelArg* arg;
|
|
address memory;
|
|
size_t argSize;
|
|
static const bool waitOnBusyEngine = true;
|
|
|
|
assert((idx < arguments_.size()) && "Param index is out of range!");
|
|
|
|
arg = argument(idx);
|
|
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
|
|
memory = gpu.cb(1)->sysMemCopy();
|
|
argSize = arg->size(true);
|
|
|
|
// Bind the global heap for emulation mode
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
case KernelArg::PointerGlobal: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory*>(memories[index]);
|
|
} else if (memories[index] != nullptr) {
|
|
gpuMem = dev().getGpuMemory(memories[index]);
|
|
}
|
|
bool forceZeroOffset = false;
|
|
|
|
if (gpuMem == NULL) {
|
|
forceZeroOffset = true;
|
|
gpuMem = dev().getGpuMemory(dev().dummyPage());
|
|
}
|
|
uint64_t offset = gpuMem->pinOffset();
|
|
|
|
// Make sure the passed argument is a buffer object
|
|
if (!gpuMem->cal()->buffer_) {
|
|
LogError("The kernel buffer argument isn't a buffer object!");
|
|
return false;
|
|
}
|
|
|
|
if (arg->type_ == KernelArg::PointerHwConst) {
|
|
// Bind current memory object with the kernel
|
|
if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) {
|
|
return false;
|
|
}
|
|
assert((offset == 0) && "No offset for HW CB");
|
|
// Add a fake offset to make sure (ptr != NULL) is TRUE
|
|
offset = 1;
|
|
} else {
|
|
ResourceType type = ArgumentHeapBuffer;
|
|
|
|
// Check if kernel expects UAV binding
|
|
if (arg->memory_.uavBuf_) {
|
|
type = ArgumentBuffer;
|
|
} else {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind current memory object with the kernel
|
|
// Note: it's a fake binding, if the buffer is part of
|
|
// the global heap
|
|
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) {
|
|
return false;
|
|
}
|
|
|
|
// Update offset only if we bind HeapBuffer or
|
|
// it's global address space in UAV setup on SI+
|
|
offset += gpuMem->hbOffset();
|
|
if (!forceZeroOffset) {
|
|
assert((offset != 0) && "Offset 0 with a real allocation!");
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
if (forceZeroOffset) {
|
|
offset = 0;
|
|
}
|
|
|
|
// Copy memory offset into the constant buffer
|
|
if (abi64Bit()) {
|
|
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
|
|
} else {
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = static_cast<uint>(offset);
|
|
}
|
|
} break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory*>(memories[index]);
|
|
} else if (memories[index] != nullptr) {
|
|
gpuMem = dev().getGpuMemory(memories[index]);
|
|
}
|
|
|
|
if (gpuMem == NULL) {
|
|
return false;
|
|
}
|
|
// Make sure the passed argument is an image object
|
|
if (gpuMem->cal()->buffer_) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
|
|
ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite;
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) {
|
|
return false;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Copy image constants into the constant buffer
|
|
if (gpuMem->owner() != NULL) {
|
|
copyImageConstants(gpuMem->owner()->asImage(),
|
|
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
|
|
}
|
|
|
|
// Handle DOPP texture resource
|
|
gslMemObject gslMem = gpuMem->gslResource();
|
|
if (gslMem->getAttribs().isDOPPDesktopTexture) {
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
} break;
|
|
case KernelArg::Sampler: {
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
const amd::Sampler* amdSampler = reinterpret_cast<amd::Sampler* const*>(params +
|
|
kernel.parameters().samplerObjOffset())[index];
|
|
uint idx = arg->index_;
|
|
uint32_t state = amdSampler->state();
|
|
|
|
if (state != gpu.cal()->samplersState_[idx]) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
|
|
// Copy sampler state into the constant buffer
|
|
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
|
|
} break;
|
|
case KernelArg::Counter: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
} else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) {
|
|
return false;
|
|
}
|
|
} break;
|
|
case KernelArg::PointerHwLocal: {
|
|
// Calculate current offset in the local ring
|
|
uint offset = gpu.cal_.localSize;
|
|
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
|
|
|
|
offset = amd::alignUp(offset, arg->alignment_);
|
|
size_t memSize = *static_cast<const uintptr_t*>(param);
|
|
|
|
// Allocate new memory from the local ring
|
|
gpu.cal_.localSize += static_cast<uint>(memSize) + extra;
|
|
// Copy current local argument's offset into the CB
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
|
|
|
|
CondLog((gpu.cal_.localSize > dev().info().localMemSize_),
|
|
"Requested local size is bigger than reported!");
|
|
} break;
|
|
case KernelArg::Float:
|
|
case KernelArg::Double:
|
|
case KernelArg::Char:
|
|
case KernelArg::UChar:
|
|
case KernelArg::Short:
|
|
case KernelArg::UShort:
|
|
case KernelArg::Int:
|
|
case KernelArg::UInt:
|
|
case KernelArg::Long:
|
|
case KernelArg::ULong:
|
|
if (size != argSize) {
|
|
LogWarning("Parameter's sizes are unmatched!");
|
|
}
|
|
// Fall through ...
|
|
case KernelArg::Struct:
|
|
case KernelArg::Union: {
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
|
|
for (uint i = 0; i < iter; ++i) {
|
|
amd::Os::fastMemcpy(
|
|
(memory + arg->cbPos_ + i * ConstBuffer::VectorSize),
|
|
reinterpret_cast<const char*>(param) + i * KernelArg::VectorSizeLimit * specVec,
|
|
specVec * KernelArg::VectorSizeLimit);
|
|
}
|
|
} else {
|
|
// Copy data into the CB
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
|
|
}
|
|
} break;
|
|
default:
|
|
LogError("Unhandled argument's type!");
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const {
|
|
// Initialize HW local
|
|
gpu.cal_.localSize = hwLocalSize_;
|
|
|
|
// Bind the global buffer if emulated local or private memory
|
|
// was allocated by the kernel
|
|
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) && (uavRaw_ != UavIdUndefined)) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the global buffer if emulated constant buffers are enabled
|
|
if (cbId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the printf buffer
|
|
if (printfId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
// Initialize the iterations count
|
|
gpu.cal_.iterations_ = 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const {
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* data;
|
|
uint gridSize =
|
|
gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth;
|
|
uint blockSize =
|
|
gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * gpu.cal()->gridBlock.depth;
|
|
|
|
//! \todo validate if the compiler still generates PrivateFixed
|
|
if (flags() & PrivateFixed) {
|
|
// Update private ring
|
|
data = reinterpret_cast<uint*>(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
|
|
if (abi64Bit()) {
|
|
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
|
|
data[2] = 0;
|
|
data[3] = 0;
|
|
} else {
|
|
data[0] = static_cast<uint>(gpuMemory->hbOffset());
|
|
data[1] = 0;
|
|
data[2] = data[3] = 0;
|
|
}
|
|
gpu.addVmMemory(gpuMemory);
|
|
}
|
|
|
|
// Copy the math lib constants
|
|
amd::Os::fastMemcpy((cbBuf + MathLibOffset * ConstBuffer::VectorSize), MathLibConst,
|
|
sizeof(MathLibConst));
|
|
|
|
// Update the offset to the global data
|
|
if (prog().glbData() != NULL) {
|
|
gpu.addVmMemory(prog().glbData());
|
|
uint64_t glbDataOffset = prog().glbData()->hbOffset();
|
|
if (abi64Bit()) {
|
|
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
|
|
glbDataOffset;
|
|
} else {
|
|
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
|
|
static_cast<uint>(glbDataOffset);
|
|
}
|
|
}
|
|
|
|
// Split workload if it was requested
|
|
if ((gpu.cal_.iterations_ < 2) && gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
|
|
uint totalSize = gridSize * blockSize;
|
|
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
|
|
gpu.cal_.iterations_ =
|
|
std::max(gpu.cal_.iterations_, (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
|
|
}
|
|
}
|
|
|
|
// Initialize the number of iterations to the grid size
|
|
if (flags() & PrintfOutput) {
|
|
gpu.cal_.iterations_ = gridSize;
|
|
}
|
|
}
|
|
|
|
void Kernel::setSampler(VirtualGPU& gpu, uint32_t state, uint physUnit) const {
|
|
// All CAL sampler's parameters are in floats
|
|
float gslAddress = GSL_CLAMP_TO_BORDER;
|
|
float gslMinFilter = GSL_MIN_NEAREST;
|
|
float gslMagFilter = GSL_MAG_NEAREST;
|
|
|
|
state &= ~amd::Sampler::StateNormalizedCoordsMask;
|
|
|
|
// Program the sampler address mode
|
|
switch (state & amd::Sampler::StateAddressMask) {
|
|
case amd::Sampler::StateAddressRepeat:
|
|
gslAddress = GSL_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClampToEdge:
|
|
gslAddress = GSL_CLAMP_TO_EDGE;
|
|
break;
|
|
case amd::Sampler::StateAddressMirroredRepeat:
|
|
gslAddress = GSL_MIRRORED_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClamp:
|
|
case amd::Sampler::StateAddressNone:
|
|
default:
|
|
break;
|
|
}
|
|
state &= ~amd::Sampler::StateAddressMask;
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
|
|
|
|
// Program texture filter mode
|
|
if (state == amd::Sampler::StateFilterLinear) {
|
|
gslMinFilter = GSL_MIN_LINEAR;
|
|
gslMagFilter = GSL_MAG_LINEAR;
|
|
}
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
|
|
}
|
|
|
|
bool Kernel::bindResource(VirtualGPU& gpu, const Memory& memory, uint paramIdx, ResourceType type,
|
|
uint physUnit, size_t offset) const {
|
|
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
|
|
|
|
// Find the original resource name from the IL program
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavRaw_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentCbID:
|
|
if (gpu.state_.boundCb_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundCb_ = true;
|
|
physUnit = cbId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentPrintfID:
|
|
if (gpu.state_.boundPrintf_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundPrintf_ = true;
|
|
physUnit = printfId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentHeapBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageRead:
|
|
case ArgumentImageWrite:
|
|
case ArgumentConstBuffer:
|
|
case ArgumentCounter:
|
|
// Early exit if resource is bound already
|
|
if (gpu.slots_[paramIdx].state_.bound_) {
|
|
return true;
|
|
}
|
|
|
|
// Associate resource with the slot
|
|
gpu.slots_[paramIdx].memory_ = &memory;
|
|
|
|
// Mark resource as bound
|
|
gpu.slots_[paramIdx].state_.bound_ = true;
|
|
|
|
if (type == ArgumentCounter) {
|
|
GpuEvent calEvent;
|
|
|
|
// Bind memory with atomic counter
|
|
gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource());
|
|
|
|
// Copy the counter value into GDS
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false);
|
|
gpu.eventEnd(MainEngine, calEvent);
|
|
|
|
// Mark resource as busy
|
|
memory.setBusy(gpu, calEvent);
|
|
return true;
|
|
} else if (type == ArgumentHeapBuffer) {
|
|
// We return here, since we just have to bind the global heap
|
|
return true;
|
|
} else if (type == ArgumentConstBuffer) {
|
|
gpu.slots_[paramIdx].state_.constant_ = true;
|
|
}
|
|
break;
|
|
case ArgumentUavID:
|
|
case ConstantBuffer:
|
|
break;
|
|
default:
|
|
LogPrintfError("Unspecified argument type ()!", type);
|
|
return false;
|
|
}
|
|
|
|
gslMemObject gslMem = NULL;
|
|
// Use global address space on SI+ for UAV setup
|
|
if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) ||
|
|
(type == ArgumentPrintfID)) {
|
|
gslMem = dev().heap().resource().gslResource();
|
|
} else {
|
|
gslMem = memory.gslResource();
|
|
}
|
|
|
|
// Associate memory with the physical unit, the actual binding
|
|
bool result = true;
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageWrite:
|
|
case ArgumentUavID:
|
|
case ArgumentCbID:
|
|
case ArgumentPrintfID:
|
|
if (type == ArgumentImageWrite) {
|
|
uavType = GSL_UAV_TYPE_TYPED;
|
|
} else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
}
|
|
if (gpu.cal_.uavs_[physUnit] != gslMem) {
|
|
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
gpu.cal_.uavs_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ConstantBuffer:
|
|
case ArgumentConstBuffer:
|
|
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
|
|
result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize());
|
|
gpu.cal_.constBuffers_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ArgumentImageRead:
|
|
if (gpu.cal_.readImages_[physUnit] != gslMem) {
|
|
result = gpu.setInput(physUnit, gslMem);
|
|
gpu.cal_.readImages_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
default:
|
|
result = false;
|
|
assert(false);
|
|
break;
|
|
}
|
|
if (!result) {
|
|
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void Kernel::unbindResources(VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const {
|
|
// Make sure unbind occurs on the last run, in case the execution had a split
|
|
if (lastRun) {
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
if (gpu.slots_[i].state_.bound_) {
|
|
GpuEvent calEventTmp = calEvent;
|
|
|
|
if (KernelArg::Counter == argument(i)->type_) {
|
|
// Copy the counter value from GDS
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.cs()->syncAtomicCounter(argument(i)->index_, true);
|
|
gpu.eventEnd(MainEngine, calEventTmp);
|
|
} else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) {
|
|
// Signal the abstraction layer that GPU memory is dirty
|
|
if (gpu.slots_[i].memory_->owner() != NULL) {
|
|
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
|
|
}
|
|
}
|
|
// Mark resource as busy
|
|
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
|
|
|
|
gpu.slots_[i].state_.value_ = 0;
|
|
}
|
|
}
|
|
|
|
// Unbind the global buffer
|
|
gpu.state_.boundGlobal_ = false;
|
|
|
|
// Unbind the constant buffer
|
|
gpu.state_.boundCb_ = false;
|
|
|
|
// Unbind the pritnf buffer
|
|
gpu.state_.boundPrintf_ = false;
|
|
}
|
|
|
|
// Mark CB busy
|
|
for (uint i = 0; i < numCb_; ++i) {
|
|
gpu.constBufs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// Set the event object for the scratch buffer
|
|
if (workGroupInfo()->scratchRegs_ > 0) {
|
|
dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent);
|
|
}
|
|
}
|
|
|
|
void Kernel::copyImageConstants(const amd::Image* amdImage, ImageConstants* imageData) const {
|
|
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
|
|
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
|
|
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
|
|
imageData->dataType_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
|
|
|
|
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
|
|
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
|
|
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
|
|
imageData->channelOrder_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
|
|
}
|
|
|
|
union MetadataVersion {
|
|
struct {
|
|
uint64_t revision_ : 16; //!< LLVM metadata revision
|
|
uint64_t minorVersion_ : 16; //!< LLVM metadata minor verison
|
|
uint64_t majorVersion_ : 16; //!< LLVM metadata major version
|
|
};
|
|
uint64_t value_;
|
|
MetadataVersion(uint mj, uint mi, uint rev) : value_(0) {
|
|
revision_ = rev;
|
|
minorVersion_ = mi;
|
|
majorVersion_ = mj;
|
|
}
|
|
MetadataVersion() : value_(0) {}
|
|
};
|
|
|
|
//! Version of metadata with buffer attributes
|
|
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
|
|
|
|
//! Version of metadata with type qualifiers
|
|
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
|
|
|
|
bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) {
|
|
// Initialize workgroup info
|
|
workGroupInfo_.size_ = nullDev().info().preferredWorkGroupSize_;
|
|
MetadataVersion mdVersion;
|
|
|
|
// Find first tag
|
|
size_t pos = metaData.find(";");
|
|
|
|
// Loop through all provided program arguments
|
|
while (pos != std::string::npos) {
|
|
KernelArg arg;
|
|
|
|
if (!expect(metaData, &pos, ";")) {
|
|
break;
|
|
}
|
|
|
|
arg.type_ = KernelArg::NoType;
|
|
|
|
// Loop through all available metadata types
|
|
for (uint i = 0; i < ArgStateTotal; ++i) {
|
|
uint tmpValue;
|
|
// Find the name tag
|
|
if (expect(metaData, &pos, ArgState[i].typeName_)) {
|
|
switch (ArgState[i].type_) {
|
|
case KernelArg::NoType:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Reflection: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
|
|
LogWarning("Couldn't get the argument type!");
|
|
return false;
|
|
}
|
|
}
|
|
continue;
|
|
case KernelArg::ConstArg: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
continue;
|
|
case KernelArg::Grouping:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSize_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WrkgrpSize: {
|
|
uint temp;
|
|
// Read the workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.size_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Wavefront:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::UavId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
return false;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &cbId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::PrintfBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &printfId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::MetadataVersion:
|
|
// Read metadata version
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.majorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.minorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.revision_ = tmpValue;
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::GroupingHint:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size hint
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSizeHint_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::VecTypeHint: {
|
|
std::string temp;
|
|
// Read the compile vector type hint
|
|
if (!getstring(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile vector type hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileVecTypeHint_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WavesPerSimdHint: {
|
|
uint tmp;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
return false;
|
|
}
|
|
workGroupInfo_.wavesPerSimdHint_ = tmp;
|
|
}
|
|
continue;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
std::string argName;
|
|
// Save the argument type
|
|
arg.type_ = ArgState[i].type_;
|
|
|
|
// Check if we should expect the name
|
|
if (ArgState[i].name_) {
|
|
// Read the parameter's name
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.name_ = argName;
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.location_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the resource data type
|
|
if (ArgState[i].resType_) {
|
|
uint k;
|
|
// Search for the data type
|
|
for (k = 0; k < DataTypeTotal; k++) {
|
|
if (expect(metaData, &pos, DataType[k].tagName_)) {
|
|
arg.dataType_ = DataType[k].type_;
|
|
if (arg.type_ == KernelArg::Image) {
|
|
flags_ |= ImageEnable;
|
|
if (expect(metaData, &pos, "RO:")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
} else if (expect(metaData, &pos, "RW:")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
flags_ |= ImageWrite;
|
|
} else if (expect(metaData, &pos, "WO:")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
} else if (arg.type_ == KernelArg::Value) {
|
|
arg.type_ = DataType[k].type_;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (k == DataTypeTotal) {
|
|
LogWarning("We couldn't find the argument's type.");
|
|
if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
//! @todo temporary condition
|
|
if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) {
|
|
assert(false);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the data size
|
|
if (ArgState[i].size_) {
|
|
uint tmpData;
|
|
// Read the data size
|
|
if (!getuint(metaData, &pos, &tmpData)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (arg.type_ == KernelArg::Image) {
|
|
arg.type_ = arg.dataType_;
|
|
arg.index_ = tmpData;
|
|
} else {
|
|
arg.size_ = tmpData;
|
|
}
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Counter) {
|
|
// Read a counter index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a counter index!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect a resource index
|
|
if (ArgState[i].cbIdx_) {
|
|
// Read resource index
|
|
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
|
|
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
|
|
numCb_ = arg.cbIdx_;
|
|
}
|
|
}
|
|
// Check if we should expect the CB offset
|
|
if (ArgState[i].cbPos_) {
|
|
// Read position in the constant buffer
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check if we should expect the buffer type
|
|
if (ArgState[i].buf_) {
|
|
// Read the buffer type
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.buf_ = argName;
|
|
|
|
for (uint k = 0; k < BufTypeTotal; ++k) {
|
|
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
|
|
// Update the parameter type
|
|
arg.type_ = BufType[k].type_;
|
|
// Check if we should expect a buffer index
|
|
if (BufType[k].number_) {
|
|
// Read a buffer index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the required alignment
|
|
if (BufType[k].alignment_) {
|
|
// Read data alignment
|
|
if (!getuint(metaData, &pos, &arg.alignment_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the buffer's attribute
|
|
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) {
|
|
if (expect(metaData, &pos, "RO")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
} else if (expect(metaData, &pos, "RW")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
} else if (expect(metaData, &pos, "WO")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
}
|
|
}
|
|
// Check for the type qualifier
|
|
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) {
|
|
uint tmp;
|
|
pos += 1;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get volatile type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get restrict type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Find multiple UAV references
|
|
switch (arg.type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
case KernelArg::UavId:
|
|
uavRefCount[arg.index_]++;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// Check if this argument will be passed in constant buffer
|
|
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
// Serach for the passed by value sampler
|
|
for (uint i = 0; i < argSize(); ++i) {
|
|
KernelArg* value = arguments_[i];
|
|
if (0 == value->name_.compare(arg.name_)) {
|
|
value->type_ = arg.type_;
|
|
value->index_ = arg.index_;
|
|
value->location_ = 0;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
KernelArg* argument = new KernelArg(arg);
|
|
if (argument != NULL) {
|
|
addArgument(argument);
|
|
} else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// Check if we have a pre-defined sampler
|
|
else if (arg.type_ == KernelArg::Sampler) {
|
|
KernelArg* sampler = new KernelArg(arg);
|
|
if (sampler != NULL) {
|
|
addSampler(sampler);
|
|
} else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Next argument
|
|
pos = metaData.find(";", pos);
|
|
}
|
|
|
|
// Find arguments that will require a reallocation
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
// Check if can't use a dedicated UAV,
|
|
// so realloc memory in the heap
|
|
arg->memory_.realloc_ = false;
|
|
arg->memory_.uavBuf_ = true;
|
|
break;
|
|
case KernelArg::PointerHwConst:
|
|
arg->memory_.realloc_ = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
uavRaw_ = arg->index_;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// If argument marked with the const qualifier, then overwrite
|
|
// Read-Write attributes, since compiler doesn't mark it properly
|
|
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
|
|
arg->memory_.readOnly_ = 1;
|
|
arg->memory_.readWrite_ = 0;
|
|
arg->memory_.writeOnly_ = 0;
|
|
}
|
|
}
|
|
|
|
if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) {
|
|
// Find if default UAV is already assigned to an argument
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (uavRaw_ == arg->index_) {
|
|
uavRaw_ = UavIdUndefined;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// There is always 1 constant buffer, associated with the kernel
|
|
numCb_++;
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more than max CBs for arguments!");
|
|
|
|
// Limit workgroup size if requested
|
|
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
|
|
size_t temp = 1;
|
|
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
|
|
for (uint j = 0; j < 3; ++j) {
|
|
if (workGroupInfo()->compileSize_[j] != 0) {
|
|
temp *= workGroupInfo_.compileSize_[j];
|
|
}
|
|
}
|
|
// Report a compilation error if requested compile size doesn't
|
|
// match the required workgroup size
|
|
if (workGroupInfo()->size_ < temp) {
|
|
char str[8];
|
|
intToStr(workGroupInfo_.size_, str, 8);
|
|
buildError_ = CL_OUT_OF_RESOURCES;
|
|
buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of ";
|
|
buildLog_ += str;
|
|
buildLog_ += " elements\n";
|
|
LogError(buildLog().c_str());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Read/Write attributes are provided in metadata
|
|
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
|
|
rwAttributes_ = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_QUEUE:
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
case ARG_TYPE_VALUE:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return argInfo->arg.pointer.align;
|
|
default:
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
case ACCESS_TYPE_WO:
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
case ACCESS_TYPE_RW:
|
|
default:
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
}
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
} else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_VALUE:
|
|
switch (GetHSAILDataType(argInfo)) {
|
|
case HSAIL_DATATYPE_B1:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B8:
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B16:
|
|
case HSAIL_DATATYPE_U16:
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_F16:
|
|
return 2;
|
|
case HSAIL_DATATYPE_B32:
|
|
case HSAIL_DATATYPE_U32:
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_F32:
|
|
return 4;
|
|
case HSAIL_DATATYPE_B64:
|
|
case HSAIL_DATATYPE_U64:
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_F64:
|
|
return 8;
|
|
case HSAIL_DATATYPE_STRUCT:
|
|
return argInfo->arg.value.numElements;
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_POINTER:
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t GetOclType(const aclArgData* argInfo) {
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
|
|
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
|
|
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
|
|
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
|
|
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
|
|
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
|
|
};
|
|
|
|
uint sizeType;
|
|
if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return T_QUEUE;
|
|
}
|
|
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
|
|
return T_POINTER;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
sizeType = 0;
|
|
break;
|
|
case DATATYPE_i16:
|
|
case DATATYPE_u16:
|
|
sizeType = 1;
|
|
break;
|
|
case DATATYPE_i32:
|
|
case DATATYPE_u32:
|
|
sizeType = 2;
|
|
break;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
sizeType = 3;
|
|
break;
|
|
case DATATYPE_f16:
|
|
case DATATYPE_f32:
|
|
sizeType = 4;
|
|
break;
|
|
case DATATYPE_f64:
|
|
sizeType = 5;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
switch (argInfo->arg.value.numElements) {
|
|
case 1:
|
|
return ClkValueMapType[sizeType][0];
|
|
case 2:
|
|
return ClkValueMapType[sizeType][1];
|
|
case 3:
|
|
return ClkValueMapType[sizeType][2];
|
|
case 4:
|
|
return ClkValueMapType[sizeType][3];
|
|
case 8:
|
|
return ClkValueMapType[sizeType][4];
|
|
case 16:
|
|
return ClkValueMapType[sizeType][5];
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
} else if (argInfo->type == ARG_TYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
} else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
} else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
// default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
switch (argInfo->arg.image.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int GetOclSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
//! \note OCL 6.1.5. For 3-component vector data types,
|
|
//! the size of the data type is 4 * sizeof(component).
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER:
|
|
return sizeof(cl_sampler);
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(cl_command_queue);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += MaxExtraArgumentsNum;
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(aclArg);
|
|
desc.addressQualifier_ = GetOclAddrQual(aclArg);
|
|
desc.accessQualifier_ = GetOclAccessQual(aclArg);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = sizeof(cl_mem);
|
|
} else {
|
|
desc.size_ = GetOclSize(aclArg);
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
|
|
}
|
|
|
|
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
|
int offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += MaxExtraArgumentsNum;
|
|
|
|
// Iterate through the each kernel argument
|
|
for (; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->offset_ = offset;
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
// If vector of args we add additional arguments to flatten it out
|
|
arg->numElem_ =
|
|
((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct))
|
|
? aclArg->arg.value.numElements
|
|
: 1;
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
arg->access_ = GetHSAILArgAccessType(aclArg);
|
|
offset += GetHSAILArgSize(aclArg);
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
|
|
uint extraArgsNum)
|
|
: device::Kernel(prog->dev(), name),
|
|
compileOptions_(compileOptions),
|
|
prog_(*prog),
|
|
index_(0),
|
|
code_(NULL),
|
|
codeSize_(0),
|
|
hwMetaData_(NULL),
|
|
extraArgumentsNum_(extraArgsNum) {
|
|
flags_.hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel() {
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
|
|
delete[] hwMetaData_;
|
|
|
|
delete code_;
|
|
}
|
|
|
|
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
|
if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
|
|
LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
|
|
return false;
|
|
}
|
|
acl_error error = ACL_SUCCESS;
|
|
std::string openClKernelName = openclMangledName(name());
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
// compile kernel down to ISA
|
|
if (finalize) {
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (dev().settings().svmFineGrainSystem_) {
|
|
options.append(" -sc-xnack-iommu");
|
|
}
|
|
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
|
|
ACL_TYPE_ISA, NULL);
|
|
buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize kernel");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
aqlCreateHWInfo(sym);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), NULL, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (NULL == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : dev().getAttribs().wavefrontSize;
|
|
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
} else {
|
|
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), NULL, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (NULL == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete[] aclPrintfList;
|
|
}
|
|
|
|
aclMetadata md;
|
|
md.enqueue_kernel = false;
|
|
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
|
|
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = md.enqueue_kernel;
|
|
|
|
md.kernel_index = -1;
|
|
size_t sizeOfIndex = sizeof(md.kernel_index);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX,
|
|
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = md.kernel_index;
|
|
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
|
|
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
|
|
&sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable(dev().settings().ciPlus_);
|
|
|
|
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
|
|
&sizeOfWorkGroupSizeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfVecTypeHint;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
if (0 != sizeOfVecTypeHint) {
|
|
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
|
|
if (NULL == VecTypeHint) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
VecTypeHint[sizeOfVecTypeHint] = '\0';
|
|
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
|
|
delete[] VecTypeHint;
|
|
}
|
|
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete[] aclArgList;
|
|
|
|
return true;
|
|
}
|
|
|
|
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
|
|
|
|
const HSAILProgram& HSAILKernel::prog() const {
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
inline static void WriteAqlArg(
|
|
unsigned char** dst, //!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
} else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
|
(1 << HSA_PACKET_HEADER_BARRIER) |
|
|
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
|
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
|
|
|
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
|
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap,
|
|
std::vector<const Memory*>& memList) const {
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
if (extraArgumentsNum_ > 0) {
|
|
assert(MaxExtraArgumentsNum >= 6 &&
|
|
"MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
|
|
size_t extraArgs[MaxExtraArgumentsNum] = {0, 0, 0, 0, 0, 0};
|
|
// The HLC generates up to 3 additional arguments for the global offsets
|
|
for (uint i = 0; i < sizes.dimensions(); ++i) {
|
|
extraArgs[i] = sizes.offset()[i];
|
|
}
|
|
// Check if the kernel may have printf output
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
|
|
// and set the fourth argument as the printf_buffer pointer
|
|
extraArgs[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
// and set 5th & 6th arguments
|
|
extraArgs[4] = vmDefQueue;
|
|
extraArgs[5] = *vmParentWrap;
|
|
memList.push_back(cb);
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t) * extraArgumentsNum_, sizeof(size_t));
|
|
}
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
|
|
|
|
// Find all parameters for the current kernel
|
|
for (uint i = 0; i != signature.numParameters(); ++i) {
|
|
const HSAILKernel::Argument* arg = argument(i);
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const_address paramaddr = parameters + desc.offset_;
|
|
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_POINTER:
|
|
// If it is a global pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
|
|
Memory* gpuMem = NULL;
|
|
amd::Memory* mem = NULL;
|
|
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
|
|
if (nullptr != gpuMem) {
|
|
mem = gpuMem->owner();
|
|
}
|
|
} else {
|
|
mem = memories[index];
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
|
|
if (gpuMem == nullptr) {
|
|
break;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
memList.push_back(gpuMem);
|
|
|
|
// save the memory object pointer to allow global memory access
|
|
if (NULL != dev().hwDebugMgr()) {
|
|
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
|
|
}
|
|
}
|
|
// If it is a local pointer
|
|
else {
|
|
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type");
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, desc.size_);
|
|
if (desc.size_ == 8) {
|
|
ldsAddress += *reinterpret_cast<const uint64_t*>(paramaddr);
|
|
} else {
|
|
ldsAddress += *reinterpret_cast<const uint32_t*>(paramaddr);
|
|
}
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
// Special case for structrues
|
|
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
|
|
memList.push_back(cb);
|
|
} else {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
image = reinterpret_cast<Image* const*>(memories)[index];
|
|
if (nullptr != image) {
|
|
mem = image->owner();
|
|
}
|
|
} else {
|
|
mem = memories[index];
|
|
if (mem == NULL) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return nullptr;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! \note Special case for the image views.
|
|
//! Copy SRD to CB1, so blit manager will be able to release
|
|
//! this view without a wait for SRD resource.
|
|
if (image->memoryType() == Resource::ImageView) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(HsaImageObjectSize);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
memList.push_back(cb);
|
|
} else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
memList.push_back(image);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
|
|
kernelParams.samplerObjOffset())[index];
|
|
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
|
parameters + kernelParams.queueObjOffset())[index];
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue;
|
|
if (dev().settings().useDeviceQueue_) {
|
|
vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
} else {
|
|
if (!gpu.createVirtualQueue(queue->size())) {
|
|
LogError("Virtual queue creation failed!");
|
|
return nullptr;
|
|
}
|
|
vmQueue = gpu.vQueue()->vmAddress();
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported address type ");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return NULL;
|
|
}
|
|
|
|
// HSAIL kernarg segment size is rounded up to multiple of 16.
|
|
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
hsa_kernel_dispatch_packet_t* hsaDisp =
|
|
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlArgBuf);
|
|
|
|
amd::NDRange local(sizes.local());
|
|
const amd::NDRange& global = sizes.global();
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
|
|
|
hsaDisp->header = kDispatchPacketHeader;
|
|
hsaDisp->setup = sizes.dimensions();
|
|
|
|
hsaDisp->workgroup_size_x = local[0];
|
|
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
|
|
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
|
|
|
|
hsaDisp->grid_size_x = global[0];
|
|
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
|
|
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
|
|
hsaDisp->reserved2 = 0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
hsaDisp->private_segment_size = spillSegSize();
|
|
hsaDisp->group_segment_size = ldsAddress;
|
|
hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
|
|
|
|
ConstBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
|
|
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
|
|
hsaDisp->reserved2 = 0;
|
|
hsaDisp->completion_signal.handle = 0;
|
|
|
|
memList.push_back(cb);
|
|
memList.push_back(gpuAqlCode());
|
|
for (gpu::Memory* mem : prog().globalStores()) {
|
|
memList.push_back(mem);
|
|
}
|
|
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
|
|
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
|
memList.push_back(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource || prog().isStaticSampler()) {
|
|
dev().srds().fillResourceList(memList);
|
|
}
|
|
|
|
return hsaDisp;
|
|
}
|
|
|
|
} // namespace gpu
|