e87e2d4c11
ECR #304775 - Device enqueuing - Report proper size for the device queue. Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#259 edit
4054 строки
137 KiB
C++
4054 строки
137 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpublit.hpp"
|
|
#include "device/gpu/gpuconstbuf.hpp"
|
|
#include "device/gpu/gpusched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
#include "SCShadersR678XXCommon.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
|
|
namespace gpu {
|
|
|
|
const MetaDataConst ArgState[ArgStateTotal] =
|
|
{
|
|
// Note: the order is important
|
|
// Name Type Properties
|
|
// Kernel description (special properties)
|
|
{ "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } },
|
|
{ "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } },
|
|
{ "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
// Kernel basic types
|
|
{ "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } },
|
|
{ "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } },
|
|
{ "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } },
|
|
{ "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } },
|
|
{ "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } },
|
|
{ "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "constarg:", KernelArg::ConstArg, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
};
|
|
|
|
const DataTypeConst DataType[] =
|
|
{
|
|
{ "i8:", KernelArg::Char, },
|
|
{ "i16:", KernelArg::Short, },
|
|
{ "i32:", KernelArg::Int, },
|
|
{ "i64:", KernelArg::Long, },
|
|
{ "u8:", KernelArg::UChar, },
|
|
{ "u16:", KernelArg::UShort, },
|
|
{ "u32:", KernelArg::UInt, },
|
|
{ "u64:", KernelArg::ULong, },
|
|
{ "float:", KernelArg::Float, },
|
|
{ "double:", KernelArg::Double, },
|
|
{ "struct:", KernelArg::Struct, },
|
|
{ "union:", KernelArg::Union, },
|
|
{ "1D:", KernelArg::Image1D, },
|
|
{ "2D:", KernelArg::Image2D, },
|
|
{ "3D:", KernelArg::Image3D, },
|
|
{ "1DB:", KernelArg::Image1DB, },
|
|
{ "1DA:", KernelArg::Image1DA, },
|
|
{ "2DA:", KernelArg::Image2DA, },
|
|
{ "opaque:", KernelArg::Opaque, },
|
|
{ "event:", KernelArg::Event, },
|
|
{ "sampler:", KernelArg::Sampler, },
|
|
{ "half:", KernelArg::Half, },
|
|
};
|
|
|
|
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
|
|
|
|
struct BufDataConst
|
|
{
|
|
const char* tagName_; //!< buffer's name
|
|
KernelArg::ArgumentType type_; //!< type of argument
|
|
struct
|
|
{
|
|
uint number_ : 1; //!< buffer's number
|
|
uint alignment_ : 1; //!< buffer's alignment
|
|
uint attribute_ : 1; //!< buffer's read/write attribute
|
|
uint reserved : 29; //!< reserved
|
|
};
|
|
};
|
|
|
|
static const BufDataConst BufType[] =
|
|
{
|
|
{ "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } },
|
|
{ "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } },
|
|
{ "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } },
|
|
{ "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } },
|
|
{ "c", KernelArg::PointerConst, { 1, 1, 1, 0 } },
|
|
{ "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } },
|
|
{ "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } },
|
|
{ "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } }
|
|
};
|
|
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
|
|
|
|
//! The mathlib constants for each kernel execution
|
|
static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
|
|
|
|
bool
|
|
expect(const std::string& str, size_t* pos, const std::string& sym)
|
|
{
|
|
bool result = true;
|
|
uint i;
|
|
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
// Check if we have expected symbols
|
|
for (i = 0; i < sym.size(); ++i) {
|
|
char deb = str[*pos + i];
|
|
if (deb != sym[i]) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) *pos += i;
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
getword(const std::string& str, size_t* pos, char* sym)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getstring(const std::string& str, size_t* pos, std::string* out)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(":\n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
char* sym = new char[count + 1];
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*out = sym;
|
|
delete [] sym;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuint(const std::string& str, size_t* pos, uint* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
*val = (*val * 10) + (sym[i] - 0x30);
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuintHex(const std::string& str, size_t* pos, uint* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
}
|
|
else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuint64Hex(const std::string& str, size_t* pos, uint64_t* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
}
|
|
else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
void
|
|
intToStr(size_t value, char* str, size_t size)
|
|
{
|
|
static const uint MaxDigits32bit = 10;
|
|
char result[MaxDigits32bit];
|
|
uint idx = MaxDigits32bit;
|
|
|
|
do {
|
|
idx--;
|
|
result[idx] = static_cast<char>((value % 10) + '0');
|
|
value /= 10;
|
|
} while ((value != 0) && (idx > 0));
|
|
size_t len = MaxDigits32bit - idx;
|
|
size_t n = std::min<size_t>(len, size-1);
|
|
memcpy(str, &result[idx], n);
|
|
str[n] = '\0';
|
|
}
|
|
|
|
//! Default destructor
|
|
CalImageReference::~CalImageReference()
|
|
{
|
|
// Free CAL image
|
|
free(image_);
|
|
}
|
|
|
|
KernelArg::KernelArg()
|
|
: type_(KernelArg::None)
|
|
, size_(0)
|
|
, cbIdx_(0)
|
|
, cbPos_(0)
|
|
, index_(0)
|
|
, alignment_(1)
|
|
, dataType_(KernelArg::None)
|
|
{
|
|
name_ = "";
|
|
buf_ = "";
|
|
memory_.value_ = 0;
|
|
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
|
|
}
|
|
|
|
KernelArg::KernelArg(const KernelArg& data)
|
|
{
|
|
// Fill the new object
|
|
*this = data;
|
|
}
|
|
|
|
KernelArg&
|
|
KernelArg::operator=(const KernelArg& data)
|
|
{
|
|
// Fill the fields of the current object
|
|
name_ = data.name_;
|
|
typeName_ = data.typeName_;
|
|
typeQualifier_ = data.typeQualifier_;
|
|
type_ = data.type_;
|
|
size_ = data.size_;
|
|
cbIdx_ = data.cbIdx_;
|
|
cbPos_ = data.cbPos_;
|
|
buf_ = data.buf_;
|
|
index_ = data.index_;
|
|
alignment_ = data.alignment_;
|
|
dataType_ = data.dataType_;
|
|
memory_.value_ = data.memory_.value_;
|
|
return *this;
|
|
}
|
|
|
|
bool
|
|
KernelArg::isCbNeeded() const
|
|
{
|
|
//! \note not a safe way
|
|
bool result = ((type_ > None) && (type_ < Sampler)) ? true : false;
|
|
if ((type_ == Sampler) && (location_ == 0)) {
|
|
// Sampler is defined outside the kernel
|
|
result = true;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
size_t
|
|
KernelArg::size(bool gpuLayer)const
|
|
{
|
|
switch (type_) {
|
|
case None:
|
|
return 0;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case PointerGlobal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
|
|
case Sampler:
|
|
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
|
|
case Counter:
|
|
return (gpuLayer) ? 0 : sizeof(cl_mem);
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case Float:
|
|
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
|
|
case Double:
|
|
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
|
|
case Int:
|
|
case UInt:
|
|
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
|
|
case Long:
|
|
case ULong:
|
|
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
|
|
case Struct:
|
|
case Union:
|
|
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_address_qualifier
|
|
KernelArg::addressQualifier() const
|
|
{
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_access_qualifier
|
|
KernelArg::accessQualifier() const
|
|
{
|
|
switch (type_) {
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
if (memory_.readOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
}
|
|
else if (memory_.writeOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
}
|
|
else if (memory_.readWrite_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
|
|
//! temporary solution for the vectors handling in compiler
|
|
size_t
|
|
KernelArg::specialVector() const
|
|
{
|
|
if (size_ > VectorSizeLimit) {
|
|
switch (type_) {
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short);
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
clk_value_type_t
|
|
KernelArg::type()const
|
|
{
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
case Counter:
|
|
return T_POINTER;
|
|
case Float:
|
|
return T_FLOAT;
|
|
case Double:
|
|
return T_DOUBLE;
|
|
case Char:
|
|
case UChar:
|
|
return T_CHAR;
|
|
case Short:
|
|
case UShort:
|
|
return T_SHORT;
|
|
case Int:
|
|
return T_INT;
|
|
case UInt:
|
|
//! \note No UINT type
|
|
return T_INT;
|
|
case Long:
|
|
return T_LONG;
|
|
case ULong:
|
|
//! \note No ULONG type
|
|
return T_LONG;
|
|
case Struct:
|
|
case Union:
|
|
//! @todo What should we report?
|
|
return T_CHAR;
|
|
case Sampler:
|
|
return T_SAMPLER;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
case None:
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
NullKernel::NullKernel(
|
|
const std::string& name,
|
|
const NullDevice& gpuNullDev,
|
|
const NullProgram& nullprog)
|
|
: device::Kernel(name)
|
|
, buildError_(CL_BUILD_PROGRAM_FAILURE)
|
|
, gpuDev_(gpuNullDev)
|
|
, prog_(nullprog)
|
|
, calRef_(NULL)
|
|
, internal_(false)
|
|
, flags_(0)
|
|
, cbSizes_(NULL)
|
|
, numCb_(0)
|
|
, rwAttributes_(false)
|
|
, instructionCnt_(4)
|
|
{
|
|
// UAV raw index will be detected
|
|
uavRaw_ = UavIdUndefined;
|
|
// Initialize UAV arena index(should be 8)
|
|
uavArena_ = VirtualGPU::UavArena;
|
|
// CB index will be detected
|
|
cbId_ = UavIdUndefined;
|
|
// Printf index will be detected
|
|
printfId_ = UavIdUndefined;
|
|
}
|
|
|
|
NullKernel::~NullKernel()
|
|
{
|
|
uint idx;
|
|
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
calRef_->release();
|
|
|
|
// Destroy all kernel arguments
|
|
for (idx = 0; idx < arguments_.size(); ++idx) {
|
|
delete arguments_[idx];
|
|
}
|
|
arguments_.clear();
|
|
|
|
// Destroy all sampler kernel arguments
|
|
for (idx = 0; idx < intSamplers_.size(); ++idx) {
|
|
delete intSamplers_[idx];
|
|
}
|
|
intSamplers_.clear();
|
|
}
|
|
|
|
|
|
static int
|
|
scComponentToArrayIndex(E_SC_COMPONENT dstComp)
|
|
{
|
|
switch (dstComp) {
|
|
case SC_COMPONENT_X:
|
|
return 0;
|
|
case SC_COMPONENT_Y:
|
|
return 1;
|
|
case SC_COMPONENT_Z:
|
|
return 2;
|
|
case SC_COMPONENT_W:
|
|
return 3;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding)
|
|
{
|
|
uint count = shader->dep.NumIntrlIConstants;
|
|
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
|
|
|
|
// only suppport loop consts (int consts)
|
|
if (count) {
|
|
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
|
|
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
|
|
uint usedConsts = 0;
|
|
for (uint i = 0; i < count; ++i) {
|
|
uint currentConst;
|
|
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
|
|
if (allocatedconsts[currentConst].addr ==
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
|
|
break;
|
|
}
|
|
}
|
|
if (currentConst == usedConsts) {
|
|
usedConsts++;
|
|
assert(usedConsts <= count);
|
|
}
|
|
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
|
|
allocatedconsts[currentConst].type = AMU_ABI_INT32;
|
|
allocatedconsts[currentConst].value.
|
|
int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
|
|
}
|
|
encoding.litConstsCount = usedConsts;
|
|
}
|
|
}
|
|
|
|
bool
|
|
NullKernel::create(
|
|
const std::string& code,
|
|
const std::string& metadata,
|
|
const void* binaryCode,
|
|
size_t binarySize)
|
|
{
|
|
std::auto_ptr<uint> uavRefCount (new uint[MaxUavArguments]);
|
|
if (NULL == uavRefCount.get()) {
|
|
return false;
|
|
}
|
|
|
|
// Set all ref counts to 0
|
|
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
|
|
|
|
// parse the metadata fields
|
|
if (!parseArguments(metadata, uavRefCount.get())) {
|
|
return false;
|
|
}
|
|
|
|
CALimage calImage;
|
|
// Save source if DEBUG build
|
|
#if DEBUG
|
|
ilSource_ = code;
|
|
#endif // DEBUG
|
|
|
|
amd::option::Options *options = nullProg().getCompilerOptions();
|
|
internal_ = options->oVariables->clInternalKernel;
|
|
|
|
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
|
|
acl_error err;
|
|
std::string arch = GPU_TARGET_INFO_ARCH;
|
|
if (nullDev().settings().use64BitPtr_) {
|
|
arch += "64";
|
|
}
|
|
aclTargetInfo info = aclGetTargetInfo(
|
|
arch.c_str(), nullDev().hwInfo()->targetName_, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclGetTargetInfo failed");
|
|
return false;
|
|
}
|
|
|
|
aclBinaryOptions binOpts = {0};
|
|
binOpts.struct_size = sizeof(binOpts);
|
|
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
|
|
binOpts.bitness = ELFDATA2LSB;
|
|
binOpts.alloc = &::malloc;
|
|
binOpts.dealloc = &::free;
|
|
|
|
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclBinaryInit failed");
|
|
return false;
|
|
}
|
|
|
|
if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin,
|
|
code.data(), code.size(), aclSOURCE)) {
|
|
LogWarning("aclInsertSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
amd::option::Options* Opts = (amd::option::Options*)bin->options;
|
|
// temporary solution to synchronize buildNo between runtime and complib
|
|
// until we move runtime inside complib
|
|
Opts->setBuildNo(options->getBuildNo());
|
|
|
|
// pass kernel name to compiler
|
|
Opts->setCurrKernelName(name().c_str());
|
|
|
|
err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(),
|
|
ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL);
|
|
|
|
buildLog_ += aclGetCompilerLog(nullDev().compiler());
|
|
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclCompile failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
if (!options->oVariables->BinEXE) {
|
|
// Early exit if binary doesn't contain EXE
|
|
aclBinaryFini(bin);
|
|
return true;
|
|
}
|
|
size_t len;
|
|
const void* isa = aclExtractSection(nullDev().compiler(), bin,
|
|
&len, aclTEXT, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclExtractSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
uint calImageSize;
|
|
if (!createMultiBinary(
|
|
&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
|
|
LogWarning("initSrcEncoding failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
aclBinaryFini(bin);
|
|
}
|
|
else if ((binaryCode != NULL) && (binarySize != 0)) {
|
|
uint size = 0;
|
|
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode))
|
|
|| size > binarySize) {
|
|
buildLog_ += "Invalid binary image";
|
|
LogError("amuABIMultiBinaryGetSize failed!");
|
|
return false;
|
|
}
|
|
|
|
calImage = static_cast<CALimage>(malloc(size));
|
|
::memcpy(calImage, binaryCode, size);
|
|
}
|
|
else {
|
|
LogError("Incorrect initialization parameters!");
|
|
return false;
|
|
}
|
|
|
|
calRef_ = new CalImageReference(calImage);
|
|
if (calRef_ == NULL) {
|
|
LogError("Memory allocation failure!");
|
|
// Free CAL image
|
|
free(calImage);
|
|
return false;
|
|
}
|
|
|
|
CALfuncInfo calFuncInfo;
|
|
|
|
// Get kernel compiled information
|
|
getFuncInfoFromImage(calImage, &calFuncInfo);
|
|
if (calFuncInfo.maxScratchRegsNeeded > 0) {
|
|
LogPrintfInfo("%s kernel has register spilling."
|
|
"Lower performance is expected.", name().c_str());
|
|
}
|
|
|
|
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
|
|
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
|
|
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
|
|
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
|
|
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
|
|
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
|
|
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
|
|
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
|
|
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
|
|
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
|
|
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
|
|
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
|
|
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
|
|
|
|
device::Kernel::parameters_t params;
|
|
if (!createSignature(params)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t
|
|
NullKernel::getCalBinarySize() const
|
|
{
|
|
CALuint imageSize;
|
|
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
|
|
LogError("Failed to get the image size!");
|
|
return 0;
|
|
}
|
|
return static_cast<size_t>(imageSize);
|
|
}
|
|
|
|
bool
|
|
NullKernel::getCalBinary(void* binary, size_t size) const
|
|
{
|
|
uint calImageSize = 0;
|
|
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage())
|
|
|| size < calImageSize) {
|
|
LogError("CAL failed to save the kernel binary!");
|
|
return false;
|
|
}
|
|
::memcpy(binary, calImage(), calImageSize);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::create(
|
|
const std::string& code,
|
|
const std::string& metadata,
|
|
const void* binaryCode,
|
|
size_t binarySize)
|
|
{
|
|
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
|
|
|
|
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
|
|
return false;
|
|
}
|
|
|
|
// initialize constant buffer sizes
|
|
if (!initConstBuffers()) {
|
|
return false;
|
|
}
|
|
|
|
// Initialize the kernel parameters
|
|
bool result = initParameters();
|
|
|
|
if (!dev().heap()->isVirtual()) {
|
|
amd::option::Options *options = nullProg().getCompilerOptions();
|
|
// @todo Remove this. This is a hack for no VM mode
|
|
if (!options->oVariables->EnableDumpKernel) {
|
|
if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
|
|
!name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
|
|
blitKernelHack_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
buildError_ = CL_SUCCESS;
|
|
}
|
|
else {
|
|
result = false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
Kernel::Kernel(
|
|
const std::string& name,
|
|
const Device& gpuDev,
|
|
const Program& prog,
|
|
const InitData* initData)
|
|
: NullKernel(name, gpuDev, prog)
|
|
, blitKernelHack_(false)
|
|
{
|
|
hwPrivateSize_ = 0;
|
|
if (NULL != initData) {
|
|
flags_ = initData->flags_;
|
|
hwPrivateSize_ = initData->hwPrivateSize_;
|
|
hwLocalSize_ = initData->hwLocalSize_;
|
|
}
|
|
// Workgroup info private memory size
|
|
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
|
|
hsa_ = false;
|
|
}
|
|
|
|
Kernel::~Kernel()
|
|
{
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
|
|
{
|
|
Device::ScopedLockVgpus lock(dev());
|
|
|
|
// Release all virtual image objects on all virtual GPUs
|
|
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
|
|
dev().vgpus()[idx]->releaseKernel(calImage());
|
|
}
|
|
}
|
|
|
|
if (0 != numCb_) {
|
|
delete [] cbSizes_;
|
|
}
|
|
}
|
|
|
|
const Device&
|
|
Kernel::dev() const
|
|
{
|
|
return reinterpret_cast<const Device&>(gpuDev_);
|
|
}
|
|
|
|
const Program&
|
|
Kernel::prog() const
|
|
{
|
|
return reinterpret_cast<const Program&>(prog_);
|
|
}
|
|
|
|
bool
|
|
NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa)
|
|
{
|
|
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
|
|
|
|
bool result = false;
|
|
AMUabiAddEncoding encoding;
|
|
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
|
|
|
|
size_t allocSize =
|
|
sizeof(uint) * MaxReadImage +
|
|
sizeof(CALUavEntry) * MaxUavArguments +
|
|
sizeof(CALSamplerMapEntry) * MaxSamplers +
|
|
sizeof(CALConstantBufferMask) * MaxConstBuffers +
|
|
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
|
|
char* tmpMem = new char[allocSize];
|
|
if (tmpMem == NULL) {
|
|
LogError("Error allocating memory");
|
|
return false;
|
|
}
|
|
|
|
CalcPtr(encoding.inputs, tmpMem, 0, 0);
|
|
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
|
|
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
|
|
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers);
|
|
if (shader->dep.NumIntrlIConstants != 0) {
|
|
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers);
|
|
}
|
|
AMUabiMultiBinary amuBinary;
|
|
amuABIMultiBinaryCreate(&amuBinary);
|
|
|
|
if (nullDev().settings().siPlus_) {
|
|
result = siCreateHwInfo(shader, encoding);
|
|
}
|
|
else {
|
|
result = r800CreateHwInfo(shader, encoding);
|
|
}
|
|
if (!result) {
|
|
delete [] tmpMem;
|
|
LogWarning("Error Creating program info");
|
|
return false;
|
|
}
|
|
|
|
addLoopConst(shader, encoding);
|
|
|
|
unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0;
|
|
unsigned int symbolCount=0;
|
|
CALOutputEntry* outputs=0;
|
|
unsigned int* globalBuffers=0;
|
|
unsigned int* persistentBuffers=0;
|
|
AMUabiUserSymbol* symbols=0;
|
|
|
|
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
|
|
CALConstantBufferMask* constBuffers = encoding.constBuffers;
|
|
uint* inputResources = encoding.inputs;
|
|
CALUavEntry* uav = encoding.uav;
|
|
|
|
uint inputSamplerCount = samplerSize();
|
|
for (uint i = 0; i < inputSamplerCount; ++i) {
|
|
inputSamplers[i].resource = 0;
|
|
inputSamplers[i].sampler = sampler(i)->index_;
|
|
}
|
|
|
|
uint constBufferCount = 2;
|
|
|
|
constBuffers[0].index = 0;
|
|
constBuffers[1].index = 1;
|
|
|
|
uint inputResourceCount = 0;
|
|
|
|
uint uavCount = 0;
|
|
bool globalBound = false;
|
|
bool cbBound = false;
|
|
bool printfBound = false;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
constBuffers[constBufferCount++].index = arg->index_;
|
|
break;
|
|
case KernelArg::PointerGlobal:
|
|
if (nullDev().settings().useAliases_) {
|
|
if (!globalBound) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
if (uavArena_ != 0) {
|
|
uav[uavCount].offset = uavArena_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
globalBound = true;
|
|
}
|
|
else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
if (!cbBound) {
|
|
uav[uavCount].offset = cbId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
cbBound = true;
|
|
break;
|
|
case KernelArg::PrintfBufId:
|
|
if (!printfBound) {
|
|
uav[uavCount].offset = printfId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
printfBound = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if (!nullDev().settings().useAliases_ &&
|
|
(UavIdUndefined != uavRaw_) &&
|
|
!(flags() & PrintfOutput)) {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
else {
|
|
if (UavIdUndefined != uavRaw_) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
if (uavArena_ != 0) {
|
|
uav[uavCount].offset = uavArena_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
inputSamplers[inputSamplerCount].resource = 0;
|
|
inputSamplers[inputSamplerCount].sampler = arg->index_;
|
|
inputSamplerCount++;
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
if (arg->memory_.readOnly_) {
|
|
inputResources[inputResourceCount++] = arg->index_;
|
|
}
|
|
else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_2D;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
|
|
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
|
|
}
|
|
|
|
encoding.machine = nullDev().hwInfo()->machine_;
|
|
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
|
|
encoding.inputCount = inputResourceCount;
|
|
encoding.outputCount = outputCount;
|
|
encoding.outputs = outputs;
|
|
encoding.condOut = condOut;
|
|
encoding.earlyExit = earlyExit;
|
|
encoding.globalBuffersCount = globalCount;
|
|
encoding.globalBuffers = globalBuffers;
|
|
encoding.persistentBuffersCount = persistentCount;
|
|
encoding.persistentBuffers = persistentBuffers;
|
|
encoding.constBuffersCount = constBufferCount;
|
|
encoding.inputSamplerMapCount = inputSamplerCount;
|
|
encoding.symbolsCount = symbolCount;
|
|
encoding.symbols = symbols;
|
|
encoding.uavCount = uavCount;
|
|
|
|
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
|
|
|
|
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
|
|
|
|
amuABIMultiBinaryDestroy(amuBinary);
|
|
|
|
delete [] tmpMem;
|
|
delete [] encoding.progInfos;
|
|
|
|
return (success == 0) ? false : true;
|
|
}
|
|
|
|
void
|
|
Kernel::findLocalWorkSize(
|
|
size_t workDim,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const
|
|
{
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
size_t thrPerGrp;
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
|
|
((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if ((flags() & ImageEnable) &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) &&
|
|
((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) &&
|
|
((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if ((flags() & ImageWrite) ||
|
|
(thrPerGrp != nullDev().info().maxWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
}
|
|
else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
}
|
|
else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--);
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
}
|
|
else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
}
|
|
else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
}
|
|
else
|
|
{
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Kernel::setupProgramGrid(
|
|
VirtualGPU& gpu,
|
|
size_t workDim,
|
|
const amd::NDRange& glbWorkOffset,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize,
|
|
const amd::NDRange& groupOffset,
|
|
const amd::NDRange& glbWorkOffsetOrg,
|
|
const amd::NDRange& glbWorkSizeOrg
|
|
) const
|
|
{
|
|
// ABI is always in CB0
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* pGlobalSize = reinterpret_cast<uint*>
|
|
(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pLocalSize = reinterpret_cast<uint*>
|
|
(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pNumGroups = reinterpret_cast<uint*>
|
|
(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
|
|
uint* pGlobalOffset = reinterpret_cast<uint*>
|
|
(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint* pGroupOffset = reinterpret_cast<uint*>
|
|
(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint32_t* debugInfo = reinterpret_cast<uint*>
|
|
(cbBuf + DebugOffset * ConstBuffer::VectorSize);
|
|
uint* pNDRangeGlobalOffset = reinterpret_cast<uint*>
|
|
(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
|
|
// Check for 64-bit metadata
|
|
uint glbABIShift = (abi64Bit()) ? 1 : 0;
|
|
|
|
ProgramGrid* progGrid = &gpu.cal_.progGrid_;
|
|
|
|
// Finds local workgroup size
|
|
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
|
|
|
|
// Initialize the execution grid block and size/offset
|
|
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
|
|
pGlobalSize[3] = static_cast<uint>(workDim);
|
|
|
|
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
|
|
pLocalSize[3] = 0;
|
|
|
|
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
|
|
pNumGroups[3] = 0;
|
|
|
|
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
|
|
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
|
|
|
|
progGrid->gridBlock.width =
|
|
progGrid->gridBlock.height =
|
|
progGrid->gridBlock.depth = 1;
|
|
|
|
progGrid->gridSize.width =
|
|
progGrid->gridSize.height =
|
|
progGrid->gridSize.depth = 1;
|
|
|
|
progGrid->partialGridBlock.width =
|
|
progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 1;
|
|
|
|
bool partialGrid = false;
|
|
|
|
// Fill the right values, based on the application request
|
|
switch (workDim) {
|
|
case 3:
|
|
pLocalSize[2] =
|
|
progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
|
|
|
|
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
|
|
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
|
|
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
|
|
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
|
|
|
|
pGlobalOffset[2] = glbWorkOffset[2];
|
|
pGroupOffset[2] = groupOffset[2];
|
|
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
|
|
if (progGrid->partialGridBlock.depth != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.depth++;
|
|
pNumGroups[2]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.depth = lclWorkSize[2];
|
|
}
|
|
}
|
|
// Fall through to fill 2D and 1D dimensions...
|
|
case 2:
|
|
pLocalSize[1] =
|
|
progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
|
|
|
|
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
|
|
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
|
|
progGrid->gridSize.height /= progGrid->gridBlock.height;
|
|
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
|
|
|
|
pGlobalOffset[1] = glbWorkOffset[1];
|
|
pGroupOffset[1] = groupOffset[1];
|
|
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
|
|
if (progGrid->partialGridBlock.height != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.height++;
|
|
pNumGroups[1]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.height = lclWorkSize[1];
|
|
}
|
|
}
|
|
// Fall through to fill 1D dimension...
|
|
case 1:
|
|
pLocalSize[0] =
|
|
progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
|
|
|
|
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
|
|
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
|
|
progGrid->gridSize.width /= progGrid->gridBlock.width;
|
|
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
|
|
|
|
pGlobalOffset[0] = glbWorkOffset[0];
|
|
pGroupOffset[0] = groupOffset[0];
|
|
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
|
|
if (progGrid->partialGridBlock.width != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.width++;
|
|
pNumGroups[0]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.width = lclWorkSize[0];
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
LogWarning("Wrong dimensions. Force to 1x1x1!");
|
|
break;
|
|
}
|
|
|
|
if (!partialGrid) {
|
|
progGrid->partialGridBlock.width =
|
|
progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 0;
|
|
}
|
|
|
|
// Calculate the total number of workitems and workgroups
|
|
pGlobalOffset[3] = pGroupOffset[3] = 1;
|
|
for (uint i = 0; i < workDim; ++i) {
|
|
pGlobalOffset[3] *= pGlobalOffset[i];
|
|
pGroupOffset[3] *= pGroupOffset[i];
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
if (abi64Bit()) {
|
|
// Setup the debug info in constant buffer
|
|
reinterpret_cast<uint64_t*>(debugInfo)[1] =
|
|
gpu.printfDbg().bufOffset();
|
|
// Size in DWORDs
|
|
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[4] /= sizeof(uint32_t);
|
|
}
|
|
else {
|
|
// Setup the debug info in constant buffer
|
|
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
|
|
// Size in DWORDs
|
|
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[2] /= sizeof(uint32_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::initParameters()
|
|
{
|
|
size_t offset = 0;
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
|
|
// Initialize the arguments for the abstraction layer
|
|
if (arg->isCbNeeded()) {
|
|
desc.name_ = arg->name_.data();
|
|
desc.type_ = arg->type();
|
|
desc.size_ = arg->size(false);
|
|
desc.addressQualifier_ = arg->addressQualifier();
|
|
desc.accessQualifier_ = arg->accessQualifier();
|
|
desc.typeName_ = arg->typeName();
|
|
desc.typeQualifier_ = arg->typeQualifier();
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
}
|
|
}
|
|
|
|
// Report the allocated local memory size (emulated and hw)
|
|
if (hwLocalSize_ != 0) {
|
|
CondLog((dev().info().localMemSize_ < hwLocalSize_),
|
|
"Requested local size is bigger than reported");
|
|
workGroupInfo_.localMemSize_ = hwLocalSize_;
|
|
}
|
|
|
|
if (!createSignature(params)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::bindGlobalHwCb(
|
|
VirtualGPU& gpu,
|
|
VirtualGPU::GslKernelDesc* desc) const
|
|
{
|
|
bool result = true;
|
|
|
|
// Bind HW constant buffers used for the global data store
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (Program::HwConstBuffers::const_iterator it = gds.begin();
|
|
(it != gds.end() && result); ++it) {
|
|
uint idx = it->first;
|
|
result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::bindConstantBuffers(VirtualGPU& gpu) const
|
|
{
|
|
bool result = true;
|
|
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more CBs for arguments!");
|
|
|
|
// Upload the parameters to HW and bind all constant buffers
|
|
for (uint i = 0; i < numCb_; i++) {
|
|
ConstBuffer* cb = gpu.constBufs_[i];
|
|
result &= cb->uploadDataToHw(cbSizes_[i]) &&
|
|
bindResource(gpu, *cb, i, ConstantBuffer, i, NULL, cb->wrtOffset());
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::processMemObjects(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const_address params,
|
|
bool nativeMem) const
|
|
{
|
|
bool aliases = false;
|
|
VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency();
|
|
bool readCache = !internal_ && rwAttributes_ && !dev().settings().assumeAliases_;
|
|
|
|
// Mark the tracker with a new kernel,
|
|
// so we can avoid checks of the aliased objects
|
|
gpu.memoryDependency().newKernel();
|
|
|
|
// Check all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const KernelArg* arg = argument(i);
|
|
Memory* memory = NULL;
|
|
bool readOnly = false;
|
|
|
|
// Find if current argument is a buffer
|
|
if ((desc.type_ == T_POINTER) &&
|
|
(arg->type_ != KernelArg::PointerLocal) &&
|
|
(arg->type_ != KernelArg::PointerHwLocal)) {
|
|
if (nativeMem) {
|
|
memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>
|
|
(params + desc.offset_) != NULL) {
|
|
memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
|
|
(params + desc.offset_));
|
|
// Synchronize data with other memory instances if necessary
|
|
memory->syncCacheFromHost(gpu);
|
|
}
|
|
|
|
if (memory != NULL) {
|
|
readOnly = arg->memory_.readOnly_;
|
|
|
|
// Check if read cache optimization is possible
|
|
if (readCache) {
|
|
// Find if the same buffer was sent to other arguments (aliases)
|
|
for (size_t j = i + 1; (j < signature.numParameters()); ++j) {
|
|
const amd::KernelParameterDescriptor& descJ = signature.at(j);
|
|
const KernelArg* argJ = argument(j);
|
|
if (argJ->type_ == KernelArg::PointerGlobal) {
|
|
bool readOnlyJ = argJ->memory_.readOnly_;
|
|
Memory* memory2 = NULL;
|
|
if (nativeMem) {
|
|
memory2 = *reinterpret_cast<Memory* const*>
|
|
(params + descJ.offset_);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>
|
|
(params + descJ.offset_) != NULL) {
|
|
memory2 = dev().getGpuMemory(
|
|
*reinterpret_cast<amd::Memory* const*>
|
|
(params + descJ.offset_));
|
|
}
|
|
if (memory == memory2) {
|
|
if (!readOnly || !readOnlyJ) {
|
|
aliases = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Validate memory for a dependency in the queue
|
|
gpu.memoryDependency().validate(gpu, memory, readOnly);
|
|
}
|
|
}
|
|
}
|
|
|
|
return aliases & readCache;
|
|
}
|
|
|
|
bool
|
|
Kernel::loadParameters(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const_address params,
|
|
bool nativeMem) const
|
|
{
|
|
bool result = true;
|
|
uint i;
|
|
|
|
// Initialize local private ranges
|
|
if (!initLocalPrivateRanges(gpu)) {
|
|
return false;
|
|
}
|
|
|
|
if (!dev().settings().useAliases_ &&
|
|
(UavIdUndefined != uavRaw_) &&
|
|
(!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
// Bind a buffer for a dummy read
|
|
result = bindResource(gpu, *gpuMemory, 0,
|
|
ArgumentUavID, uavRaw_);
|
|
}
|
|
|
|
// Find all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (i = 0; i != signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
// Set current argument
|
|
if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
// Update the ring ranges and math constant
|
|
setLocalPrivateRanges(gpu);
|
|
|
|
result = bindConstantBuffers(gpu);
|
|
|
|
if (dev().settings().useAliases_) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBufferArena, uavArena_);
|
|
}
|
|
else if (flags() & PrivateFixed) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const
|
|
{
|
|
// 8xx workaround for the number of groups limit in HW
|
|
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
|
|
const ProgramGrid* programGrid = &gpu.cal()->progGrid_;
|
|
ConstBuffer* cb = gpu.numGrpCb();
|
|
assert((cb != NULL) && "Runtime must have the constant buffer");
|
|
|
|
uint32_t* memPtr = reinterpret_cast<uint32_t*>(cb->sysMemCopy());
|
|
memPtr[0] = programGrid->gridSize.width;
|
|
memPtr[1] = programGrid->gridSize.height;
|
|
memPtr[2] = programGrid->gridSize.depth;
|
|
memPtr[3] = 0;
|
|
|
|
memPtr[4] = programGrid->gridBlock.width;
|
|
memPtr[5] = programGrid->gridBlock.height;
|
|
memPtr[6] = programGrid->gridBlock.depth;
|
|
memPtr[7] = 0;
|
|
|
|
bool result = cb->uploadDataToHw(8 * sizeof(uint32_t));
|
|
if (result) {
|
|
gpu.setConstantBuffer(SC_INFO_CONSTANTBUFFER,
|
|
cb->gslResource(), static_cast<CALuint>(cb->wrtOffset()), cb->hbSize());
|
|
}
|
|
else {
|
|
assert(!"Runtime didn't upload data for NumGroup workaround");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (!gpu.runProgramGrid(*calEvent,
|
|
const_cast<ProgramGrid*>(&gpu.cal()->progGrid_), gpu.vmMems(), gpu.cal_.memCount_)) {
|
|
LogError("Failed to execute the program!");
|
|
return false;
|
|
}
|
|
|
|
// Unbind all resources
|
|
unbindResources(gpu, *calEvent, lastRun);
|
|
|
|
return true;
|
|
}
|
|
|
|
static size_t counter = 0;
|
|
void
|
|
Kernel::debug(VirtualGPU& gpu) const
|
|
{
|
|
std::fstream stubWrite;
|
|
address src = NULL;
|
|
if (!dev().heap()->isVirtual()) {
|
|
src = reinterpret_cast<address>
|
|
(const_cast<Resource&>(dev().globalMem()).map(&gpu));
|
|
}
|
|
|
|
std::cerr << "--- " << name_ << " ---" << std::endl;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
Memory* gpuMem = gpu.slots_[i].memory_;
|
|
std::stringstream fileName;
|
|
bool bufferObj =
|
|
((arg->type_ == KernelArg::PointerGlobal) ||
|
|
(arg->type_ == KernelArg::PointerConst) ||
|
|
(arg->type_ == KernelArg::PointerHwConst));
|
|
|
|
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
|
|
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
|
|
std::cerr.setf(std::ios::hex);
|
|
uint* location = reinterpret_cast<uint*>
|
|
(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
|
|
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
|
|
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() <<
|
|
"_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(
|
|
reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (((arg->type_ >= KernelArg::Image1D) &&
|
|
(arg->type_ <= KernelArg::Image3D)) ||
|
|
((src == NULL) && bufferObj)) {
|
|
Memory* resource = gpu.slots_[i].memory_;
|
|
void* memory = resource->map(&gpu);
|
|
uint* location = reinterpret_cast<uint*>(memory);
|
|
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() <<
|
|
"_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(
|
|
reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
resource->unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize());
|
|
gpu.constBufs_[i]->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
|
|
uint idx = it->first;
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
|
|
(it->second)->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (!dev().heap()->isVirtual()) {
|
|
const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::initConstBuffers()
|
|
{
|
|
bool result = true;
|
|
size_t i;
|
|
|
|
assert((numCb_ != 0) && "We have 0 constant buffers!");
|
|
|
|
// Allocate an array for CB sizes
|
|
cbSizes_ = new size_t[numCb_];
|
|
if (cbSizes_ == NULL) {
|
|
return false;
|
|
}
|
|
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
|
|
|
|
// CB0 is reserved for ABI data
|
|
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
|
|
|
|
// Find sizes of all constant buffers
|
|
for (i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
size_t size = arg->cbPos_ + arg->size(true);
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) *
|
|
ConstBuffer::VectorSize;
|
|
}
|
|
// Do we need a CB?
|
|
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
|
|
cbSizes_[arg->cbIdx_] = size;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::setInternalSamplers(VirtualGPU& gpu) const
|
|
{
|
|
for (uint i = 0; i < samplerSize(); ++i) {
|
|
const KernelArg* arg = sampler(i);
|
|
uint state = arg->cbPos_;
|
|
uint idx = arg->index_;
|
|
|
|
if (gpu.cal()->samplersState_[idx] != state) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::setArgument(
|
|
VirtualGPU& gpu,
|
|
uint idx,
|
|
const void* param,
|
|
size_t size,
|
|
bool nativeMem) const
|
|
{
|
|
bool result = true;
|
|
const KernelArg* arg;
|
|
address memory;
|
|
size_t argSize;
|
|
static const bool waitOnBusyEngine = true;
|
|
|
|
assert((idx < arguments_.size()) && "Param index is out of range!");
|
|
|
|
arg = argument(idx);
|
|
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
|
|
memory = gpu.cb(1)->sysMemCopy();
|
|
argSize = arg->size(true);
|
|
|
|
// Bind the global heap for emulation mode
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
case KernelArg::PointerGlobal:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
bool forceZeroOffset = false;
|
|
|
|
if (gpuMem == NULL) {
|
|
forceZeroOffset = true;
|
|
gpuMem = dev().getGpuMemory(dev().dummyPage());
|
|
}
|
|
uint64_t offset = gpuMem->pinOffset();
|
|
|
|
// Make sure the passed argument is a buffer object
|
|
if (!gpuMem->cal()->buffer_) {
|
|
LogError("The kernel buffer argument isn't a buffer object!");
|
|
return false;
|
|
}
|
|
|
|
if (arg->type_ == KernelArg::PointerHwConst) {
|
|
// Bind current memory object with the kernel
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
ArgumentConstBuffer, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
assert((offset == 0) && "No offset for HW CB");
|
|
// Add a fake offset to make sure (ptr != NULL) is TRUE
|
|
offset = 1;
|
|
}
|
|
else {
|
|
ResourceType type = ArgumentHeapBuffer;
|
|
|
|
// Check if kernel expects UAV binding
|
|
if (arg->memory_.uavBuf_) {
|
|
type = ArgumentBuffer;
|
|
}
|
|
else {
|
|
if (blitKernelHack_) {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
else {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, dev().globalMem(), 0,
|
|
GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Bind current memory object with the kernel
|
|
// Note: it's a fake binding, if the buffer is part of
|
|
// the global heap
|
|
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
|
|
// Update offset only if we bind HeapBuffer or
|
|
// it's global address space in UAV setup on SI+
|
|
if ((type == ArgumentHeapBuffer) || dev().settings().siPlus_) {
|
|
if (!blitKernelHack_) {
|
|
offset += gpuMem->hbOffset();
|
|
if (!forceZeroOffset) {
|
|
assert((offset != 0) && "Offset 0 with a real allocation!");
|
|
}
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
if (forceZeroOffset) {
|
|
offset = 0;
|
|
}
|
|
|
|
// Copy memory offset into the constant buffer
|
|
if (abi64Bit()) {
|
|
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
|
|
}
|
|
else {
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) =
|
|
static_cast<uint>(offset);
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
if (gpuMem == NULL) {
|
|
return false;
|
|
}
|
|
// Make sure the passed argument is an image object
|
|
if (gpuMem->cal()->buffer_) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
|
|
ResourceType resType = arg->memory_.readOnly_ ?
|
|
ArgumentImageRead : ArgumentImageWrite;
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
resType, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Copy image constants into the constant buffer
|
|
if (gpuMem->owner() != NULL) {
|
|
copyImageConstants(gpuMem->owner()->asImage(),
|
|
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
{
|
|
amd::Sampler* amdSampler =
|
|
*reinterpret_cast<amd::Sampler* const*>(param);
|
|
uint idx = arg->index_;
|
|
uint32_t state = amdSampler->state();
|
|
|
|
if (state != gpu.cal()->samplersState_[idx]) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
|
|
// Copy sampler state into the constant buffer
|
|
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
|
|
}
|
|
break;
|
|
case KernelArg::Counter:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
ArgumentCounter, idx, gpuMem)) {
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::PointerHwLocal:
|
|
{
|
|
// Calculate current offset in the local ring
|
|
uint offset = gpu.cal_.progGrid_.localSize;
|
|
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
|
|
|
|
offset = amd::alignUp(offset, arg->alignment_);
|
|
size_t memSize = *static_cast<const uintptr_t*>(param);
|
|
|
|
// Allocate new memory from the local ring
|
|
gpu.cal_.progGrid_.localSize += static_cast<uint>(memSize) + extra;
|
|
// Copy current local argument's offset into the CB
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
|
|
|
|
CondLog((gpu.cal_.progGrid_.localSize > dev().info().localMemSize_),
|
|
"Requested local size is bigger than reported!");
|
|
}
|
|
break;
|
|
case KernelArg::Float:
|
|
case KernelArg::Double:
|
|
case KernelArg::Char:
|
|
case KernelArg::UChar:
|
|
case KernelArg::Short:
|
|
case KernelArg::UShort:
|
|
case KernelArg::Int:
|
|
case KernelArg::UInt:
|
|
case KernelArg::Long:
|
|
case KernelArg::ULong:
|
|
if (size != argSize) {
|
|
LogWarning("Parameter's sizes are unmatched!");
|
|
}
|
|
// Fall through ...
|
|
case KernelArg::Struct:
|
|
case KernelArg::Union: {
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
|
|
for (uint i = 0; i < iter; ++i) {
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_ +
|
|
i * ConstBuffer::VectorSize),
|
|
reinterpret_cast<const char*>(param) +
|
|
i * KernelArg::VectorSizeLimit * specVec,
|
|
specVec * KernelArg::VectorSizeLimit);
|
|
}
|
|
}
|
|
else {
|
|
// Copy data into the CB
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
LogError("Unhandled argument's type!");
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const
|
|
{
|
|
// Initialize HW local
|
|
gpu.cal_.progGrid_.localSize = hwLocalSize_;
|
|
|
|
// Bind the global buffer if emulated local or private memory
|
|
// was allocated by the kernel
|
|
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) &&
|
|
(uavRaw_ != UavIdUndefined)) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the global buffer if emulated constant buffers are enabled
|
|
if (cbId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the printf buffer
|
|
if (printfId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
// Initialize the iterations count
|
|
gpu.cal_.iterations_ = 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const
|
|
{
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* data;
|
|
uint gridSize =
|
|
gpu.cal()->progGrid_.gridSize.width *
|
|
gpu.cal()->progGrid_.gridSize.height *
|
|
gpu.cal()->progGrid_.gridSize.depth;
|
|
uint blockSize =
|
|
gpu.cal()->progGrid_.gridBlock.width *
|
|
gpu.cal()->progGrid_.gridBlock.height *
|
|
gpu.cal()->progGrid_.gridBlock.depth;
|
|
|
|
//! \todo validate if the compiler still generates PrivateFixed
|
|
if (flags() & PrivateFixed) {
|
|
// Update private ring
|
|
data = reinterpret_cast<uint*>
|
|
(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
|
|
if (abi64Bit()) {
|
|
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
|
|
data[2] = 0;
|
|
data[3] = 0;
|
|
}
|
|
else {
|
|
data[0] = static_cast<uint>(gpuMemory->hbOffset());
|
|
data[1] = 0;
|
|
data[2] = data[3] = 0;
|
|
}
|
|
gpu.addVmMemory(gpuMemory);
|
|
}
|
|
|
|
// Copy the math lib constants
|
|
amd::Os::fastMemcpy(
|
|
(cbBuf + MathLibOffset * ConstBuffer::VectorSize),
|
|
MathLibConst, sizeof(MathLibConst));
|
|
|
|
// Update the offset to the global data
|
|
if (prog().glbData() != NULL) {
|
|
gpu.addVmMemory(prog().glbData());
|
|
uint64_t glbDataOffset = prog().glbData()->hbOffset();
|
|
if (abi64Bit()) {
|
|
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset *
|
|
ConstBuffer::VectorSize) = glbDataOffset;
|
|
}
|
|
else {
|
|
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset *
|
|
ConstBuffer::VectorSize) = static_cast<uint>(glbDataOffset);
|
|
}
|
|
}
|
|
|
|
// Split workload if it was requested
|
|
if ((gpu.cal_.iterations_ < 2) &&
|
|
gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
|
|
uint totalSize = gridSize * blockSize;
|
|
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
|
|
gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_,
|
|
(totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
|
|
}
|
|
}
|
|
|
|
// Initialize the number of iterations to the grid size
|
|
if (flags() & PrintfOutput) {
|
|
gpu.cal_.iterations_ = gridSize;
|
|
}
|
|
}
|
|
|
|
void
|
|
Kernel::setSampler(
|
|
VirtualGPU& gpu,
|
|
uint32_t state,
|
|
uint physUnit
|
|
) const
|
|
{
|
|
// All CAL sampler's parameters are in floats
|
|
float gslAddress = GSL_CLAMP_TO_BORDER;
|
|
float gslMinFilter = GSL_MIN_NEAREST;
|
|
float gslMagFilter = GSL_MAG_NEAREST;
|
|
|
|
state &= ~amd::Sampler::StateNormalizedCoordsMask;
|
|
|
|
// Program the sampler address mode
|
|
switch (state & amd::Sampler::StateAddressMask) {
|
|
case amd::Sampler::StateAddressRepeat:
|
|
gslAddress = GSL_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClampToEdge:
|
|
gslAddress = GSL_CLAMP_TO_EDGE;
|
|
break;
|
|
case amd::Sampler::StateAddressMirroredRepeat:
|
|
gslAddress = GSL_MIRRORED_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClamp:
|
|
case amd::Sampler::StateAddressNone:
|
|
default:
|
|
break;
|
|
}
|
|
state &= ~amd::Sampler::StateAddressMask;
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
|
|
|
|
// Program texture filter mode
|
|
if (state == amd::Sampler::StateFilterLinear) {
|
|
gslMinFilter = GSL_MIN_LINEAR;
|
|
gslMagFilter = GSL_MAG_LINEAR;
|
|
}
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
|
|
}
|
|
|
|
bool
|
|
Kernel::bindResource(
|
|
VirtualGPU& gpu,
|
|
const Resource& resource,
|
|
uint paramIdx,
|
|
ResourceType type,
|
|
uint physUnit,
|
|
Memory* memory,
|
|
size_t offset) const
|
|
{
|
|
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
|
|
|
|
// Find the original resource name from the IL program
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavRaw_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case GlobalBufferArena:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavArena_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentCbID:
|
|
if (gpu.state_.boundCb_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundCb_ = true;
|
|
physUnit = cbId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentPrintfID:
|
|
if (gpu.state_.boundPrintf_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundPrintf_ = true;
|
|
physUnit = printfId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentHeapBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageRead:
|
|
case ArgumentImageWrite:
|
|
case ArgumentConstBuffer:
|
|
case ArgumentCounter:
|
|
// Early exit if resource is bound already
|
|
if (gpu.slots_[paramIdx].state_.bound_) {
|
|
return true;
|
|
}
|
|
|
|
// Associate resource with the slot
|
|
gpu.slots_[paramIdx].memory_ = memory;
|
|
|
|
// Mark resource as bound
|
|
gpu.slots_[paramIdx].state_.bound_ = true;
|
|
|
|
if (type == ArgumentCounter) {
|
|
GpuEvent calEvent;
|
|
|
|
// Bind memory with atomic counter
|
|
gpu.bindAtomicCounter(argument(paramIdx)->index_,
|
|
memory->gslResource());
|
|
|
|
// Copy the counter value into GDS
|
|
gpu.syncAtomicCounter(calEvent, argument(paramIdx)->index_, false);
|
|
|
|
// Mark resource as busy
|
|
memory->setBusy(gpu, calEvent);
|
|
return true;
|
|
}
|
|
else if (type == ArgumentHeapBuffer) {
|
|
// We return here, since we just have to bind the global heap
|
|
return true;
|
|
}
|
|
else if (type == ArgumentConstBuffer) {
|
|
gpu.slots_[paramIdx].state_.constant_ = true;
|
|
}
|
|
break;
|
|
case ArgumentUavID:
|
|
case ConstantBuffer:
|
|
break;
|
|
default:
|
|
LogPrintfError("Unspecified argument type ()!", type);
|
|
return false;
|
|
}
|
|
|
|
gslMemObject gslMem = NULL;
|
|
// Use global address space on SI+ for UAV setup
|
|
if (dev().settings().siPlus_ &&
|
|
((type == ArgumentBuffer) || (type == ArgumentCbID) ||
|
|
(type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
|
|
!blitKernelHack_) {
|
|
gslMem = dev().heap()->resource().gslResource();
|
|
}
|
|
else {
|
|
gslMem = resource.gslResource();
|
|
}
|
|
|
|
// Associate memory with the physical unit, the actual binding
|
|
bool result = true;
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
case GlobalBufferArena:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageWrite:
|
|
case ArgumentUavID:
|
|
case ArgumentCbID:
|
|
case ArgumentPrintfID:
|
|
if (type == ArgumentImageWrite) {
|
|
uavType = GSL_UAV_TYPE_TYPED;
|
|
}
|
|
else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
}
|
|
if (gpu.cal_.uavs_[physUnit] != gslMem) {
|
|
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
gpu.cal_.uavs_[physUnit] = gslMem;
|
|
}
|
|
else if (!dev().settings().siPlus_)
|
|
{
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
}
|
|
break;
|
|
case ConstantBuffer:
|
|
case ArgumentConstBuffer:
|
|
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
|
|
result = gpu.setConstantBuffer(physUnit,
|
|
gslMem, offset, resource.hbSize());
|
|
gpu.cal_.constBuffers_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ArgumentImageRead:
|
|
if (gpu.cal_.readImages_[physUnit] != gslMem) {
|
|
result = gpu.setInput(physUnit, gslMem);
|
|
gpu.cal_.readImages_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
default:
|
|
result = false;
|
|
assert(false);
|
|
break;
|
|
}
|
|
if (!result) {
|
|
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
|
|
return false;
|
|
}
|
|
|
|
if ((type == GlobalBuffer) && dev().settings().useAliases_) {
|
|
if (uavArena_ != 0) {
|
|
if (!setupArenaAliases(gpu, resource)) {
|
|
return false;
|
|
}
|
|
if ((uavArena_ != physUnit) &&
|
|
(gpu.cal_.uavs_[uavArena_] != gslMem)) {
|
|
gpu.cal_.uavs_[uavArena_] = gslMem;
|
|
// Associate memory with the name
|
|
if (!gpu.setUAVBuffer(uavArena_, gslMem,
|
|
GSL_UAV_TYPE_TYPELESS)) {
|
|
LogError("calCtxSetMem failed!");
|
|
return false;
|
|
}
|
|
gpu.setUAVChannelOrder(uavArena_, gslMem);
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Kernel::unbindResources(
|
|
VirtualGPU& gpu,
|
|
GpuEvent calEvent,
|
|
bool lastRun) const
|
|
{
|
|
// Make sure unbind occurs on the last run, in case the execution had a split
|
|
if (lastRun) {
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
if (gpu.slots_[i].state_.bound_) {
|
|
GpuEvent calEventTmp = calEvent;
|
|
|
|
if (KernelArg::Counter == argument(i)->type_) {
|
|
// Copy the counter value from GDS
|
|
gpu.syncAtomicCounter(calEventTmp, argument(i)->index_, true);
|
|
}
|
|
else if (!(gpu.slots_[i].state_.constant_ ||
|
|
argument(i)->memory_.readOnly_)) {
|
|
// Signal the abstraction layer that GPU memory is dirty
|
|
if (gpu.slots_[i].memory_->owner() != NULL) {
|
|
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
|
|
}
|
|
}
|
|
// Mark resource as busy
|
|
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
|
|
|
|
gpu.slots_[i].state_.value_ = 0;
|
|
}
|
|
}
|
|
|
|
// Unbind the global buffer
|
|
gpu.state_.boundGlobal_ = false;
|
|
|
|
// Unbind the constant buffer
|
|
gpu.state_.boundCb_ = false;
|
|
|
|
// Unbind the pritnf buffer
|
|
gpu.state_.boundPrintf_ = false;
|
|
}
|
|
|
|
// Mark CB busy
|
|
for (uint i = 0; i < numCb_; ++i) {
|
|
gpu.constBufs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// 8xx workaround for the number of groups limit in HW
|
|
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
|
|
ConstBuffer* cb = gpu.numGrpCb();
|
|
assert((cb != NULL) && "Runtime must have the constant buffer");
|
|
cb->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// Set the event object for the scratch buffer
|
|
if (workGroupInfo()->scratchRegs_ > 0) {
|
|
for (uint i = 0; i < dev().scratch(gpu.hwRing())->memObjs_.size(); ++i) {
|
|
dev().scratch(gpu.hwRing())->memObjs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::setupArenaAliases(VirtualGPU& gpu, const Resource& resource) const
|
|
{
|
|
const static uint ScArenaUavShortId = 9;
|
|
const static uint ScArenaUavByteId = 10;
|
|
|
|
Resource* buf = &(const_cast<Resource&>(resource));
|
|
Resource* alias;
|
|
gslMemObject gslMem = NULL;
|
|
//
|
|
// byte view
|
|
//
|
|
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R8I);
|
|
if (NULL == alias) {
|
|
return false;
|
|
}
|
|
gslMem = alias->gslResource();
|
|
if (gpu.cal_.uavs_[ScArenaUavByteId] != gslMem) {
|
|
if (!gpu.setUAVBuffer(ScArenaUavByteId,
|
|
gslMem, GSL_UAV_TYPE_TYPELESS)) {
|
|
return false;
|
|
}
|
|
gpu.cal_.uavs_[ScArenaUavByteId] = gslMem;
|
|
}
|
|
|
|
//
|
|
// short view
|
|
//
|
|
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R16I);
|
|
if (NULL == alias) {
|
|
return false;
|
|
}
|
|
bool result = true;
|
|
gslMem = alias->gslResource();
|
|
if (gpu.cal_.uavs_[ScArenaUavShortId] != gslMem) {
|
|
result = gpu.setUAVBuffer(ScArenaUavShortId,
|
|
gslMem, GSL_UAV_TYPE_TYPELESS);
|
|
gpu.cal_.uavs_[ScArenaUavShortId] = gslMem;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
void
|
|
Kernel::copyImageConstants(
|
|
const amd::Image* amdImage,
|
|
ImageConstants* imageData
|
|
) const
|
|
{
|
|
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
|
|
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
|
|
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
|
|
imageData->dataType_ =
|
|
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
|
|
|
|
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
|
|
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
|
|
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
|
|
imageData->channelOrder_ =
|
|
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
|
|
}
|
|
|
|
union MetadataVersion {
|
|
struct {
|
|
uint64_t revision_: 16; //!< LLVM metadata revision
|
|
uint64_t minorVersion_: 16; //!< LLVM metadata minor verison
|
|
uint64_t majorVersion_: 16; //!< LLVM metadata major version
|
|
};
|
|
uint64_t value_;
|
|
MetadataVersion(uint mj, uint mi, uint rev): value_(0)
|
|
{
|
|
revision_ = rev;
|
|
minorVersion_ = mi;
|
|
majorVersion_ = mj;
|
|
}
|
|
MetadataVersion(): value_(0) {}
|
|
};
|
|
|
|
//! Version of metadata with buffer attributes
|
|
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
|
|
|
|
//! Version of metadata with type qualifiers
|
|
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
|
|
|
|
bool
|
|
NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
|
|
{
|
|
// Initialize workgroup info
|
|
workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_;
|
|
MetadataVersion mdVersion;
|
|
|
|
// Find first tag
|
|
size_t pos = metaData.find(";");
|
|
|
|
// Loop through all provided program arguments
|
|
while (pos != std::string::npos) {
|
|
KernelArg arg;
|
|
|
|
if (!expect(metaData, &pos, ";")) {
|
|
break;
|
|
}
|
|
|
|
arg.type_ = KernelArg::None;
|
|
|
|
// Loop through all available metadata types
|
|
for (uint i = 0; i < ArgStateTotal; ++i) {
|
|
uint tmpValue;
|
|
// Find the name tag
|
|
if (expect(metaData, &pos, ArgState[i].typeName_)) {
|
|
switch (ArgState[i].type_) {
|
|
case KernelArg::None:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Reflection: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
|
|
LogWarning("Couldn't get the argument type!");
|
|
return false;
|
|
}
|
|
}
|
|
continue;
|
|
case KernelArg::ConstArg: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
continue;
|
|
case KernelArg::Grouping:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSize_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WrkgrpSize: {
|
|
uint temp;
|
|
// Read the workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.size_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Wavefront:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::UavId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
return false;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &cbId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::PrintfBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &printfId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::MetadataVersion:
|
|
// Read metadata version
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.majorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.minorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.revision_ = tmpValue;
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::GroupingHint:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size hint
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSizeHint_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::VecTypeHint:
|
|
{
|
|
std::string temp;
|
|
// Read the compile vector type hint
|
|
if (!getstring(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile vector type hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileVecTypeHint_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
char argName[256];
|
|
// Save the argument type
|
|
arg.type_ = ArgState[i].type_;
|
|
|
|
// Check if we should expect the name
|
|
if (ArgState[i].name_) {
|
|
// Read the parameter's name
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.name_ = argName;
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.location_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the resource data type
|
|
if (ArgState[i].resType_) {
|
|
uint k;
|
|
// Search for the data type
|
|
for (k = 0; k < DataTypeTotal; k++) {
|
|
if (expect(metaData, &pos, DataType[k].tagName_)) {
|
|
arg.dataType_ = DataType[k].type_;
|
|
if (arg.type_ == KernelArg::Image) {
|
|
flags_ |= ImageEnable;
|
|
if (expect(metaData, &pos, "RO:")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "RW:")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
else if (expect(metaData, &pos, "WO:")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
}
|
|
else if (arg.type_ == KernelArg::Value) {
|
|
arg.type_ = DataType[k].type_;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (k == DataTypeTotal) {
|
|
LogWarning("We couldn't find the argument's type.");
|
|
if ((arg.type_ == KernelArg::Value) ||
|
|
!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
//! @todo temporary condition
|
|
if ((arg.type_ == KernelArg::Opaque) ||
|
|
(arg.type_ == KernelArg::Sampler)) {
|
|
assert(false);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the data size
|
|
if (ArgState[i].size_) {
|
|
uint tmpData;
|
|
// Read the data size
|
|
if (!getuint(metaData, &pos, &tmpData)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (arg.type_ == KernelArg::Image) {
|
|
arg.type_ = arg.dataType_;
|
|
arg.index_ = tmpData;
|
|
}
|
|
else {
|
|
arg.size_ = tmpData;
|
|
}
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Counter) {
|
|
// Read a counter index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a counter index!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect a resource index
|
|
if (ArgState[i].cbIdx_) {
|
|
// Read resource index
|
|
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
|
|
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
|
|
numCb_ = arg.cbIdx_;
|
|
}
|
|
}
|
|
// Check if we should expect the CB offset
|
|
if (ArgState[i].cbPos_) {
|
|
// Read position in the constant buffer
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check if we should expect the buffer type
|
|
if (ArgState[i].buf_) {
|
|
// Read the buffer type
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.buf_ = argName;
|
|
|
|
for (uint k = 0; k < BufTypeTotal; ++k) {
|
|
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
|
|
// Update the parameter type
|
|
arg.type_ = BufType[k].type_;
|
|
// Check if we should expect a buffer index
|
|
if (BufType[k].number_) {
|
|
// Read a buffer index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the required alignment
|
|
if (BufType[k].alignment_) {
|
|
// Read data alignment
|
|
if (!getuint(metaData, &pos, &arg.alignment_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the buffer's attribute
|
|
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) &&
|
|
BufType[k].attribute_) {
|
|
if (expect(metaData, &pos, "RO")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "RW")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "WO")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
}
|
|
}
|
|
// Check for the type qualifier
|
|
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) &&
|
|
BufType[k].attribute_) {
|
|
uint tmp;
|
|
pos += 1;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get volatile type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get restrict type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Find multiple UAV references
|
|
switch (arg.type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
case KernelArg::UavId:
|
|
uavRefCount[arg.index_]++;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// Check if this argument will be passed in constant buffer
|
|
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
// Serach for the passed by value sampler
|
|
for (uint i = 0; i < argSize(); ++i) {
|
|
KernelArg* value = arguments_[i];
|
|
if (0 == value->name_.compare(arg.name_)) {
|
|
value->type_ = arg.type_;
|
|
value->index_ = arg.index_;
|
|
value->location_ = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
KernelArg* argument = new KernelArg(arg);
|
|
if (argument != NULL) {
|
|
addArgument(argument);
|
|
}
|
|
else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// Check if we have a pre-defined sampler
|
|
else if (arg.type_ == KernelArg::Sampler) {
|
|
KernelArg* sampler = new KernelArg(arg);
|
|
if (sampler != NULL) {
|
|
addSampler(sampler);
|
|
}
|
|
else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Next argument
|
|
pos = metaData.find(";", pos);
|
|
}
|
|
|
|
// Find arguments that will require a reallocation
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
// Check if can't use a dedicated UAV,
|
|
// so realloc memory in the heap
|
|
arg->memory_.realloc_ = isRealloc();
|
|
|
|
if (nullDev().settings().useAliases_) {
|
|
if (uavRefCount[arg->index_] > 1) {
|
|
// Multiple accesses, assume this is the heap
|
|
uavRaw_ = arg->index_;
|
|
}
|
|
// Mark argument as an UAV buffer if no aliases or it's not arena
|
|
else if (arg->index_ != VirtualGPU::UavArena) {
|
|
arg->memory_.uavBuf_ = true;
|
|
}
|
|
}
|
|
else {
|
|
arg->memory_.uavBuf_ = true;
|
|
}
|
|
break;
|
|
case KernelArg::PointerHwConst:
|
|
arg->memory_.realloc_ = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if (!nullDev().settings().useAliases_ ||
|
|
(arg->index_ != VirtualGPU::UavArena)) {
|
|
uavRaw_ = arg->index_;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// If argument marked with the const qualifier, then overwrite
|
|
// Read-Write attributes, since compiler doesn't mark it properly
|
|
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
|
|
arg->memory_.readOnly_ = 1;
|
|
arg->memory_.readWrite_ = 0;
|
|
arg->memory_.writeOnly_ = 0;
|
|
}
|
|
}
|
|
|
|
if (!nullDev().settings().useAliases_ &&
|
|
(uavRaw_ != UavIdUndefined) &&
|
|
!(flags() & PrintfOutput)) {
|
|
// Find if default UAV is already assigned to an argument
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (uavRaw_ == arg->index_) {
|
|
uavRaw_ = UavIdUndefined;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
// There is always 1 constant buffer, associated with the kernel
|
|
numCb_++;
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more than max CBs for arguments!");
|
|
|
|
// Limit workgroup size if requested
|
|
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
|
|
size_t temp = 1;
|
|
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
|
|
for (uint j = 0; j < 3; ++j) {
|
|
if (workGroupInfo()->compileSize_[j] != 0) {
|
|
temp *= workGroupInfo_.compileSize_[j];
|
|
}
|
|
}
|
|
// Report a compilation error if requested compile size doesn't
|
|
// match the required workgroup size
|
|
if (workGroupInfo()->size_ < temp) {
|
|
char str[8];
|
|
intToStr(workGroupInfo_.size_, str, 8);
|
|
buildError_ = CL_OUT_OF_RESOURCES;
|
|
buildLog_ +=
|
|
"Error: Requested compile size is bigger than the required workgroup size of ";
|
|
buildLog_ += str;
|
|
buildLog_ += " elements\n";
|
|
LogError(buildLog().c_str());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Read/Write attributes are provided in metadata
|
|
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
|
|
rwAttributes_ = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::validateMemory(uint idx, amd::Memory* amdMem) const
|
|
{
|
|
// Check if memory doesn't require reallocation
|
|
bool noRealloc = (!argument(idx)->memory_.realloc_ ||
|
|
amdMem->reallocedDeviceMemory(&dev()));
|
|
|
|
return noRealloc;
|
|
}
|
|
|
|
inline static HSAIL_ARG_TYPE
|
|
GetHSAILArgType(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_QUEUE:
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
case ARG_TYPE_VALUE:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t
|
|
GetHSAILArgAlignment(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return argInfo->arg.pointer.align;
|
|
default:
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER
|
|
GetHSAILAddrQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
}
|
|
else if ((argInfo->type == ARG_TYPE_IMAGE) ||
|
|
(argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE
|
|
GetHSAILDataType(const aclArgData* argInfo)
|
|
{
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
}
|
|
else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int
|
|
GetHSAILArgSize(const aclArgData *argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_VALUE:
|
|
switch (GetHSAILDataType(argInfo)) {
|
|
case HSAIL_DATATYPE_B1:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B8:
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B16:
|
|
case HSAIL_DATATYPE_U16:
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_F16:
|
|
return 2;
|
|
case HSAIL_DATATYPE_B32:
|
|
case HSAIL_DATATYPE_U32:
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_F32:
|
|
return 4;
|
|
case HSAIL_DATATYPE_B64:
|
|
case HSAIL_DATATYPE_U64:
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_F64:
|
|
return 8;
|
|
case HSAIL_DATATYPE_STRUCT:
|
|
return argInfo->arg.value.numElements;
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_POINTER:
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t
|
|
GetOclType(const aclArgData* argInfo)
|
|
{
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
|
|
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
|
|
{ T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
|
|
{ T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
|
|
{ T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
|
|
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
|
|
};
|
|
|
|
uint sizeType;
|
|
if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return T_QUEUE;
|
|
}
|
|
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
|
|
return T_POINTER;
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
sizeType = 0;
|
|
break;
|
|
case DATATYPE_i16:
|
|
case DATATYPE_u16:
|
|
sizeType = 1;
|
|
break;
|
|
case DATATYPE_i32:
|
|
case DATATYPE_u32:
|
|
sizeType = 2;
|
|
break;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
sizeType = 3;
|
|
break;
|
|
case DATATYPE_f16:
|
|
case DATATYPE_f32:
|
|
sizeType = 4;
|
|
break;
|
|
case DATATYPE_f64:
|
|
sizeType = 5;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
switch (argInfo->arg.value.numElements) {
|
|
case 1: return ClkValueMapType[sizeType][0];
|
|
case 2: return ClkValueMapType[sizeType][1];
|
|
case 3: return ClkValueMapType[sizeType][2];
|
|
case 4: return ClkValueMapType[sizeType][3];
|
|
case 8: return ClkValueMapType[sizeType][4];
|
|
case 16: return ClkValueMapType[sizeType][5];
|
|
default: return T_VOID;
|
|
}
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
}
|
|
else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier
|
|
GetOclAddrQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
//default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier
|
|
GetOclAccessQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
switch (argInfo->arg.image.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier
|
|
GetOclTypeQual(const aclArgData* argInfo)
|
|
{
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int
|
|
GetOclSize(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER: return sizeof(void *);
|
|
case ARG_TYPE_VALUE:
|
|
//! \note OCL 6.1.5. For 3-component vector data types,
|
|
//! the size of the data type is 4 * sizeof(component).
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_ERROR:
|
|
default: return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE: return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER: return sizeof(cl_sampler);
|
|
case ARG_TYPE_QUEUE: return sizeof(cl_command_queue);
|
|
default: return -1;
|
|
}
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initArgList(const aclArgData* aclArg)
|
|
{
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += ExtraArguments;
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(aclArg);
|
|
desc.addressQualifier_ = GetOclAddrQual(aclArg);
|
|
desc.accessQualifier_ = GetOclAccessQual(aclArg);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
}
|
|
else {
|
|
desc.size_ = GetOclSize(aclArg);
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initHsailArgs(const aclArgData* aclArg)
|
|
{
|
|
int offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += ExtraArguments;
|
|
|
|
// Iterate through the each kernel argument
|
|
for (; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->offset_ = offset;
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
// If vector of args we add additional arguments to flatten it out
|
|
arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) &&
|
|
(aclArg->arg.value.data != DATATYPE_struct)) ?
|
|
aclArg->arg.value.numElements : 1;
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
offset += GetHSAILArgSize(aclArg);
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
|
|
{
|
|
PrintfInfo info;
|
|
uint index = 0;
|
|
for (; aclPrintf->struct_size != 0; aclPrintf++) {
|
|
index = aclPrintf->ID;
|
|
if (printf_.size() <= index) {
|
|
printf_.resize(index + 1);
|
|
}
|
|
info.fmtString_ = aclPrintf->fmtStr;
|
|
info.fmtString_ += "\n";
|
|
uint32_t *tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
|
|
for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) {
|
|
info.arguments_.push_back(*tmp_ptr);
|
|
}
|
|
printf_[index] = info;
|
|
info.arguments_.clear();
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name,
|
|
HSAILProgram* prog,
|
|
std::string compileOptions)
|
|
: device::Kernel(name)
|
|
, compileOptions_(compileOptions)
|
|
, dev_(prog->dev())
|
|
, prog_(*prog)
|
|
, index_(0)
|
|
, code_(NULL)
|
|
, hwMetaData_(NULL)
|
|
{
|
|
hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel()
|
|
{
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
|
|
delete [] hwMetaData_;
|
|
|
|
delete code_;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::init()
|
|
{
|
|
acl_error error;
|
|
//compile kernel down to ISA
|
|
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
|
|
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Get the ISA out
|
|
size_t size_isa;
|
|
void* shader_isa = NULL;
|
|
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(),
|
|
options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize");
|
|
return false;
|
|
}
|
|
|
|
shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
|
|
prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
|
|
if (shader_isa == NULL) {
|
|
LogError("Failed find the ISA");
|
|
return false;
|
|
}
|
|
|
|
aqlCreateHWInfo(shader_isa, size_isa);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (NULL == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete [] aclArgList;
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
|
|
workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().getAttribs().wavefrontSize;
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ =
|
|
workGroupInfo_.compileSize_[0] *
|
|
workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
}
|
|
else {
|
|
workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (NULL == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
|
|
&sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete [] aclPrintfList;
|
|
}
|
|
|
|
size_t sizeOfDevice;
|
|
bool hasKernelEnqueue = false;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
|
|
&hasKernelEnqueue, &sizeOfDevice);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = hasKernelEnqueue;
|
|
|
|
int index = -1;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_KERNEL_INDEX, openClKernelName.c_str(),
|
|
&index, &sizeOfDevice);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = static_cast<uint>(index);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
|
|
{
|
|
// Check if memory doesn't require reallocation
|
|
bool noRealloc = true;
|
|
//amdMem->reallocedDeviceMemory(&dev()));
|
|
|
|
return noRealloc;
|
|
}
|
|
|
|
const Device&
|
|
HSAILKernel::dev() const
|
|
{
|
|
return reinterpret_cast<const Device&>(dev_);
|
|
}
|
|
|
|
const HSAILProgram&
|
|
HSAILKernel::prog() const
|
|
{
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
void
|
|
HSAILKernel::findLocalWorkSize(
|
|
size_t workDim,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const
|
|
{
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
size_t thrPerGrp;
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
|
|
((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if (flags_.imageEna_ &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) &&
|
|
((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) &&
|
|
((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if (flags_.imageWriteEna_ ||
|
|
(thrPerGrp != dev().info().maxWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
}
|
|
else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
}
|
|
else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--);
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
}
|
|
else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
}
|
|
else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
}
|
|
else
|
|
{
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static void
|
|
WriteAqlArg(
|
|
unsigned char** dst,//!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
)
|
|
{
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
}
|
|
else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
HsaAqlDispatchPacket*
|
|
HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const amd::NDRangeContainer& sizes,
|
|
const_address parameters,
|
|
bool nativeMem,
|
|
uint64_t vmDefQueue,
|
|
uint64_t* vmParentWrap,
|
|
std::vector<const Resource*>& memList) const
|
|
{
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
// The HLC generates 3 additional arguments for the global offsets
|
|
//and fourth argument is the printf_buffer pointer
|
|
size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
|
|
for (uint i = 0; i < sizes.dimensions(); ++i) {
|
|
offsetSize[i] = sizes.offset()[i];
|
|
}
|
|
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
offsetSize[4] = vmDefQueue;
|
|
offsetSize[5] = *vmParentWrap;
|
|
memList.push_back(cb);
|
|
}
|
|
|
|
// Check if the kernel may have printf output
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
|
|
offsetSize[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
// Find all parameters for the current kernel
|
|
for (uint i = 0; i != signature.numParameters(); ++i) {
|
|
const HSAILKernel::Argument* arg = argument(i);
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const_address paramaddr = parameters + desc.offset_;
|
|
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_POINTER:
|
|
// If it is a global pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
|
|
|
|
Memory* gpuMem = NULL;
|
|
amd::Memory* mem = NULL;
|
|
|
|
if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
|
|
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
|
|
if (mem != NULL) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
memList.push_back(gpuMem);
|
|
}
|
|
else {
|
|
return NULL;
|
|
}
|
|
break;
|
|
}
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
|
|
}
|
|
else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem != NULL) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
if (gpuMem == NULL) {
|
|
WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
|
|
break;
|
|
}
|
|
|
|
//! @todo 64 bit isn't supported with 32 bit binary
|
|
uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
|
|
WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) &&
|
|
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
memList.push_back(gpuMem);
|
|
}
|
|
// If it is a local pointer
|
|
else {
|
|
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
|
|
"Unsupported address type");
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
|
|
ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
// Special case for structrues
|
|
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
|
|
memList.push_back(cb);
|
|
}
|
|
else {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr,
|
|
arg->numElem_ * arg->size_, arg->size_);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = NULL;
|
|
amd::Memory* mem = NULL;
|
|
if (nativeMem) {
|
|
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
|
|
}
|
|
else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem == NULL) {
|
|
LogError( "The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
if (dev().settings().hsailDirectSRD_) {
|
|
// Image arguments are of size 48 bytes and aligned to 16 bytes
|
|
WriteAqlArg(&aqlArgBuf, image->hwState(),
|
|
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
|
|
}
|
|
else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) &&
|
|
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
memList.push_back(image);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
const amd::Sampler* sampler =
|
|
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
|
|
const Sampler* gpuSampler = static_cast<Sampler*>
|
|
(sampler->getDeviceSampler(dev()));
|
|
if (dev().settings().hsailDirectSRD_) {
|
|
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
|
|
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
|
}
|
|
else {
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
const amd::DeviceQueue* queue =
|
|
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported address type ");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return NULL;
|
|
}
|
|
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
uint argBufSize = amd::alignUp(
|
|
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
|
|
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
|
|
gpu.cb(0)->sysMemCopy() + argBufSize);
|
|
|
|
amd::NDRange local(sizes.local());
|
|
amd::NDRange global(sizes.global());
|
|
|
|
// Initialize the Global, Local and Offset values
|
|
aqlPkt->dimensions = sizes.dimensions();
|
|
// Initialize the work grid parameter
|
|
for (uint idx = 0; idx < 3; idx++) {
|
|
aqlPkt->grid_size[idx] = 1;
|
|
aqlPkt->workgroup_size[idx] = 1;
|
|
}
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
findLocalWorkSize(aqlPkt->dimensions, global, local);
|
|
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
|
|
aqlPkt->grid_size[idx] = global[idx];
|
|
aqlPkt->workgroup_size[idx] = local[idx];
|
|
}
|
|
|
|
// Initialize if dispatch should enable profiling
|
|
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
|
|
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
|
|
aqlPkt->private_segment_size_bytes = spillSegSize();
|
|
|
|
// Initialize cache flush configuration for the dispatch
|
|
//! @todo Currently not used in emulation
|
|
aqlPkt->barrier = 1;
|
|
aqlPkt->release_fence_scope = 1;
|
|
aqlPkt->acquire_fence_scope = 2;
|
|
aqlPkt->invalidate_instruction_cache = 1;
|
|
|
|
ConstBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
aqlPkt->kernel_arg_address = argList;
|
|
memList.push_back(cb);
|
|
memList.push_back(gpuAqlCode());
|
|
if (NULL != prog().globalStore()) {
|
|
memList.push_back(prog().globalStore());
|
|
}
|
|
if (cpuAqlCode_->enable_sgpr_queue_ptr) {
|
|
memList.push_back(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource) {
|
|
dev().srds().fillResourceList(memList);
|
|
}
|
|
|
|
return aqlPkt;
|
|
}
|
|
|
|
} // namespace gpu
|