bf32cddc03
EPR #304775 - fixed the bug 9838. The svm pointer in the arugment list needs to be tracked to make sure all operation of resource has been finished before we dispatch kernel. code review: http://ocltc.amd.com/reviews/r/5200/ precheckin: http://ocltc.amd.com:8111/viewModification.html?modId=35125&personal=true&buildTypeId=&tab=vcsModificationTests Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#257 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#322 edit
4051 строка
137 KiB
C++
4051 строка
137 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpublit.hpp"
|
|
#include "device/gpu/gpuconstbuf.hpp"
|
|
#include "device/gpu/gpusched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
#include "SCShadersR678XXCommon.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
|
|
namespace gpu {
|
|
|
|
const MetaDataConst ArgState[ArgStateTotal] =
|
|
{
|
|
// Note: the order is important
|
|
// Name Type Properties
|
|
// Kernel description (special properties)
|
|
{ "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } },
|
|
{ "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } },
|
|
{ "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
// Kernel basic types
|
|
{ "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } },
|
|
{ "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } },
|
|
{ "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } },
|
|
{ "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } },
|
|
{ "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } },
|
|
{ "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "constarg:", KernelArg::ConstArg, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
{ "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } },
|
|
};
|
|
|
|
const DataTypeConst DataType[] =
|
|
{
|
|
{ "i8:", KernelArg::Char, },
|
|
{ "i16:", KernelArg::Short, },
|
|
{ "i32:", KernelArg::Int, },
|
|
{ "i64:", KernelArg::Long, },
|
|
{ "u8:", KernelArg::UChar, },
|
|
{ "u16:", KernelArg::UShort, },
|
|
{ "u32:", KernelArg::UInt, },
|
|
{ "u64:", KernelArg::ULong, },
|
|
{ "float:", KernelArg::Float, },
|
|
{ "double:", KernelArg::Double, },
|
|
{ "struct:", KernelArg::Struct, },
|
|
{ "union:", KernelArg::Union, },
|
|
{ "1D:", KernelArg::Image1D, },
|
|
{ "2D:", KernelArg::Image2D, },
|
|
{ "3D:", KernelArg::Image3D, },
|
|
{ "1DB:", KernelArg::Image1DB, },
|
|
{ "1DA:", KernelArg::Image1DA, },
|
|
{ "2DA:", KernelArg::Image2DA, },
|
|
{ "opaque:", KernelArg::Opaque, },
|
|
{ "event:", KernelArg::Event, },
|
|
{ "sampler:", KernelArg::Sampler, },
|
|
{ "half:", KernelArg::Half, },
|
|
};
|
|
|
|
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
|
|
|
|
struct BufDataConst
|
|
{
|
|
const char* tagName_; //!< buffer's name
|
|
KernelArg::ArgumentType type_; //!< type of argument
|
|
struct
|
|
{
|
|
uint number_ : 1; //!< buffer's number
|
|
uint alignment_ : 1; //!< buffer's alignment
|
|
uint attribute_ : 1; //!< buffer's read/write attribute
|
|
uint reserved : 29; //!< reserved
|
|
};
|
|
};
|
|
|
|
static const BufDataConst BufType[] =
|
|
{
|
|
{ "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } },
|
|
{ "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } },
|
|
{ "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } },
|
|
{ "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } },
|
|
{ "c", KernelArg::PointerConst, { 1, 1, 1, 0 } },
|
|
{ "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } },
|
|
{ "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } },
|
|
{ "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } }
|
|
};
|
|
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
|
|
|
|
//! The mathlib constants for each kernel execution
|
|
static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f };
|
|
|
|
bool
|
|
expect(const std::string& str, size_t* pos, const std::string& sym)
|
|
{
|
|
bool result = true;
|
|
uint i;
|
|
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
// Check if we have expected symbols
|
|
for (i = 0; i < sym.size(); ++i) {
|
|
char deb = str[*pos + i];
|
|
if (deb != sym[i]) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) *pos += i;
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
getword(const std::string& str, size_t* pos, char* sym)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getstring(const std::string& str, size_t* pos, std::string* out)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(":\n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
char* sym = new char[count + 1];
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*out = sym;
|
|
delete [] sym;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuint(const std::string& str, size_t* pos, uint* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
*val = (*val * 10) + (sym[i] - 0x30);
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuintHex(const std::string& str, size_t* pos, uint* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
}
|
|
else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
getuint64Hex(const std::string& str, size_t* pos, uint64_t* val)
|
|
{
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
}
|
|
else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
}
|
|
else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
void
|
|
intToStr(size_t value, char* str, size_t size)
|
|
{
|
|
static const uint MaxDigits32bit = 10;
|
|
char result[MaxDigits32bit];
|
|
uint idx = MaxDigits32bit;
|
|
|
|
do {
|
|
idx--;
|
|
result[idx] = static_cast<char>((value % 10) + '0');
|
|
value /= 10;
|
|
} while ((value != 0) && (idx > 0));
|
|
size_t len = MaxDigits32bit - idx;
|
|
size_t n = std::min<size_t>(len, size-1);
|
|
memcpy(str, &result[idx], n);
|
|
str[n] = '\0';
|
|
}
|
|
|
|
//! Default destructor
|
|
CalImageReference::~CalImageReference()
|
|
{
|
|
// Free CAL image
|
|
free(image_);
|
|
}
|
|
|
|
KernelArg::KernelArg()
|
|
: type_(KernelArg::None)
|
|
, size_(0)
|
|
, cbIdx_(0)
|
|
, cbPos_(0)
|
|
, index_(0)
|
|
, alignment_(1)
|
|
, dataType_(KernelArg::None)
|
|
{
|
|
name_ = "";
|
|
buf_ = "";
|
|
memory_.value_ = 0;
|
|
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
|
|
}
|
|
|
|
KernelArg::KernelArg(const KernelArg& data)
|
|
{
|
|
// Fill the new object
|
|
*this = data;
|
|
}
|
|
|
|
KernelArg&
|
|
KernelArg::operator=(const KernelArg& data)
|
|
{
|
|
// Fill the fields of the current object
|
|
name_ = data.name_;
|
|
typeName_ = data.typeName_;
|
|
typeQualifier_ = data.typeQualifier_;
|
|
type_ = data.type_;
|
|
size_ = data.size_;
|
|
cbIdx_ = data.cbIdx_;
|
|
cbPos_ = data.cbPos_;
|
|
buf_ = data.buf_;
|
|
index_ = data.index_;
|
|
alignment_ = data.alignment_;
|
|
dataType_ = data.dataType_;
|
|
memory_.value_ = data.memory_.value_;
|
|
return *this;
|
|
}
|
|
|
|
bool
|
|
KernelArg::isCbNeeded() const
|
|
{
|
|
//! \note not a safe way
|
|
bool result = ((type_ > None) && (type_ < Sampler)) ? true : false;
|
|
if ((type_ == Sampler) && (location_ == 0)) {
|
|
// Sampler is defined outside the kernel
|
|
result = true;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
size_t
|
|
KernelArg::size(bool gpuLayer)const
|
|
{
|
|
switch (type_) {
|
|
case None:
|
|
return 0;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case PointerGlobal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
|
|
case Sampler:
|
|
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
|
|
case Counter:
|
|
return (gpuLayer) ? 0 : sizeof(cl_mem);
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case Float:
|
|
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
|
|
case Double:
|
|
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
|
|
case Int:
|
|
case UInt:
|
|
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
|
|
case Long:
|
|
case ULong:
|
|
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
|
|
case Struct:
|
|
case Union:
|
|
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_address_qualifier
|
|
KernelArg::addressQualifier() const
|
|
{
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_access_qualifier
|
|
KernelArg::accessQualifier() const
|
|
{
|
|
switch (type_) {
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
if (memory_.readOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
}
|
|
else if (memory_.writeOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
}
|
|
else if (memory_.readWrite_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
|
|
//! temporary solution for the vectors handling in compiler
|
|
size_t
|
|
KernelArg::specialVector() const
|
|
{
|
|
if (size_ > VectorSizeLimit) {
|
|
switch (type_) {
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short);
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
clk_value_type_t
|
|
KernelArg::type()const
|
|
{
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
case Counter:
|
|
return T_POINTER;
|
|
case Float:
|
|
return T_FLOAT;
|
|
case Double:
|
|
return T_DOUBLE;
|
|
case Char:
|
|
case UChar:
|
|
return T_CHAR;
|
|
case Short:
|
|
case UShort:
|
|
return T_SHORT;
|
|
case Int:
|
|
return T_INT;
|
|
case UInt:
|
|
//! \note No UINT type
|
|
return T_INT;
|
|
case Long:
|
|
return T_LONG;
|
|
case ULong:
|
|
//! \note No ULONG type
|
|
return T_LONG;
|
|
case Struct:
|
|
case Union:
|
|
//! @todo What should we report?
|
|
return T_CHAR;
|
|
case Sampler:
|
|
return T_SAMPLER;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
case None:
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
NullKernel::NullKernel(
|
|
const std::string& name,
|
|
const NullDevice& gpuNullDev,
|
|
const NullProgram& nullprog)
|
|
: device::Kernel(name)
|
|
, buildError_(CL_BUILD_PROGRAM_FAILURE)
|
|
, gpuDev_(gpuNullDev)
|
|
, prog_(nullprog)
|
|
, calRef_(NULL)
|
|
, internal_(false)
|
|
, flags_(0)
|
|
, cbSizes_(NULL)
|
|
, numCb_(0)
|
|
, rwAttributes_(false)
|
|
, instructionCnt_(4)
|
|
{
|
|
// UAV raw index will be detected
|
|
uavRaw_ = UavIdUndefined;
|
|
// Initialize UAV arena index(should be 8)
|
|
uavArena_ = VirtualGPU::UavArena;
|
|
// CB index will be detected
|
|
cbId_ = UavIdUndefined;
|
|
// Printf index will be detected
|
|
printfId_ = UavIdUndefined;
|
|
}
|
|
|
|
NullKernel::~NullKernel()
|
|
{
|
|
uint idx;
|
|
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
calRef_->release();
|
|
|
|
// Destroy all kernel arguments
|
|
for (idx = 0; idx < arguments_.size(); ++idx) {
|
|
delete arguments_[idx];
|
|
}
|
|
arguments_.clear();
|
|
|
|
// Destroy all sampler kernel arguments
|
|
for (idx = 0; idx < intSamplers_.size(); ++idx) {
|
|
delete intSamplers_[idx];
|
|
}
|
|
intSamplers_.clear();
|
|
}
|
|
|
|
|
|
static int
|
|
scComponentToArrayIndex(E_SC_COMPONENT dstComp)
|
|
{
|
|
switch (dstComp) {
|
|
case SC_COMPONENT_X:
|
|
return 0;
|
|
case SC_COMPONENT_Y:
|
|
return 1;
|
|
case SC_COMPONENT_Z:
|
|
return 2;
|
|
case SC_COMPONENT_W:
|
|
return 3;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void
|
|
addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding)
|
|
{
|
|
uint count = shader->dep.NumIntrlIConstants;
|
|
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
|
|
|
|
// only suppport loop consts (int consts)
|
|
if (count) {
|
|
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
|
|
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
|
|
uint usedConsts = 0;
|
|
for (uint i = 0; i < count; ++i) {
|
|
uint currentConst;
|
|
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
|
|
if (allocatedconsts[currentConst].addr ==
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
|
|
break;
|
|
}
|
|
}
|
|
if (currentConst == usedConsts) {
|
|
usedConsts++;
|
|
assert(usedConsts <= count);
|
|
}
|
|
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
|
|
allocatedconsts[currentConst].type = AMU_ABI_INT32;
|
|
allocatedconsts[currentConst].value.
|
|
int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
|
|
}
|
|
encoding.litConstsCount = usedConsts;
|
|
}
|
|
}
|
|
|
|
bool
|
|
NullKernel::create(
|
|
const std::string& code,
|
|
const std::string& metadata,
|
|
const void* binaryCode,
|
|
size_t binarySize)
|
|
{
|
|
std::auto_ptr<uint> uavRefCount (new uint[MaxUavArguments]);
|
|
if (NULL == uavRefCount.get()) {
|
|
return false;
|
|
}
|
|
|
|
// Set all ref counts to 0
|
|
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
|
|
|
|
// parse the metadata fields
|
|
if (!parseArguments(metadata, uavRefCount.get())) {
|
|
return false;
|
|
}
|
|
|
|
CALimage calImage;
|
|
// Save source if DEBUG build
|
|
#if DEBUG
|
|
ilSource_ = code;
|
|
#endif // DEBUG
|
|
|
|
amd::option::Options *options = nullProg().getCompilerOptions();
|
|
internal_ = options->oVariables->clInternalKernel;
|
|
|
|
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
|
|
acl_error err;
|
|
std::string arch = GPU_TARGET_INFO_ARCH;
|
|
if (nullDev().settings().use64BitPtr_) {
|
|
arch += "64";
|
|
}
|
|
aclTargetInfo info = aclGetTargetInfo(
|
|
arch.c_str(), nullDev().hwInfo()->targetName_, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclGetTargetInfo failed");
|
|
return false;
|
|
}
|
|
|
|
aclBinaryOptions binOpts = {0};
|
|
binOpts.struct_size = sizeof(binOpts);
|
|
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
|
|
binOpts.bitness = ELFDATA2LSB;
|
|
binOpts.alloc = &::malloc;
|
|
binOpts.dealloc = &::free;
|
|
|
|
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclBinaryInit failed");
|
|
return false;
|
|
}
|
|
|
|
if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin,
|
|
code.data(), code.size(), aclSOURCE)) {
|
|
LogWarning("aclInsertSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
amd::option::Options* Opts = (amd::option::Options*)bin->options;
|
|
// temporary solution to synchronize buildNo between runtime and complib
|
|
// until we move runtime inside complib
|
|
Opts->setBuildNo(options->getBuildNo());
|
|
|
|
// pass kernel name to compiler
|
|
Opts->setCurrKernelName(name().c_str());
|
|
|
|
err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(),
|
|
ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL);
|
|
|
|
buildLog_ += aclGetCompilerLog(nullDev().compiler());
|
|
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclCompile failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
if (!options->oVariables->BinEXE) {
|
|
// Early exit if binary doesn't contain EXE
|
|
aclBinaryFini(bin);
|
|
return true;
|
|
}
|
|
size_t len;
|
|
const void* isa = aclExtractSection(nullDev().compiler(), bin,
|
|
&len, aclTEXT, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclExtractSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
uint calImageSize;
|
|
if (!createMultiBinary(
|
|
&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
|
|
LogWarning("initSrcEncoding failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
aclBinaryFini(bin);
|
|
}
|
|
else if ((binaryCode != NULL) && (binarySize != 0)) {
|
|
uint size = 0;
|
|
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode))
|
|
|| size > binarySize) {
|
|
buildLog_ += "Invalid binary image";
|
|
LogError("amuABIMultiBinaryGetSize failed!");
|
|
return false;
|
|
}
|
|
|
|
calImage = static_cast<CALimage>(malloc(size));
|
|
::memcpy(calImage, binaryCode, size);
|
|
}
|
|
else {
|
|
LogError("Incorrect initialization parameters!");
|
|
return false;
|
|
}
|
|
|
|
calRef_ = new CalImageReference(calImage);
|
|
if (calRef_ == NULL) {
|
|
LogError("Memory allocation failure!");
|
|
// Free CAL image
|
|
free(calImage);
|
|
return false;
|
|
}
|
|
|
|
CALfuncInfo calFuncInfo;
|
|
|
|
// Get kernel compiled information
|
|
getFuncInfoFromImage(calImage, &calFuncInfo);
|
|
if (calFuncInfo.maxScratchRegsNeeded > 0) {
|
|
LogPrintfInfo("%s kernel has register spilling."
|
|
"Lower performance is expected.", name().c_str());
|
|
}
|
|
|
|
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
|
|
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
|
|
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
|
|
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
|
|
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
|
|
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
|
|
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
|
|
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
|
|
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
|
|
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
|
|
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
|
|
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
|
|
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
|
|
|
|
device::Kernel::parameters_t params;
|
|
if (!createSignature(params)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t
|
|
NullKernel::getCalBinarySize() const
|
|
{
|
|
CALuint imageSize;
|
|
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
|
|
LogError("Failed to get the image size!");
|
|
return 0;
|
|
}
|
|
return static_cast<size_t>(imageSize);
|
|
}
|
|
|
|
bool
|
|
NullKernel::getCalBinary(void* binary, size_t size) const
|
|
{
|
|
uint calImageSize = 0;
|
|
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage())
|
|
|| size < calImageSize) {
|
|
LogError("CAL failed to save the kernel binary!");
|
|
return false;
|
|
}
|
|
::memcpy(binary, calImage(), calImageSize);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::create(
|
|
const std::string& code,
|
|
const std::string& metadata,
|
|
const void* binaryCode,
|
|
size_t binarySize)
|
|
{
|
|
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
|
|
|
|
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
|
|
return false;
|
|
}
|
|
|
|
// initialize constant buffer sizes
|
|
if (!initConstBuffers()) {
|
|
return false;
|
|
}
|
|
|
|
// Initialize the kernel parameters
|
|
bool result = initParameters();
|
|
|
|
if (!dev().heap()->isVirtual()) {
|
|
amd::option::Options *options = nullProg().getCompilerOptions();
|
|
// @todo Remove this. This is a hack for no VM mode
|
|
if (!options->oVariables->EnableDumpKernel) {
|
|
if (!name().compare(BlitName[KernelBlitManager::BlitCopyImageToBuffer]) ||
|
|
!name().compare(BlitName[KernelBlitManager::BlitCopyBufferToImage])) {
|
|
blitKernelHack_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
buildError_ = CL_SUCCESS;
|
|
}
|
|
else {
|
|
result = false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
Kernel::Kernel(
|
|
const std::string& name,
|
|
const Device& gpuDev,
|
|
const Program& prog,
|
|
const InitData* initData)
|
|
: NullKernel(name, gpuDev, prog)
|
|
, blitKernelHack_(false)
|
|
{
|
|
hwPrivateSize_ = 0;
|
|
if (NULL != initData) {
|
|
flags_ = initData->flags_;
|
|
hwPrivateSize_ = initData->hwPrivateSize_;
|
|
hwLocalSize_ = initData->hwLocalSize_;
|
|
}
|
|
// Workgroup info private memory size
|
|
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
|
|
hsa_ = false;
|
|
}
|
|
|
|
Kernel::~Kernel()
|
|
{
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
|
|
{
|
|
Device::ScopedLockVgpus lock(dev());
|
|
|
|
// Release all virtual image objects on all virtual GPUs
|
|
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
|
|
dev().vgpus()[idx]->releaseKernel(calImage());
|
|
}
|
|
}
|
|
|
|
if (0 != numCb_) {
|
|
delete [] cbSizes_;
|
|
}
|
|
}
|
|
|
|
const Device&
|
|
Kernel::dev() const
|
|
{
|
|
return reinterpret_cast<const Device&>(gpuDev_);
|
|
}
|
|
|
|
const Program&
|
|
Kernel::prog() const
|
|
{
|
|
return reinterpret_cast<const Program&>(prog_);
|
|
}
|
|
|
|
bool
|
|
NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa)
|
|
{
|
|
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
|
|
|
|
bool result = false;
|
|
AMUabiAddEncoding encoding;
|
|
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
|
|
|
|
size_t allocSize =
|
|
sizeof(uint) * MaxReadImage +
|
|
sizeof(CALUavEntry) * MaxUavArguments +
|
|
sizeof(CALSamplerMapEntry) * MaxSamplers +
|
|
sizeof(CALConstantBufferMask) * MaxConstBuffers +
|
|
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
|
|
char* tmpMem = new char[allocSize];
|
|
if (tmpMem == NULL) {
|
|
LogError("Error allocating memory");
|
|
return false;
|
|
}
|
|
|
|
CalcPtr(encoding.inputs, tmpMem, 0, 0);
|
|
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
|
|
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
|
|
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers);
|
|
if (shader->dep.NumIntrlIConstants != 0) {
|
|
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers);
|
|
}
|
|
AMUabiMultiBinary amuBinary;
|
|
amuABIMultiBinaryCreate(&amuBinary);
|
|
|
|
if (nullDev().settings().siPlus_) {
|
|
result = siCreateHwInfo(shader, encoding);
|
|
}
|
|
else {
|
|
result = r800CreateHwInfo(shader, encoding);
|
|
}
|
|
if (!result) {
|
|
delete [] tmpMem;
|
|
LogWarning("Error Creating program info");
|
|
return false;
|
|
}
|
|
|
|
addLoopConst(shader, encoding);
|
|
|
|
unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0;
|
|
unsigned int symbolCount=0;
|
|
CALOutputEntry* outputs=0;
|
|
unsigned int* globalBuffers=0;
|
|
unsigned int* persistentBuffers=0;
|
|
AMUabiUserSymbol* symbols=0;
|
|
|
|
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
|
|
CALConstantBufferMask* constBuffers = encoding.constBuffers;
|
|
uint* inputResources = encoding.inputs;
|
|
CALUavEntry* uav = encoding.uav;
|
|
|
|
uint inputSamplerCount = samplerSize();
|
|
for (uint i = 0; i < inputSamplerCount; ++i) {
|
|
inputSamplers[i].resource = 0;
|
|
inputSamplers[i].sampler = sampler(i)->index_;
|
|
}
|
|
|
|
uint constBufferCount = 2;
|
|
|
|
constBuffers[0].index = 0;
|
|
constBuffers[1].index = 1;
|
|
|
|
uint inputResourceCount = 0;
|
|
|
|
uint uavCount = 0;
|
|
bool globalBound = false;
|
|
bool cbBound = false;
|
|
bool printfBound = false;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
constBuffers[constBufferCount++].index = arg->index_;
|
|
break;
|
|
case KernelArg::PointerGlobal:
|
|
if (nullDev().settings().useAliases_) {
|
|
if (!globalBound) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
if (uavArena_ != 0) {
|
|
uav[uavCount].offset = uavArena_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
globalBound = true;
|
|
}
|
|
else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
if (!cbBound) {
|
|
uav[uavCount].offset = cbId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
cbBound = true;
|
|
break;
|
|
case KernelArg::PrintfBufId:
|
|
if (!printfBound) {
|
|
uav[uavCount].offset = printfId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
printfBound = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if (!nullDev().settings().useAliases_ &&
|
|
(UavIdUndefined != uavRaw_) &&
|
|
!(flags() & PrintfOutput)) {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
else {
|
|
if (UavIdUndefined != uavRaw_) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
if (uavArena_ != 0) {
|
|
uav[uavCount].offset = uavArena_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_ARENA;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
inputSamplers[inputSamplerCount].resource = 0;
|
|
inputSamplers[inputSamplerCount].sampler = arg->index_;
|
|
inputSamplerCount++;
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
if (arg->memory_.readOnly_) {
|
|
inputResources[inputResourceCount++] = arg->index_;
|
|
}
|
|
else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_2D;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
|
|
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
|
|
}
|
|
|
|
encoding.machine = nullDev().hwInfo()->machine_;
|
|
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
|
|
encoding.inputCount = inputResourceCount;
|
|
encoding.outputCount = outputCount;
|
|
encoding.outputs = outputs;
|
|
encoding.condOut = condOut;
|
|
encoding.earlyExit = earlyExit;
|
|
encoding.globalBuffersCount = globalCount;
|
|
encoding.globalBuffers = globalBuffers;
|
|
encoding.persistentBuffersCount = persistentCount;
|
|
encoding.persistentBuffers = persistentBuffers;
|
|
encoding.constBuffersCount = constBufferCount;
|
|
encoding.inputSamplerMapCount = inputSamplerCount;
|
|
encoding.symbolsCount = symbolCount;
|
|
encoding.symbols = symbols;
|
|
encoding.uavCount = uavCount;
|
|
|
|
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
|
|
|
|
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
|
|
|
|
amuABIMultiBinaryDestroy(amuBinary);
|
|
|
|
delete [] tmpMem;
|
|
delete [] encoding.progInfos;
|
|
|
|
return (success == 0) ? false : true;
|
|
}
|
|
|
|
void
|
|
Kernel::findLocalWorkSize(
|
|
size_t workDim,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const
|
|
{
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
size_t thrPerGrp;
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
|
|
((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if ((flags() & ImageEnable) &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) &&
|
|
((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) &&
|
|
((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if ((flags() & ImageWrite) ||
|
|
(thrPerGrp != nullDev().info().maxWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
}
|
|
else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
}
|
|
else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--);
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
}
|
|
else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
}
|
|
else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
}
|
|
else
|
|
{
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
Kernel::setupProgramGrid(
|
|
VirtualGPU& gpu,
|
|
size_t workDim,
|
|
const amd::NDRange& glbWorkOffset,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize,
|
|
const amd::NDRange& groupOffset,
|
|
const amd::NDRange& glbWorkOffsetOrg,
|
|
const amd::NDRange& glbWorkSizeOrg
|
|
) const
|
|
{
|
|
// ABI is always in CB0
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* pGlobalSize = reinterpret_cast<uint*>
|
|
(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pLocalSize = reinterpret_cast<uint*>
|
|
(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pNumGroups = reinterpret_cast<uint*>
|
|
(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
|
|
uint* pGlobalOffset = reinterpret_cast<uint*>
|
|
(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint* pGroupOffset = reinterpret_cast<uint*>
|
|
(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint32_t* debugInfo = reinterpret_cast<uint*>
|
|
(cbBuf + DebugOffset * ConstBuffer::VectorSize);
|
|
uint* pNDRangeGlobalOffset = reinterpret_cast<uint*>
|
|
(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
|
|
// Check for 64-bit metadata
|
|
uint glbABIShift = (abi64Bit()) ? 1 : 0;
|
|
|
|
ProgramGrid* progGrid = &gpu.cal_.progGrid_;
|
|
|
|
// Finds local workgroup size
|
|
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
|
|
|
|
// Initialize the execution grid block and size/offset
|
|
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
|
|
pGlobalSize[3] = static_cast<uint>(workDim);
|
|
|
|
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
|
|
pLocalSize[3] = 0;
|
|
|
|
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
|
|
pNumGroups[3] = 0;
|
|
|
|
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
|
|
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
|
|
|
|
progGrid->gridBlock.width =
|
|
progGrid->gridBlock.height =
|
|
progGrid->gridBlock.depth = 1;
|
|
|
|
progGrid->gridSize.width =
|
|
progGrid->gridSize.height =
|
|
progGrid->gridSize.depth = 1;
|
|
|
|
progGrid->partialGridBlock.width =
|
|
progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 1;
|
|
|
|
bool partialGrid = false;
|
|
|
|
// Fill the right values, based on the application request
|
|
switch (workDim) {
|
|
case 3:
|
|
pLocalSize[2] =
|
|
progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
|
|
|
|
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
|
|
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
|
|
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
|
|
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
|
|
|
|
pGlobalOffset[2] = glbWorkOffset[2];
|
|
pGroupOffset[2] = groupOffset[2];
|
|
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
|
|
if (progGrid->partialGridBlock.depth != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.depth++;
|
|
pNumGroups[2]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.depth = lclWorkSize[2];
|
|
}
|
|
}
|
|
// Fall through to fill 2D and 1D dimensions...
|
|
case 2:
|
|
pLocalSize[1] =
|
|
progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
|
|
|
|
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
|
|
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
|
|
progGrid->gridSize.height /= progGrid->gridBlock.height;
|
|
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
|
|
|
|
pGlobalOffset[1] = glbWorkOffset[1];
|
|
pGroupOffset[1] = groupOffset[1];
|
|
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
|
|
if (progGrid->partialGridBlock.height != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.height++;
|
|
pNumGroups[1]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.height = lclWorkSize[1];
|
|
}
|
|
}
|
|
// Fall through to fill 1D dimension...
|
|
case 1:
|
|
pLocalSize[0] =
|
|
progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
|
|
|
|
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
|
|
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
|
|
progGrid->gridSize.width /= progGrid->gridBlock.width;
|
|
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
|
|
|
|
pGlobalOffset[0] = glbWorkOffset[0];
|
|
pGroupOffset[0] = groupOffset[0];
|
|
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
|
|
|
|
if (dev().settings().partialDispatch_) {
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
|
|
if (progGrid->partialGridBlock.width != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.width++;
|
|
pNumGroups[0]++;
|
|
}
|
|
else {
|
|
progGrid->partialGridBlock.width = lclWorkSize[0];
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
LogWarning("Wrong dimensions. Force to 1x1x1!");
|
|
break;
|
|
}
|
|
|
|
if (!partialGrid) {
|
|
progGrid->partialGridBlock.width =
|
|
progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 0;
|
|
}
|
|
|
|
// Calculate the total number of workitems and workgroups
|
|
pGlobalOffset[3] = pGroupOffset[3] = 1;
|
|
for (uint i = 0; i < workDim; ++i) {
|
|
pGlobalOffset[3] *= pGlobalOffset[i];
|
|
pGroupOffset[3] *= pGroupOffset[i];
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
if (abi64Bit()) {
|
|
// Setup the debug info in constant buffer
|
|
reinterpret_cast<uint64_t*>(debugInfo)[1] =
|
|
gpu.printfDbg().bufOffset();
|
|
// Size in DWORDs
|
|
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[4] /= sizeof(uint32_t);
|
|
}
|
|
else {
|
|
// Setup the debug info in constant buffer
|
|
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
|
|
// Size in DWORDs
|
|
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[2] /= sizeof(uint32_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::initParameters()
|
|
{
|
|
size_t offset = 0;
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
|
|
// Initialize the arguments for the abstraction layer
|
|
if (arg->isCbNeeded()) {
|
|
desc.name_ = arg->name_.data();
|
|
desc.type_ = arg->type();
|
|
desc.size_ = arg->size(false);
|
|
desc.addressQualifier_ = arg->addressQualifier();
|
|
desc.accessQualifier_ = arg->accessQualifier();
|
|
desc.typeName_ = arg->typeName();
|
|
desc.typeQualifier_ = arg->typeQualifier();
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
}
|
|
}
|
|
|
|
// Report the allocated local memory size (emulated and hw)
|
|
if (hwLocalSize_ != 0) {
|
|
CondLog((dev().info().localMemSize_ < hwLocalSize_),
|
|
"Requested local size is bigger than reported");
|
|
workGroupInfo_.localMemSize_ = hwLocalSize_;
|
|
}
|
|
|
|
if (!createSignature(params)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::bindGlobalHwCb(
|
|
VirtualGPU& gpu,
|
|
VirtualGPU::GslKernelDesc* desc) const
|
|
{
|
|
bool result = true;
|
|
|
|
// Bind HW constant buffers used for the global data store
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (Program::HwConstBuffers::const_iterator it = gds.begin();
|
|
(it != gds.end() && result); ++it) {
|
|
uint idx = it->first;
|
|
result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::bindConstantBuffers(VirtualGPU& gpu) const
|
|
{
|
|
bool result = true;
|
|
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more CBs for arguments!");
|
|
|
|
// Upload the parameters to HW and bind all constant buffers
|
|
for (uint i = 0; i < numCb_; i++) {
|
|
ConstBuffer* cb = gpu.constBufs_[i];
|
|
result &= cb->uploadDataToHw(cbSizes_[i]) &&
|
|
bindResource(gpu, *cb, i, ConstantBuffer, i, NULL, cb->wrtOffset());
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::processMemObjects(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const_address params,
|
|
bool nativeMem) const
|
|
{
|
|
bool aliases = false;
|
|
VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency();
|
|
bool readCache = !internal_ && rwAttributes_ && !dev().settings().assumeAliases_;
|
|
|
|
// Mark the tracker with a new kernel,
|
|
// so we can avoid checks of the aliased objects
|
|
gpu.memoryDependency().newKernel();
|
|
|
|
// Check all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const KernelArg* arg = argument(i);
|
|
Memory* memory = NULL;
|
|
bool readOnly = false;
|
|
|
|
// Find if current argument is a buffer
|
|
if ((desc.type_ == T_POINTER) &&
|
|
(arg->type_ != KernelArg::PointerLocal) &&
|
|
(arg->type_ != KernelArg::PointerHwLocal)) {
|
|
if (nativeMem) {
|
|
memory = *reinterpret_cast<Memory* const*>(params + desc.offset_);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>
|
|
(params + desc.offset_) != NULL) {
|
|
memory = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>
|
|
(params + desc.offset_));
|
|
// Synchronize data with other memory instances if necessary
|
|
memory->syncCacheFromHost(gpu);
|
|
}
|
|
|
|
if (memory != NULL) {
|
|
readOnly = arg->memory_.readOnly_;
|
|
|
|
// Check if read cache optimization is possible
|
|
if (readCache) {
|
|
// Find if the same buffer was sent to other arguments (aliases)
|
|
for (size_t j = i + 1; (j < signature.numParameters()); ++j) {
|
|
const amd::KernelParameterDescriptor& descJ = signature.at(j);
|
|
const KernelArg* argJ = argument(j);
|
|
if (argJ->type_ == KernelArg::PointerGlobal) {
|
|
bool readOnlyJ = argJ->memory_.readOnly_;
|
|
Memory* memory2 = NULL;
|
|
if (nativeMem) {
|
|
memory2 = *reinterpret_cast<Memory* const*>
|
|
(params + descJ.offset_);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>
|
|
(params + descJ.offset_) != NULL) {
|
|
memory2 = dev().getGpuMemory(
|
|
*reinterpret_cast<amd::Memory* const*>
|
|
(params + descJ.offset_));
|
|
}
|
|
if (memory == memory2) {
|
|
if (!readOnly || !readOnlyJ) {
|
|
aliases = true;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Validate memory for a dependency in the queue
|
|
gpu.memoryDependency().validate(gpu, memory, readOnly);
|
|
}
|
|
}
|
|
}
|
|
|
|
return aliases & readCache;
|
|
}
|
|
|
|
bool
|
|
Kernel::loadParameters(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const_address params,
|
|
bool nativeMem) const
|
|
{
|
|
bool result = true;
|
|
uint i;
|
|
|
|
// Initialize local private ranges
|
|
if (!initLocalPrivateRanges(gpu)) {
|
|
return false;
|
|
}
|
|
|
|
if (!dev().settings().useAliases_ &&
|
|
(UavIdUndefined != uavRaw_) &&
|
|
(!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
// Bind a buffer for a dummy read
|
|
result = bindResource(gpu, *gpuMemory, 0,
|
|
ArgumentUavID, uavRaw_);
|
|
}
|
|
|
|
// Find all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (i = 0; i != signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
// Set current argument
|
|
if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
// Update the ring ranges and math constant
|
|
setLocalPrivateRanges(gpu);
|
|
|
|
result = bindConstantBuffers(gpu);
|
|
|
|
if (dev().settings().useAliases_) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBufferArena, uavArena_);
|
|
}
|
|
else if (flags() & PrivateFixed) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const
|
|
{
|
|
// 8xx workaround for the number of groups limit in HW
|
|
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
|
|
const ProgramGrid* programGrid = &gpu.cal()->progGrid_;
|
|
ConstBuffer* cb = gpu.numGrpCb();
|
|
assert((cb != NULL) && "Runtime must have the constant buffer");
|
|
|
|
uint32_t* memPtr = reinterpret_cast<uint32_t*>(cb->sysMemCopy());
|
|
memPtr[0] = programGrid->gridSize.width;
|
|
memPtr[1] = programGrid->gridSize.height;
|
|
memPtr[2] = programGrid->gridSize.depth;
|
|
memPtr[3] = 0;
|
|
|
|
memPtr[4] = programGrid->gridBlock.width;
|
|
memPtr[5] = programGrid->gridBlock.height;
|
|
memPtr[6] = programGrid->gridBlock.depth;
|
|
memPtr[7] = 0;
|
|
|
|
bool result = cb->uploadDataToHw(8 * sizeof(uint32_t));
|
|
if (result) {
|
|
gpu.setConstantBuffer(SC_INFO_CONSTANTBUFFER,
|
|
cb->gslResource(), static_cast<CALuint>(cb->wrtOffset()), cb->hbSize());
|
|
}
|
|
else {
|
|
assert(!"Runtime didn't upload data for NumGroup workaround");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
if (!gpu.runProgramGrid(*calEvent,
|
|
const_cast<ProgramGrid*>(&gpu.cal()->progGrid_), gpu.vmMems(), gpu.cal_.memCount_)) {
|
|
LogError("Failed to execute the program!");
|
|
return false;
|
|
}
|
|
|
|
// Unbind all resources
|
|
unbindResources(gpu, *calEvent, lastRun);
|
|
|
|
return true;
|
|
}
|
|
|
|
static size_t counter = 0;
|
|
void
|
|
Kernel::debug(VirtualGPU& gpu) const
|
|
{
|
|
std::fstream stubWrite;
|
|
address src = NULL;
|
|
if (!dev().heap()->isVirtual()) {
|
|
src = reinterpret_cast<address>
|
|
(const_cast<Resource&>(dev().globalMem()).map(&gpu));
|
|
}
|
|
|
|
std::cerr << "--- " << name_ << " ---" << std::endl;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
Memory* gpuMem = gpu.slots_[i].memory_;
|
|
std::stringstream fileName;
|
|
bool bufferObj =
|
|
((arg->type_ == KernelArg::PointerGlobal) ||
|
|
(arg->type_ == KernelArg::PointerConst) ||
|
|
(arg->type_ == KernelArg::PointerHwConst));
|
|
|
|
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
|
|
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
|
|
std::cerr.setf(std::ios::hex);
|
|
uint* location = reinterpret_cast<uint*>
|
|
(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
|
|
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
|
|
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() <<
|
|
"_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(
|
|
reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (((arg->type_ >= KernelArg::Image1D) &&
|
|
(arg->type_ <= KernelArg::Image3D)) ||
|
|
((src == NULL) && bufferObj)) {
|
|
Memory* resource = gpu.slots_[i].memory_;
|
|
void* memory = resource->map(&gpu);
|
|
uint* location = reinterpret_cast<uint*>(memory);
|
|
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() <<
|
|
"_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(
|
|
reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
resource->unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize());
|
|
gpu.constBufs_[i]->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) {
|
|
uint idx = it->first;
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
|
|
stubWrite.open(fileName.str().c_str(),
|
|
(std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>((it->second)->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory), (it->second)->size());
|
|
(it->second)->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (!dev().heap()->isVirtual()) {
|
|
const_cast<Resource&>(dev().globalMem()).unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::initConstBuffers()
|
|
{
|
|
bool result = true;
|
|
size_t i;
|
|
|
|
assert((numCb_ != 0) && "We have 0 constant buffers!");
|
|
|
|
// Allocate an array for CB sizes
|
|
cbSizes_ = new size_t[numCb_];
|
|
if (cbSizes_ == NULL) {
|
|
return false;
|
|
}
|
|
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
|
|
|
|
// CB0 is reserved for ABI data
|
|
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
|
|
|
|
// Find sizes of all constant buffers
|
|
for (i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
size_t size = arg->cbPos_ + arg->size(true);
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) *
|
|
ConstBuffer::VectorSize;
|
|
}
|
|
// Do we need a CB?
|
|
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
|
|
cbSizes_[arg->cbIdx_] = size;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::setInternalSamplers(VirtualGPU& gpu) const
|
|
{
|
|
for (uint i = 0; i < samplerSize(); ++i) {
|
|
const KernelArg* arg = sampler(i);
|
|
uint state = arg->cbPos_;
|
|
uint idx = arg->index_;
|
|
|
|
if (gpu.cal()->samplersState_[idx] != state) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::setArgument(
|
|
VirtualGPU& gpu,
|
|
uint idx,
|
|
const void* param,
|
|
size_t size,
|
|
bool nativeMem) const
|
|
{
|
|
bool result = true;
|
|
const KernelArg* arg;
|
|
address memory;
|
|
size_t argSize;
|
|
static const bool waitOnBusyEngine = true;
|
|
|
|
assert((idx < arguments_.size()) && "Param index is out of range!");
|
|
|
|
arg = argument(idx);
|
|
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
|
|
memory = gpu.cb(1)->sysMemCopy();
|
|
argSize = arg->size(true);
|
|
|
|
// Bind the global heap for emulation mode
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
case KernelArg::PointerGlobal:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
bool forceZeroOffset = false;
|
|
|
|
if (gpuMem == NULL) {
|
|
forceZeroOffset = true;
|
|
gpuMem = dev().getGpuMemory(dev().dummyPage());
|
|
}
|
|
uint64_t offset = gpuMem->pinOffset();
|
|
|
|
// Make sure the passed argument is a buffer object
|
|
if (!gpuMem->cal()->buffer_) {
|
|
LogError("The kernel buffer argument isn't a buffer object!");
|
|
return false;
|
|
}
|
|
|
|
if (arg->type_ == KernelArg::PointerHwConst) {
|
|
// Bind current memory object with the kernel
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
ArgumentConstBuffer, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
assert((offset == 0) && "No offset for HW CB");
|
|
// Add a fake offset to make sure (ptr != NULL) is TRUE
|
|
offset = 1;
|
|
}
|
|
else {
|
|
ResourceType type = ArgumentHeapBuffer;
|
|
|
|
// Check if kernel expects UAV binding
|
|
if (arg->memory_.uavBuf_) {
|
|
type = ArgumentBuffer;
|
|
}
|
|
else {
|
|
if (blitKernelHack_) {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, *gpuMem, 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
else {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, dev().globalMem(), 0,
|
|
GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Bind current memory object with the kernel
|
|
// Note: it's a fake binding, if the buffer is part of
|
|
// the global heap
|
|
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
|
|
// Update offset only if we bind HeapBuffer or
|
|
// it's global address space in UAV setup on SI+
|
|
if ((type == ArgumentHeapBuffer) || dev().settings().siPlus_) {
|
|
if (!blitKernelHack_) {
|
|
offset += gpuMem->hbOffset();
|
|
if (!forceZeroOffset) {
|
|
assert((offset != 0) && "Offset 0 with a real allocation!");
|
|
}
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
if (forceZeroOffset) {
|
|
offset = 0;
|
|
}
|
|
|
|
// Copy memory offset into the constant buffer
|
|
if (abi64Bit()) {
|
|
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
|
|
}
|
|
else {
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) =
|
|
static_cast<uint>(offset);
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
if (gpuMem == NULL) {
|
|
return false;
|
|
}
|
|
// Make sure the passed argument is an image object
|
|
if (gpuMem->cal()->buffer_) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
|
|
ResourceType resType = arg->memory_.readOnly_ ?
|
|
ArgumentImageRead : ArgumentImageWrite;
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
resType, arg->index_, gpuMem)) {
|
|
return false;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Copy image constants into the constant buffer
|
|
if (gpuMem->owner() != NULL) {
|
|
copyImageConstants(gpuMem->owner()->asImage(),
|
|
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
{
|
|
amd::Sampler* amdSampler =
|
|
*reinterpret_cast<amd::Sampler* const*>(param);
|
|
uint idx = arg->index_;
|
|
uint32_t state = amdSampler->state();
|
|
|
|
if (state != gpu.cal()->samplersState_[idx]) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
|
|
// Copy sampler state into the constant buffer
|
|
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
|
|
}
|
|
break;
|
|
case KernelArg::Counter:
|
|
{
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
}
|
|
else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx,
|
|
ArgumentCounter, idx, gpuMem)) {
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::PointerHwLocal:
|
|
{
|
|
// Calculate current offset in the local ring
|
|
uint offset = gpu.cal_.progGrid_.localSize;
|
|
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
|
|
|
|
offset = amd::alignUp(offset, arg->alignment_);
|
|
size_t memSize = *static_cast<const uintptr_t*>(param);
|
|
|
|
// Allocate new memory from the local ring
|
|
gpu.cal_.progGrid_.localSize += static_cast<uint>(memSize) + extra;
|
|
// Copy current local argument's offset into the CB
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
|
|
|
|
CondLog((gpu.cal_.progGrid_.localSize > dev().info().localMemSize_),
|
|
"Requested local size is bigger than reported!");
|
|
}
|
|
break;
|
|
case KernelArg::Float:
|
|
case KernelArg::Double:
|
|
case KernelArg::Char:
|
|
case KernelArg::UChar:
|
|
case KernelArg::Short:
|
|
case KernelArg::UShort:
|
|
case KernelArg::Int:
|
|
case KernelArg::UInt:
|
|
case KernelArg::Long:
|
|
case KernelArg::ULong:
|
|
if (size != argSize) {
|
|
LogWarning("Parameter's sizes are unmatched!");
|
|
}
|
|
// Fall through ...
|
|
case KernelArg::Struct:
|
|
case KernelArg::Union: {
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
|
|
for (uint i = 0; i < iter; ++i) {
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_ +
|
|
i * ConstBuffer::VectorSize),
|
|
reinterpret_cast<const char*>(param) +
|
|
i * KernelArg::VectorSizeLimit * specVec,
|
|
specVec * KernelArg::VectorSizeLimit);
|
|
}
|
|
}
|
|
else {
|
|
// Copy data into the CB
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
LogError("Unhandled argument's type!");
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool
|
|
Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const
|
|
{
|
|
// Initialize HW local
|
|
gpu.cal_.progGrid_.localSize = hwLocalSize_;
|
|
|
|
// Bind the global buffer if emulated local or private memory
|
|
// was allocated by the kernel
|
|
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) &&
|
|
(uavRaw_ != UavIdUndefined)) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the global buffer if emulated constant buffers are enabled
|
|
if (cbId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the printf buffer
|
|
if (printfId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
// Initialize the iterations count
|
|
gpu.cal_.iterations_ = 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const
|
|
{
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* data;
|
|
uint gridSize =
|
|
gpu.cal()->progGrid_.gridSize.width *
|
|
gpu.cal()->progGrid_.gridSize.height *
|
|
gpu.cal()->progGrid_.gridSize.depth;
|
|
uint blockSize =
|
|
gpu.cal()->progGrid_.gridBlock.width *
|
|
gpu.cal()->progGrid_.gridBlock.height *
|
|
gpu.cal()->progGrid_.gridBlock.depth;
|
|
|
|
//! \todo validate if the compiler still generates PrivateFixed
|
|
if (flags() & PrivateFixed) {
|
|
// Update private ring
|
|
data = reinterpret_cast<uint*>
|
|
(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
|
|
if (abi64Bit()) {
|
|
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
|
|
data[2] = 0;
|
|
data[3] = 0;
|
|
}
|
|
else {
|
|
data[0] = static_cast<uint>(gpuMemory->hbOffset());
|
|
data[1] = 0;
|
|
data[2] = data[3] = 0;
|
|
}
|
|
gpu.addVmMemory(gpuMemory);
|
|
}
|
|
|
|
// Copy the math lib constants
|
|
amd::Os::fastMemcpy(
|
|
(cbBuf + MathLibOffset * ConstBuffer::VectorSize),
|
|
MathLibConst, sizeof(MathLibConst));
|
|
|
|
// Update the offset to the global data
|
|
if (prog().glbData() != NULL) {
|
|
gpu.addVmMemory(prog().glbData());
|
|
uint64_t glbDataOffset = prog().glbData()->hbOffset();
|
|
if (abi64Bit()) {
|
|
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset *
|
|
ConstBuffer::VectorSize) = glbDataOffset;
|
|
}
|
|
else {
|
|
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset *
|
|
ConstBuffer::VectorSize) = static_cast<uint>(glbDataOffset);
|
|
}
|
|
}
|
|
|
|
// Split workload if it was requested
|
|
if ((gpu.cal_.iterations_ < 2) &&
|
|
gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
|
|
uint totalSize = gridSize * blockSize;
|
|
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
|
|
gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_,
|
|
(totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
|
|
}
|
|
}
|
|
|
|
// Initialize the number of iterations to the grid size
|
|
if (flags() & PrintfOutput) {
|
|
gpu.cal_.iterations_ = gridSize;
|
|
}
|
|
}
|
|
|
|
void
|
|
Kernel::setSampler(
|
|
VirtualGPU& gpu,
|
|
uint32_t state,
|
|
uint physUnit
|
|
) const
|
|
{
|
|
// All CAL sampler's parameters are in floats
|
|
float gslAddress = GSL_CLAMP_TO_BORDER;
|
|
float gslMinFilter = GSL_MIN_NEAREST;
|
|
float gslMagFilter = GSL_MAG_NEAREST;
|
|
|
|
state &= ~amd::Sampler::StateNormalizedCoordsMask;
|
|
|
|
// Program the sampler address mode
|
|
switch (state & amd::Sampler::StateAddressMask) {
|
|
case amd::Sampler::StateAddressRepeat:
|
|
gslAddress = GSL_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClampToEdge:
|
|
gslAddress = GSL_CLAMP_TO_EDGE;
|
|
break;
|
|
case amd::Sampler::StateAddressMirroredRepeat:
|
|
gslAddress = GSL_MIRRORED_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClamp:
|
|
case amd::Sampler::StateAddressNone:
|
|
default:
|
|
break;
|
|
}
|
|
state &= ~amd::Sampler::StateAddressMask;
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
|
|
|
|
// Program texture filter mode
|
|
if (state == amd::Sampler::StateFilterLinear) {
|
|
gslMinFilter = GSL_MIN_LINEAR;
|
|
gslMagFilter = GSL_MAG_LINEAR;
|
|
}
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
|
|
}
|
|
|
|
bool
|
|
Kernel::bindResource(
|
|
VirtualGPU& gpu,
|
|
const Resource& resource,
|
|
uint paramIdx,
|
|
ResourceType type,
|
|
uint physUnit,
|
|
Memory* memory,
|
|
size_t offset) const
|
|
{
|
|
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
|
|
|
|
// Find the original resource name from the IL program
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavRaw_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case GlobalBufferArena:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavArena_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentCbID:
|
|
if (gpu.state_.boundCb_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundCb_ = true;
|
|
physUnit = cbId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentPrintfID:
|
|
if (gpu.state_.boundPrintf_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundPrintf_ = true;
|
|
physUnit = printfId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentHeapBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageRead:
|
|
case ArgumentImageWrite:
|
|
case ArgumentConstBuffer:
|
|
case ArgumentCounter:
|
|
// Early exit if resource is bound already
|
|
if (gpu.slots_[paramIdx].state_.bound_) {
|
|
return true;
|
|
}
|
|
|
|
// Associate resource with the slot
|
|
gpu.slots_[paramIdx].memory_ = memory;
|
|
|
|
// Mark resource as bound
|
|
gpu.slots_[paramIdx].state_.bound_ = true;
|
|
|
|
if (type == ArgumentCounter) {
|
|
GpuEvent calEvent;
|
|
|
|
// Bind memory with atomic counter
|
|
gpu.bindAtomicCounter(argument(paramIdx)->index_,
|
|
memory->gslResource());
|
|
|
|
// Copy the counter value into GDS
|
|
gpu.syncAtomicCounter(calEvent, argument(paramIdx)->index_, false);
|
|
|
|
// Mark resource as busy
|
|
memory->setBusy(gpu, calEvent);
|
|
return true;
|
|
}
|
|
else if (type == ArgumentHeapBuffer) {
|
|
// We return here, since we just have to bind the global heap
|
|
return true;
|
|
}
|
|
else if (type == ArgumentConstBuffer) {
|
|
gpu.slots_[paramIdx].state_.constant_ = true;
|
|
}
|
|
break;
|
|
case ArgumentUavID:
|
|
case ConstantBuffer:
|
|
break;
|
|
default:
|
|
LogPrintfError("Unspecified argument type ()!", type);
|
|
return false;
|
|
}
|
|
|
|
gslMemObject gslMem = NULL;
|
|
// Use global address space on SI+ for UAV setup
|
|
if (dev().settings().siPlus_ &&
|
|
((type == ArgumentBuffer) || (type == ArgumentCbID) ||
|
|
(type == ArgumentUavID) || (type == ArgumentPrintfID)) &&
|
|
!blitKernelHack_) {
|
|
gslMem = dev().heap()->resource().gslResource();
|
|
}
|
|
else {
|
|
gslMem = resource.gslResource();
|
|
}
|
|
|
|
// Associate memory with the physical unit, the actual binding
|
|
bool result = true;
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
case GlobalBufferArena:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageWrite:
|
|
case ArgumentUavID:
|
|
case ArgumentCbID:
|
|
case ArgumentPrintfID:
|
|
if (type == ArgumentImageWrite) {
|
|
uavType = GSL_UAV_TYPE_TYPED;
|
|
}
|
|
else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
}
|
|
if (gpu.cal_.uavs_[physUnit] != gslMem) {
|
|
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
gpu.cal_.uavs_[physUnit] = gslMem;
|
|
}
|
|
else if (!dev().settings().siPlus_)
|
|
{
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
}
|
|
break;
|
|
case ConstantBuffer:
|
|
case ArgumentConstBuffer:
|
|
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
|
|
result = gpu.setConstantBuffer(physUnit,
|
|
gslMem, offset, resource.hbSize());
|
|
gpu.cal_.constBuffers_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ArgumentImageRead:
|
|
if (gpu.cal_.readImages_[physUnit] != gslMem) {
|
|
result = gpu.setInput(physUnit, gslMem);
|
|
gpu.cal_.readImages_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
default:
|
|
result = false;
|
|
assert(false);
|
|
break;
|
|
}
|
|
if (!result) {
|
|
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
|
|
return false;
|
|
}
|
|
|
|
if ((type == GlobalBuffer) && dev().settings().useAliases_) {
|
|
if (uavArena_ != 0) {
|
|
if (!setupArenaAliases(gpu, resource)) {
|
|
return false;
|
|
}
|
|
if ((uavArena_ != physUnit) &&
|
|
(gpu.cal_.uavs_[uavArena_] != gslMem)) {
|
|
gpu.cal_.uavs_[uavArena_] = gslMem;
|
|
// Associate memory with the name
|
|
if (!gpu.setUAVBuffer(uavArena_, gslMem,
|
|
GSL_UAV_TYPE_TYPELESS)) {
|
|
LogError("calCtxSetMem failed!");
|
|
return false;
|
|
}
|
|
gpu.setUAVChannelOrder(uavArena_, gslMem);
|
|
}
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void
|
|
Kernel::unbindResources(
|
|
VirtualGPU& gpu,
|
|
GpuEvent calEvent,
|
|
bool lastRun) const
|
|
{
|
|
// Make sure unbind occurs on the last run, in case the execution had a split
|
|
if (lastRun) {
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
if (gpu.slots_[i].state_.bound_) {
|
|
GpuEvent calEventTmp = calEvent;
|
|
|
|
if (KernelArg::Counter == argument(i)->type_) {
|
|
// Copy the counter value from GDS
|
|
gpu.syncAtomicCounter(calEventTmp, argument(i)->index_, true);
|
|
}
|
|
else if (!(gpu.slots_[i].state_.constant_ ||
|
|
argument(i)->memory_.readOnly_)) {
|
|
// Signal the abstraction layer that GPU memory is dirty
|
|
if (gpu.slots_[i].memory_->owner() != NULL) {
|
|
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
|
|
}
|
|
}
|
|
// Mark resource as busy
|
|
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
|
|
|
|
gpu.slots_[i].state_.value_ = 0;
|
|
}
|
|
}
|
|
|
|
// Unbind the global buffer
|
|
gpu.state_.boundGlobal_ = false;
|
|
|
|
// Unbind the constant buffer
|
|
gpu.state_.boundCb_ = false;
|
|
|
|
// Unbind the pritnf buffer
|
|
gpu.state_.boundPrintf_ = false;
|
|
}
|
|
|
|
// Mark CB busy
|
|
for (uint i = 0; i < numCb_; ++i) {
|
|
gpu.constBufs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// 8xx workaround for the number of groups limit in HW
|
|
if (gpu.gslKernelDesc()->funcInfo_.setBufferForNumGroup) {
|
|
ConstBuffer* cb = gpu.numGrpCb();
|
|
assert((cb != NULL) && "Runtime must have the constant buffer");
|
|
cb->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// Set the event object for the scratch buffer
|
|
if (workGroupInfo()->scratchRegs_ > 0) {
|
|
for (uint i = 0; i < dev().scratch(gpu.hwRing())->memObjs_.size(); ++i) {
|
|
dev().scratch(gpu.hwRing())->memObjs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool
|
|
Kernel::setupArenaAliases(VirtualGPU& gpu, const Resource& resource) const
|
|
{
|
|
const static uint ScArenaUavShortId = 9;
|
|
const static uint ScArenaUavByteId = 10;
|
|
|
|
Resource* buf = &(const_cast<Resource&>(resource));
|
|
Resource* alias;
|
|
gslMemObject gslMem = NULL;
|
|
//
|
|
// byte view
|
|
//
|
|
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R8I);
|
|
if (NULL == alias) {
|
|
return false;
|
|
}
|
|
gslMem = alias->gslResource();
|
|
if (gpu.cal_.uavs_[ScArenaUavByteId] != gslMem) {
|
|
if (!gpu.setUAVBuffer(ScArenaUavByteId,
|
|
gslMem, GSL_UAV_TYPE_TYPELESS)) {
|
|
return false;
|
|
}
|
|
gpu.cal_.uavs_[ScArenaUavByteId] = gslMem;
|
|
}
|
|
|
|
//
|
|
// short view
|
|
//
|
|
alias = buf->getAliasUAVBuffer(CM_SURF_FMT_R16I);
|
|
if (NULL == alias) {
|
|
return false;
|
|
}
|
|
bool result = true;
|
|
gslMem = alias->gslResource();
|
|
if (gpu.cal_.uavs_[ScArenaUavShortId] != gslMem) {
|
|
result = gpu.setUAVBuffer(ScArenaUavShortId,
|
|
gslMem, GSL_UAV_TYPE_TYPELESS);
|
|
gpu.cal_.uavs_[ScArenaUavShortId] = gslMem;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
void
|
|
Kernel::copyImageConstants(
|
|
const amd::Image* amdImage,
|
|
ImageConstants* imageData
|
|
) const
|
|
{
|
|
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
|
|
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
|
|
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
|
|
imageData->dataType_ =
|
|
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
|
|
|
|
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
|
|
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
|
|
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
|
|
imageData->channelOrder_ =
|
|
static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
|
|
}
|
|
|
|
union MetadataVersion {
|
|
struct {
|
|
uint64_t revision_: 16; //!< LLVM metadata revision
|
|
uint64_t minorVersion_: 16; //!< LLVM metadata minor verison
|
|
uint64_t majorVersion_: 16; //!< LLVM metadata major version
|
|
};
|
|
uint64_t value_;
|
|
MetadataVersion(uint mj, uint mi, uint rev): value_(0)
|
|
{
|
|
revision_ = rev;
|
|
minorVersion_ = mi;
|
|
majorVersion_ = mj;
|
|
}
|
|
MetadataVersion(): value_(0) {}
|
|
};
|
|
|
|
//! Version of metadata with buffer attributes
|
|
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
|
|
|
|
//! Version of metadata with type qualifiers
|
|
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
|
|
|
|
bool
|
|
NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount)
|
|
{
|
|
// Initialize workgroup info
|
|
workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_;
|
|
MetadataVersion mdVersion;
|
|
|
|
// Find first tag
|
|
size_t pos = metaData.find(";");
|
|
|
|
// Loop through all provided program arguments
|
|
while (pos != std::string::npos) {
|
|
KernelArg arg;
|
|
|
|
if (!expect(metaData, &pos, ";")) {
|
|
break;
|
|
}
|
|
|
|
arg.type_ = KernelArg::None;
|
|
|
|
// Loop through all available metadata types
|
|
for (uint i = 0; i < ArgStateTotal; ++i) {
|
|
uint tmpValue;
|
|
// Find the name tag
|
|
if (expect(metaData, &pos, ArgState[i].typeName_)) {
|
|
switch (ArgState[i].type_) {
|
|
case KernelArg::None:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Reflection: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
|
|
LogWarning("Couldn't get the argument type!");
|
|
return false;
|
|
}
|
|
}
|
|
continue;
|
|
case KernelArg::ConstArg: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
continue;
|
|
case KernelArg::Grouping:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSize_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WrkgrpSize: {
|
|
uint temp;
|
|
// Read the workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.size_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Wavefront:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::UavId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
return false;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &cbId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::PrintfBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &printfId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::MetadataVersion:
|
|
// Read metadata version
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.majorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.minorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.revision_ = tmpValue;
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::GroupingHint:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size hint
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSizeHint_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::VecTypeHint:
|
|
{
|
|
std::string temp;
|
|
// Read the compile vector type hint
|
|
if (!getstring(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile vector type hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileVecTypeHint_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
char argName[256];
|
|
// Save the argument type
|
|
arg.type_ = ArgState[i].type_;
|
|
|
|
// Check if we should expect the name
|
|
if (ArgState[i].name_) {
|
|
// Read the parameter's name
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.name_ = argName;
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.location_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the resource data type
|
|
if (ArgState[i].resType_) {
|
|
uint k;
|
|
// Search for the data type
|
|
for (k = 0; k < DataTypeTotal; k++) {
|
|
if (expect(metaData, &pos, DataType[k].tagName_)) {
|
|
arg.dataType_ = DataType[k].type_;
|
|
if (arg.type_ == KernelArg::Image) {
|
|
flags_ |= ImageEnable;
|
|
if (expect(metaData, &pos, "RO:")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "RW:")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
else if (expect(metaData, &pos, "WO:")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
}
|
|
else if (arg.type_ == KernelArg::Value) {
|
|
arg.type_ = DataType[k].type_;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (k == DataTypeTotal) {
|
|
LogWarning("We couldn't find the argument's type.");
|
|
if ((arg.type_ == KernelArg::Value) ||
|
|
!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
//! @todo temporary condition
|
|
if ((arg.type_ == KernelArg::Opaque) ||
|
|
(arg.type_ == KernelArg::Sampler)) {
|
|
assert(false);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the data size
|
|
if (ArgState[i].size_) {
|
|
uint tmpData;
|
|
// Read the data size
|
|
if (!getuint(metaData, &pos, &tmpData)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (arg.type_ == KernelArg::Image) {
|
|
arg.type_ = arg.dataType_;
|
|
arg.index_ = tmpData;
|
|
}
|
|
else {
|
|
arg.size_ = tmpData;
|
|
}
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Counter) {
|
|
// Read a counter index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a counter index!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect a resource index
|
|
if (ArgState[i].cbIdx_) {
|
|
// Read resource index
|
|
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
|
|
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
|
|
numCb_ = arg.cbIdx_;
|
|
}
|
|
}
|
|
// Check if we should expect the CB offset
|
|
if (ArgState[i].cbPos_) {
|
|
// Read position in the constant buffer
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check if we should expect the buffer type
|
|
if (ArgState[i].buf_) {
|
|
// Read the buffer type
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.buf_ = argName;
|
|
|
|
for (uint k = 0; k < BufTypeTotal; ++k) {
|
|
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
|
|
// Update the parameter type
|
|
arg.type_ = BufType[k].type_;
|
|
// Check if we should expect a buffer index
|
|
if (BufType[k].number_) {
|
|
// Read a buffer index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the required alignment
|
|
if (BufType[k].alignment_) {
|
|
// Read data alignment
|
|
if (!getuint(metaData, &pos, &arg.alignment_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the buffer's attribute
|
|
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) &&
|
|
BufType[k].attribute_) {
|
|
if (expect(metaData, &pos, "RO")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "RW")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
}
|
|
else if (expect(metaData, &pos, "WO")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
}
|
|
}
|
|
// Check for the type qualifier
|
|
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) &&
|
|
BufType[k].attribute_) {
|
|
uint tmp;
|
|
pos += 1;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get volatile type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get restrict type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Find multiple UAV references
|
|
switch (arg.type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
case KernelArg::UavId:
|
|
uavRefCount[arg.index_]++;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// Check if this argument will be passed in constant buffer
|
|
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
// Serach for the passed by value sampler
|
|
for (uint i = 0; i < argSize(); ++i) {
|
|
KernelArg* value = arguments_[i];
|
|
if (0 == value->name_.compare(arg.name_)) {
|
|
value->type_ = arg.type_;
|
|
value->index_ = arg.index_;
|
|
value->location_ = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
KernelArg* argument = new KernelArg(arg);
|
|
if (argument != NULL) {
|
|
addArgument(argument);
|
|
}
|
|
else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// Check if we have a pre-defined sampler
|
|
else if (arg.type_ == KernelArg::Sampler) {
|
|
KernelArg* sampler = new KernelArg(arg);
|
|
if (sampler != NULL) {
|
|
addSampler(sampler);
|
|
}
|
|
else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Next argument
|
|
pos = metaData.find(";", pos);
|
|
}
|
|
|
|
// Find arguments that will require a reallocation
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
// Check if can't use a dedicated UAV,
|
|
// so realloc memory in the heap
|
|
arg->memory_.realloc_ = isRealloc();
|
|
|
|
if (nullDev().settings().useAliases_) {
|
|
if (uavRefCount[arg->index_] > 1) {
|
|
// Multiple accesses, assume this is the heap
|
|
uavRaw_ = arg->index_;
|
|
}
|
|
// Mark argument as an UAV buffer if no aliases or it's not arena
|
|
else if (arg->index_ != VirtualGPU::UavArena) {
|
|
arg->memory_.uavBuf_ = true;
|
|
}
|
|
}
|
|
else {
|
|
arg->memory_.uavBuf_ = true;
|
|
}
|
|
break;
|
|
case KernelArg::PointerHwConst:
|
|
arg->memory_.realloc_ = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if (!nullDev().settings().useAliases_ ||
|
|
(arg->index_ != VirtualGPU::UavArena)) {
|
|
uavRaw_ = arg->index_;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// If argument marked with the const qualifier, then overwrite
|
|
// Read-Write attributes, since compiler doesn't mark it properly
|
|
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
|
|
arg->memory_.readOnly_ = 1;
|
|
arg->memory_.readWrite_ = 0;
|
|
arg->memory_.writeOnly_ = 0;
|
|
}
|
|
}
|
|
|
|
if (!nullDev().settings().useAliases_ &&
|
|
(uavRaw_ != UavIdUndefined) &&
|
|
!(flags() & PrintfOutput)) {
|
|
// Find if default UAV is already assigned to an argument
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (uavRaw_ == arg->index_) {
|
|
uavRaw_ = UavIdUndefined;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
|
|
}
|
|
}
|
|
}
|
|
|
|
// There is always 1 constant buffer, associated with the kernel
|
|
numCb_++;
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more than max CBs for arguments!");
|
|
|
|
// Limit workgroup size if requested
|
|
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
|
|
size_t temp = 1;
|
|
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
|
|
for (uint j = 0; j < 3; ++j) {
|
|
if (workGroupInfo()->compileSize_[j] != 0) {
|
|
temp *= workGroupInfo_.compileSize_[j];
|
|
}
|
|
}
|
|
// Report a compilation error if requested compile size doesn't
|
|
// match the required workgroup size
|
|
if (workGroupInfo()->size_ < temp) {
|
|
char str[8];
|
|
intToStr(workGroupInfo_.size_, str, 8);
|
|
buildError_ = CL_OUT_OF_RESOURCES;
|
|
buildLog_ +=
|
|
"Error: Requested compile size is bigger than the required workgroup size of ";
|
|
buildLog_ += str;
|
|
buildLog_ += " elements\n";
|
|
LogError(buildLog().c_str());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Read/Write attributes are provided in metadata
|
|
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
|
|
rwAttributes_ = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
Kernel::validateMemory(uint idx, amd::Memory* amdMem) const
|
|
{
|
|
// Check if memory doesn't require reallocation
|
|
bool noRealloc = (!argument(idx)->memory_.realloc_ ||
|
|
amdMem->reallocedDeviceMemory(&dev()));
|
|
|
|
return noRealloc;
|
|
}
|
|
|
|
inline static HSAIL_ARG_TYPE
|
|
GetHSAILArgType(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
if (argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU) {
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
}
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_VALUE:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t
|
|
GetHSAILArgAlignment(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return argInfo->arg.pointer.align;
|
|
default:
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER
|
|
GetHSAILAddrQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
}
|
|
else if ((argInfo->type == ARG_TYPE_IMAGE) ||
|
|
(argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE
|
|
GetHSAILDataType(const aclArgData* argInfo)
|
|
{
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
}
|
|
else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int
|
|
GetHSAILArgSize(const aclArgData *argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_VALUE:
|
|
switch (GetHSAILDataType(argInfo)) {
|
|
case HSAIL_DATATYPE_B1:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B8:
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B16:
|
|
case HSAIL_DATATYPE_U16:
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_F16:
|
|
return 2;
|
|
case HSAIL_DATATYPE_B32:
|
|
case HSAIL_DATATYPE_U32:
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_F32:
|
|
return 4;
|
|
case HSAIL_DATATYPE_B64:
|
|
case HSAIL_DATATYPE_U64:
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_F64:
|
|
return 8;
|
|
case HSAIL_DATATYPE_STRUCT:
|
|
return argInfo->arg.value.numElements;
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_POINTER:
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t
|
|
GetOclType(const aclArgData* argInfo)
|
|
{
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{ T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
|
|
{ T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
|
|
{ T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
|
|
{ T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
|
|
{ T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
|
|
{ T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
|
|
};
|
|
|
|
uint sizeType;
|
|
if ((argInfo->type == ARG_TYPE_POINTER) &&
|
|
(argInfo->arg.pointer.memory == PTR_MT_SCRATCH_EMU)) {
|
|
return T_QUEUE;
|
|
}
|
|
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
|
|
return T_POINTER;
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
sizeType = 0;
|
|
break;
|
|
case DATATYPE_i16:
|
|
case DATATYPE_u16:
|
|
sizeType = 1;
|
|
break;
|
|
case DATATYPE_i32:
|
|
case DATATYPE_u32:
|
|
sizeType = 2;
|
|
break;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
sizeType = 3;
|
|
break;
|
|
case DATATYPE_f16:
|
|
case DATATYPE_f32:
|
|
sizeType = 4;
|
|
break;
|
|
case DATATYPE_f64:
|
|
sizeType = 5;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
switch (argInfo->arg.value.numElements) {
|
|
case 1: return ClkValueMapType[sizeType][0];
|
|
case 2: return ClkValueMapType[sizeType][1];
|
|
case 3: return ClkValueMapType[sizeType][2];
|
|
case 4: return ClkValueMapType[sizeType][3];
|
|
case 8: return ClkValueMapType[sizeType][4];
|
|
case 16: return ClkValueMapType[sizeType][5];
|
|
default: return T_VOID;
|
|
}
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
}
|
|
else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier
|
|
GetOclAddrQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
//default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier
|
|
GetOclAccessQual(const aclArgData* argInfo)
|
|
{
|
|
if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
switch (argInfo->arg.image.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier
|
|
GetOclTypeQual(const aclArgData* argInfo)
|
|
{
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int
|
|
GetOclSize(const aclArgData* argInfo)
|
|
{
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER: return sizeof(void *);
|
|
case ARG_TYPE_VALUE:
|
|
//! \note OCL 6.1.5. For 3-component vector data types,
|
|
//! the size of the data type is 4 * sizeof(component).
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_ERROR:
|
|
default: return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE: return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER: return sizeof(cl_sampler);
|
|
default: return -1;
|
|
}
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initArgList(const aclArgData* aclArg)
|
|
{
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += ExtraArguments;
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(aclArg);
|
|
desc.addressQualifier_ = GetOclAddrQual(aclArg);
|
|
desc.accessQualifier_ = GetOclAccessQual(aclArg);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = 0;
|
|
}
|
|
else {
|
|
desc.size_ = GetOclSize(aclArg);
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params);
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initHsailArgs(const aclArgData* aclArg)
|
|
{
|
|
int offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += ExtraArguments;
|
|
|
|
// Iterate through the each kernel argument
|
|
for (; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->offset_ = offset;
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
// If vector of args we add additional arguments to flatten it out
|
|
arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) &&
|
|
(aclArg->arg.value.data != DATATYPE_struct)) ?
|
|
aclArg->arg.value.numElements : 1;
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
offset += GetHSAILArgSize(aclArg);
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
void
|
|
HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
|
|
{
|
|
PrintfInfo info;
|
|
uint index = 0;
|
|
for (; aclPrintf->struct_size != 0; aclPrintf++) {
|
|
index = aclPrintf->ID;
|
|
if (printf_.size() <= index) {
|
|
printf_.resize(index + 1);
|
|
}
|
|
info.fmtString_ = aclPrintf->fmtStr;
|
|
info.fmtString_ += "\n";
|
|
uint32_t *tmp_ptr = const_cast<uint32_t*>(aclPrintf->argSizes);
|
|
for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) {
|
|
info.arguments_.push_back(*tmp_ptr);
|
|
}
|
|
printf_[index] = info;
|
|
info.arguments_.clear();
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name,
|
|
HSAILProgram* prog,
|
|
std::string compileOptions)
|
|
: device::Kernel(name)
|
|
, compileOptions_(compileOptions)
|
|
, dev_(prog->dev())
|
|
, prog_(*prog)
|
|
, index_(0)
|
|
, code_(NULL)
|
|
, hwMetaData_(NULL)
|
|
{
|
|
hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel()
|
|
{
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
|
|
delete [] hwMetaData_;
|
|
|
|
delete code_;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::init()
|
|
{
|
|
acl_error error;
|
|
//compile kernel down to ISA
|
|
std::string openClKernelName("&__OpenCL_" + name() + "_kernel");
|
|
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Get the ISA out
|
|
size_t size_isa;
|
|
void* shader_isa = NULL;
|
|
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(),
|
|
options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize");
|
|
return false;
|
|
}
|
|
|
|
shader_isa = const_cast<void *>(aclGetDeviceBinary(dev().hsaCompiler(),
|
|
prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error));
|
|
if (shader_isa == NULL) {
|
|
LogError("Failed find the ISA");
|
|
return false;
|
|
}
|
|
|
|
aqlCreateHWInfo(shader_isa, size_isa);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (NULL == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete [] aclArgList;
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_WORK_GROUP_SIZE, openClKernelName.c_str(),
|
|
workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = dev().getAttribs().wavefrontSize;
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ =
|
|
workGroupInfo_.compileSize_[0] *
|
|
workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
}
|
|
else {
|
|
workGroupInfo_.size_ = dev().info().maxWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (NULL == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList,
|
|
&sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
initPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete [] aclPrintfList;
|
|
}
|
|
|
|
size_t sizeOfDevice;
|
|
bool hasKernelEnqueue = false;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_DEVICE_ENQUEUE, openClKernelName.c_str(),
|
|
&hasKernelEnqueue, &sizeOfDevice);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = hasKernelEnqueue;
|
|
|
|
int index = -1;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(),
|
|
RT_KERNEL_INDEX, openClKernelName.c_str(),
|
|
&index, &sizeOfDevice);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = static_cast<uint>(index);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const
|
|
{
|
|
// Check if memory doesn't require reallocation
|
|
bool noRealloc = true;
|
|
//amdMem->reallocedDeviceMemory(&dev()));
|
|
|
|
return noRealloc;
|
|
}
|
|
|
|
const Device&
|
|
HSAILKernel::dev() const
|
|
{
|
|
return reinterpret_cast<const Device&>(dev_);
|
|
}
|
|
|
|
const HSAILProgram&
|
|
HSAILKernel::prog() const
|
|
{
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
void
|
|
HSAILKernel::findLocalWorkSize(
|
|
size_t workDim,
|
|
const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const
|
|
{
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
size_t thrPerGrp;
|
|
bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE);
|
|
bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y);
|
|
bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) ||
|
|
!flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z);
|
|
|
|
bool overrideSet = ((workDim == 1) && b1DOverrideSet) ||
|
|
((workDim == 2) && b2DOverrideSet) ||
|
|
((workDim == 3) && b3DOverrideSet);
|
|
if (!overrideSet) {
|
|
// Find threads per group
|
|
thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if (flags_.imageEna_ &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) &&
|
|
((dev().settings().partialDispatch_) ||
|
|
(((gblWorkSize[0] % 16) == 0) &&
|
|
((gblWorkSize[1] % 16) == 0)))) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if (flags_.imageWriteEna_ ||
|
|
(thrPerGrp != dev().info().maxWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
}
|
|
else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
}
|
|
else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--);
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
|
|
// Check if partial dispatch is enabled and
|
|
if (dev().settings().partialDispatch_ &&
|
|
// we couldn't find optimal workload
|
|
(lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
}
|
|
else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
}
|
|
else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
}
|
|
else
|
|
{
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
inline static void
|
|
WriteAqlArg(
|
|
unsigned char** dst,//!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
)
|
|
{
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
}
|
|
else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
HsaAqlDispatchPacket*
|
|
HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu,
|
|
const amd::Kernel& kernel,
|
|
const amd::NDRangeContainer& sizes,
|
|
const_address parameters,
|
|
bool nativeMem,
|
|
uint64_t vmDefQueue,
|
|
uint64_t* vmParentWrap,
|
|
std::vector<const Resource*>& memList) const
|
|
{
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
// The HLC generates 3 additional arguments for the global offsets
|
|
//and fourth argument is the printf_buffer pointer
|
|
size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
|
|
for (uint i = 0; i < sizes.dimensions(); ++i) {
|
|
offsetSize[i] = sizes.offset()[i];
|
|
}
|
|
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
offsetSize[4] = vmDefQueue;
|
|
offsetSize[5] = *vmParentWrap;
|
|
memList.push_back(cb);
|
|
}
|
|
|
|
// Check if the kernel may have printf output
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
|
|
offsetSize[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
// Find all parameters for the current kernel
|
|
for (uint i = 0; i != signature.numParameters(); ++i) {
|
|
const HSAILKernel::Argument* arg = argument(i);
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const_address paramaddr = parameters + desc.offset_;
|
|
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_POINTER:
|
|
// If it is a global pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
|
|
|
|
Memory* gpuMem = NULL;
|
|
amd::Memory* mem = NULL;
|
|
|
|
if (kernelParams.boundToSvmPointer(dev(), parameters, i)) {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr));
|
|
mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast<void* const*>(paramaddr));
|
|
if (mem != NULL) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
memList.push_back(gpuMem);
|
|
}
|
|
else {
|
|
return NULL;
|
|
}
|
|
break;
|
|
}
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(paramaddr);
|
|
}
|
|
else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem != NULL) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
if (gpuMem == NULL) {
|
|
WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*));
|
|
break;
|
|
}
|
|
|
|
//! @todo 64 bit isn't supported with 32 bit binary
|
|
uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset();
|
|
WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*));
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) &&
|
|
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
memList.push_back(gpuMem);
|
|
}
|
|
// If it is a local pointer
|
|
else {
|
|
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) &&
|
|
"Unsupported address type");
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t));
|
|
ldsAddress += *reinterpret_cast<const size_t *>(paramaddr);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
// Special case for structrues
|
|
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
|
|
memList.push_back(cb);
|
|
}
|
|
else {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr,
|
|
arg->numElem_ * arg->size_, arg->size_);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = NULL;
|
|
amd::Memory* mem = NULL;
|
|
if (nativeMem) {
|
|
image = static_cast<Image*>(*reinterpret_cast<Memory* const*>(paramaddr));
|
|
}
|
|
else {
|
|
mem = *reinterpret_cast<amd::Memory* const*>(paramaddr);
|
|
if (mem == NULL) {
|
|
LogError( "The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
if (dev().settings().hsailDirectSRD_) {
|
|
// Image arguments are of size 48 bytes and aligned to 16 bytes
|
|
WriteAqlArg(&aqlArgBuf, image->hwState(),
|
|
HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
|
|
}
|
|
else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) &&
|
|
((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
memList.push_back(image);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
const amd::Sampler* sampler =
|
|
*reinterpret_cast<amd::Sampler* const*>(paramaddr);
|
|
const Sampler* gpuSampler = static_cast<Sampler*>
|
|
(sampler->getDeviceSampler(dev()));
|
|
if (dev().settings().hsailDirectSRD_) {
|
|
WriteAqlArg(&aqlArgBuf, gpuSampler->hwState(),
|
|
HSA_SAMPLER_OBJECT_SIZE, HSA_SAMPLER_OBJECT_ALIGNMENT);
|
|
}
|
|
else {
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
const amd::DeviceQueue* queue =
|
|
*reinterpret_cast<amd::DeviceQueue* const*>(paramaddr);
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported address type ");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return NULL;
|
|
}
|
|
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
uint argBufSize = amd::alignUp(
|
|
static_cast<uint>(argsBufferSize()), sizeof(uint32_t));
|
|
HsaAqlDispatchPacket* aqlPkt = reinterpret_cast<HsaAqlDispatchPacket*>(
|
|
gpu.cb(0)->sysMemCopy() + argBufSize);
|
|
|
|
amd::NDRange local(sizes.local());
|
|
amd::NDRange global(sizes.global());
|
|
|
|
// Initialize the Global, Local and Offset values
|
|
aqlPkt->dimensions = sizes.dimensions();
|
|
// Initialize the work grid parameter
|
|
for (uint idx = 0; idx < 3; idx++) {
|
|
aqlPkt->grid_size[idx] = 1;
|
|
aqlPkt->workgroup_size[idx] = 1;
|
|
}
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
findLocalWorkSize(aqlPkt->dimensions, global, local);
|
|
for (uint idx = 0; idx < aqlPkt->dimensions; idx++) {
|
|
aqlPkt->grid_size[idx] = global[idx];
|
|
aqlPkt->workgroup_size[idx] = local[idx];
|
|
}
|
|
|
|
// Initialize if dispatch should enable profiling
|
|
aqlPkt->reserved2 = 0; //config->profile ? 1:0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
aqlPkt->kernel_object_address = gpuAqlCode()->vmAddress();
|
|
aqlPkt->group_segment_size_bytes = ldsAddress - ldsSize();
|
|
aqlPkt->private_segment_size_bytes = spillSegSize();
|
|
|
|
// Initialize cache flush configuration for the dispatch
|
|
//! @todo Currently not used in emulation
|
|
aqlPkt->barrier = 1;
|
|
aqlPkt->release_fence_scope = 1;
|
|
aqlPkt->acquire_fence_scope = 2;
|
|
aqlPkt->invalidate_instruction_cache = 1;
|
|
|
|
ConstBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argBufSize + sizeof(HsaAqlDispatchPacket));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
aqlPkt->kernel_arg_address = argList;
|
|
memList.push_back(cb);
|
|
memList.push_back(gpuAqlCode());
|
|
if (NULL != prog().globalStore()) {
|
|
memList.push_back(prog().globalStore());
|
|
}
|
|
if (cpuAqlCode_->enable_sgpr_queue_ptr) {
|
|
memList.push_back(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource) {
|
|
dev().srds().fillResourceList(memList);
|
|
}
|
|
|
|
return aqlPkt;
|
|
}
|
|
|
|
} // namespace gpu
|