d1abc0baa2
SWDEV-187399 - Linux-Pro Pyro_Explosion.hip Image difference is observed specific to W9000 ASIC in OCL mode The issue disappeared when we invalidate L1 cache after each dispatch. This change also fix ocltst runtime OCLMultiQueue test in Tahiti Brahma driver. Tests: http://ocltc.amd.com:8111/viewModification.html?modId=121519&personal=true&tab=vcsModificationBuilds Test in the Houdini app. The symptom disappeared. This change also fix ocltst runtime OCLMultiQueue test in Tahiti Brahma driver. ReviewBoard: http://ocltc.amd.com/reviews/r/17509/ Affected files ... ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#337 edit
3506 baris
112 KiB
C++
3506 baris
112 KiB
C++
//
|
|
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
|
//
|
|
|
|
#include "device/gpu/gpukernel.hpp"
|
|
#include "device/gpu/gpuprogram.hpp"
|
|
#include "device/gpu/gpublit.hpp"
|
|
#include "device/gpu/gpuconstbuf.hpp"
|
|
#include "device/gpu/gpusched.hpp"
|
|
#include "platform/commandqueue.hpp"
|
|
#include "shader/ComputeProgramObject.h"
|
|
#include "utils/options.hpp"
|
|
|
|
#include "acl.h"
|
|
#include "SCCommon.h"
|
|
|
|
#include <string>
|
|
#include <memory>
|
|
#include <fstream>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <ctime>
|
|
#include <algorithm>
|
|
|
|
namespace gpu {
|
|
|
|
const MetaDataConst ArgState[ArgStateTotal] = {
|
|
// Note: the order is important
|
|
// Name Type Properties
|
|
// Kernel description (special properties)
|
|
{"memory:compilerwrite", KernelArg::PrivateFixed, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"uniqueid:", KernelArg::NoType, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:private:", KernelArg::PrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:local:", KernelArg::LocalSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:hwprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:uavprivate:", KernelArg::HwPrivateSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:hwlocal:", KernelArg::HwLocalSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"memory:64bitABI", KernelArg::ABI64Bit, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"limitgroupsize", KernelArg::Wavefront, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"function:", KernelArg::NoType, {1, 1, 0, 0, 0, 0, 0}},
|
|
{"intrinsic:", KernelArg::NoType, {1, 0, 0, 0, 0, 0, 0}},
|
|
{"error:", KernelArg::ErrorMessage, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"warning:", KernelArg::WarningMessage, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"printf_fmt:", KernelArg::PrintfFormatStr, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"version:", KernelArg::MetadataVersion, {0, 0, 0, 0, 0, 0, 0}},
|
|
// Kernel basic types
|
|
{"pointer:", KernelArg::PointerGlobal, {1, 1, 1, 1, 1, 1, 0}},
|
|
{"value:", KernelArg::Value, {1, 1, 1, 1, 1, 0, 0}},
|
|
{"image:", KernelArg::Image, {1, 1, 1, 1, 1, 0, 0}},
|
|
{"sampler:", KernelArg::Sampler, {0, 1, 0, 0, 0, 0, 0}},
|
|
{"counter:", KernelArg::Counter, {1, 1, 0, 1, 1, 0, 0}},
|
|
{"cws:", KernelArg::Grouping, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"lws:", KernelArg::WrkgrpSize, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"uavid:", KernelArg::UavId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"reflection:", KernelArg::Reflection, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"constarg:", KernelArg::ConstArg, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"cbid:", KernelArg::ConstBufId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"printfid:", KernelArg::PrintfBufId, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"wsh:", KernelArg::GroupingHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"vth:", KernelArg::VecTypeHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
{"WavesPerSimdHint:", KernelArg::WavesPerSimdHint, {0, 0, 0, 0, 0, 0, 0}},
|
|
};
|
|
|
|
const DataTypeConst DataType[] = {
|
|
{
|
|
"i8:", KernelArg::Char,
|
|
},
|
|
{
|
|
"i16:", KernelArg::Short,
|
|
},
|
|
{
|
|
"i32:", KernelArg::Int,
|
|
},
|
|
{
|
|
"i64:", KernelArg::Long,
|
|
},
|
|
{
|
|
"u8:", KernelArg::UChar,
|
|
},
|
|
{
|
|
"u16:", KernelArg::UShort,
|
|
},
|
|
{
|
|
"u32:", KernelArg::UInt,
|
|
},
|
|
{
|
|
"u64:", KernelArg::ULong,
|
|
},
|
|
{
|
|
"float:", KernelArg::Float,
|
|
},
|
|
{
|
|
"double:", KernelArg::Double,
|
|
},
|
|
{
|
|
"struct:", KernelArg::Struct,
|
|
},
|
|
{
|
|
"union:", KernelArg::Union,
|
|
},
|
|
{
|
|
"1D:", KernelArg::Image1D,
|
|
},
|
|
{
|
|
"2D:", KernelArg::Image2D,
|
|
},
|
|
{
|
|
"3D:", KernelArg::Image3D,
|
|
},
|
|
{
|
|
"1DB:", KernelArg::Image1DB,
|
|
},
|
|
{
|
|
"1DA:", KernelArg::Image1DA,
|
|
},
|
|
{
|
|
"2DA:", KernelArg::Image2DA,
|
|
},
|
|
{
|
|
"opaque:", KernelArg::Opaque,
|
|
},
|
|
{
|
|
"event:", KernelArg::Event,
|
|
},
|
|
{
|
|
"sampler:", KernelArg::Sampler,
|
|
},
|
|
{
|
|
"half:", KernelArg::Half,
|
|
},
|
|
};
|
|
|
|
const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst);
|
|
|
|
struct BufDataConst {
|
|
const char* tagName_; //!< buffer's name
|
|
KernelArg::ArgumentType type_; //!< type of argument
|
|
struct {
|
|
uint number_ : 1; //!< buffer's number
|
|
uint alignment_ : 1; //!< buffer's alignment
|
|
uint attribute_ : 1; //!< buffer's read/write attribute
|
|
uint reserved : 29; //!< reserved
|
|
};
|
|
};
|
|
|
|
static const BufDataConst BufType[] = {{"g", KernelArg::PointerGlobal, {1, 0, 0, 0}},
|
|
{"p", KernelArg::PointerPrivate, {1, 1, 1, 0}},
|
|
{"l", KernelArg::PointerLocal, {1, 1, 1, 0}},
|
|
{"uav", KernelArg::PointerGlobal, {1, 1, 1, 0}},
|
|
{"c", KernelArg::PointerConst, {1, 1, 1, 0}},
|
|
{"hl", KernelArg::PointerHwLocal, {1, 1, 1, 0}},
|
|
{"hp", KernelArg::PointerHwPrivate, {1, 1, 1, 0}},
|
|
{"hc", KernelArg::PointerHwConst, {1, 1, 1, 0}}};
|
|
static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst);
|
|
|
|
//! The mathlib constants for each kernel execution
|
|
static const float MathLibConst[4] = {0.0f, 0.5f, 1.0f, 2.0f};
|
|
|
|
bool expect(const std::string& str, size_t* pos, const std::string& sym) {
|
|
bool result = true;
|
|
uint i;
|
|
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
// Check if we have expected symbols
|
|
for (i = 0; i < sym.size(); ++i) {
|
|
char deb = str[*pos + i];
|
|
if (deb != sym[i]) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) *pos += i;
|
|
|
|
return result;
|
|
}
|
|
|
|
bool getword(const std::string& str, size_t* pos, std::string& sym) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
if (count != 0) {
|
|
sym = str.substr(*pos, count);
|
|
}
|
|
sym[count] = 0;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getstring(const std::string& str, size_t* pos, std::string* out) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(":\n\r;", *pos);
|
|
size_t count = posEnd - *pos;
|
|
|
|
char* sym = new char[count + 1];
|
|
if (count != 0) {
|
|
if (!str.copy(sym, count, *pos)) {
|
|
return false;
|
|
}
|
|
}
|
|
sym[count] = 0;
|
|
*out = sym;
|
|
delete[] sym;
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuint(const std::string& str, size_t* pos, uint* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
*val = (*val * 10) + (sym[i] - 0x30);
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuintHex(const std::string& str, size_t* pos, uint* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) {
|
|
if (*pos == std::string::npos) {
|
|
return false;
|
|
}
|
|
|
|
char sym[16];
|
|
*pos = str.find_first_not_of(" \n\r", *pos);
|
|
size_t posEnd = str.find_first_of(": \n\r;)", *pos);
|
|
|
|
if (!str.copy(sym, posEnd - *pos, *pos)) {
|
|
return false;
|
|
}
|
|
*val = 0;
|
|
for (size_t i = 0; i < (posEnd - *pos); ++i) {
|
|
if (sym[i] >= '0' && sym[i] <= 'F') {
|
|
*val = (*val * 16) + (sym[i] - '0');
|
|
} else if (sym[i] >= 'a' && sym[i] <= 'f') {
|
|
*val = (*val * 16) + (sym[i] - 'a' + 10);
|
|
} else {
|
|
return false;
|
|
}
|
|
}
|
|
*pos = posEnd + 1;
|
|
return true;
|
|
}
|
|
|
|
void intToStr(size_t value, char* str, size_t size) {
|
|
static const uint MaxDigits32bit = 10;
|
|
char result[MaxDigits32bit];
|
|
uint idx = MaxDigits32bit;
|
|
|
|
do {
|
|
idx--;
|
|
result[idx] = static_cast<char>((value % 10) + '0');
|
|
value /= 10;
|
|
} while ((value != 0) && (idx > 0));
|
|
size_t len = MaxDigits32bit - idx;
|
|
size_t n = std::min<size_t>(len, size - 1);
|
|
memcpy(str, &result[idx], n);
|
|
str[n] = '\0';
|
|
}
|
|
|
|
//! Default destructor
|
|
CalImageReference::~CalImageReference() {
|
|
// Free CAL image
|
|
free(image_);
|
|
}
|
|
|
|
KernelArg::KernelArg()
|
|
: type_(KernelArg::NoType),
|
|
size_(0),
|
|
cbIdx_(0),
|
|
cbPos_(0),
|
|
index_(0),
|
|
alignment_(1),
|
|
dataType_(KernelArg::NoType) {
|
|
name_ = "";
|
|
buf_ = "";
|
|
memory_.value_ = 0;
|
|
typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE;
|
|
}
|
|
|
|
KernelArg::KernelArg(const KernelArg& data) {
|
|
// Fill the new object
|
|
*this = data;
|
|
}
|
|
|
|
KernelArg& KernelArg::operator=(const KernelArg& data) {
|
|
// Fill the fields of the current object
|
|
name_ = data.name_;
|
|
typeName_ = data.typeName_;
|
|
typeQualifier_ = data.typeQualifier_;
|
|
type_ = data.type_;
|
|
size_ = data.size_;
|
|
cbIdx_ = data.cbIdx_;
|
|
cbPos_ = data.cbPos_;
|
|
buf_ = data.buf_;
|
|
index_ = data.index_;
|
|
alignment_ = data.alignment_;
|
|
dataType_ = data.dataType_;
|
|
memory_.value_ = data.memory_.value_;
|
|
return *this;
|
|
}
|
|
|
|
bool KernelArg::isCbNeeded() const {
|
|
//! \note not a safe way
|
|
bool result = ((type_ > NoType) && (type_ < Sampler)) ? true : false;
|
|
if ((type_ == Sampler) && (location_ == 0)) {
|
|
// Sampler is defined outside the kernel
|
|
result = true;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
size_t KernelArg::size(bool gpuLayer) const {
|
|
switch (type_) {
|
|
case NoType:
|
|
return 0;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case PointerGlobal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem);
|
|
case Sampler:
|
|
return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler);
|
|
case Counter:
|
|
return (gpuLayer) ? 0 : sizeof(cl_mem);
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem);
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
return (gpuLayer) ? sizeof(uint32_t) * size_ : 0;
|
|
case Float:
|
|
return sizeof(cl_float) * amd::nextPowerOfTwo(size_);
|
|
case Double:
|
|
return sizeof(cl_double) * amd::nextPowerOfTwo(size_);
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char) * amd::nextPowerOfTwo(size_);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short) * amd::nextPowerOfTwo(size_);
|
|
case Int:
|
|
case UInt:
|
|
return sizeof(cl_uint) * amd::nextPowerOfTwo(size_);
|
|
case Long:
|
|
case ULong:
|
|
return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_);
|
|
case Struct:
|
|
case Union:
|
|
return (gpuLayer) ? amd::alignUp(size_, 16) : size_;
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const {
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
}
|
|
|
|
cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const {
|
|
switch (type_) {
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
if (memory_.readOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
} else if (memory_.writeOnly_) {
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
} else if (memory_.readWrite_) {
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
|
|
//! temporary solution for the vectors handling in compiler
|
|
size_t KernelArg::specialVector() const {
|
|
if (size_ > VectorSizeLimit) {
|
|
switch (type_) {
|
|
case Char:
|
|
case UChar:
|
|
return sizeof(cl_char);
|
|
case Short:
|
|
case UShort:
|
|
return sizeof(cl_short);
|
|
default:
|
|
return 0;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
clk_value_type_t KernelArg::type() const {
|
|
switch (type_) {
|
|
case PointerGlobal:
|
|
case PointerLocal:
|
|
case PointerHwLocal:
|
|
case PointerConst:
|
|
case PointerHwConst:
|
|
case Image1D:
|
|
case Image2D:
|
|
case Image3D:
|
|
case Image1DB:
|
|
case Image1DA:
|
|
case Image2DA:
|
|
case Counter:
|
|
return T_POINTER;
|
|
case Float:
|
|
return T_FLOAT;
|
|
case Double:
|
|
return T_DOUBLE;
|
|
case Char:
|
|
case UChar:
|
|
return T_CHAR;
|
|
case Short:
|
|
case UShort:
|
|
return T_SHORT;
|
|
case Int:
|
|
return T_INT;
|
|
case UInt:
|
|
//! \note No UINT type
|
|
return T_INT;
|
|
case Long:
|
|
return T_LONG;
|
|
case ULong:
|
|
//! \note No ULONG type
|
|
return T_LONG;
|
|
case Struct:
|
|
case Union:
|
|
//! @todo What should we report?
|
|
return T_CHAR;
|
|
case Sampler:
|
|
return T_SAMPLER;
|
|
case PointerPrivate:
|
|
case PointerHwPrivate:
|
|
case NoType:
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
NullKernel::NullKernel(const std::string& name, const NullDevice& gpuNullDev,
|
|
const NullProgram& nullprog)
|
|
: device::Kernel(gpuNullDev, name, nullprog),
|
|
buildError_(CL_BUILD_PROGRAM_FAILURE),
|
|
gpuDev_(gpuNullDev),
|
|
calRef_(NULL),
|
|
internal_(false),
|
|
flags_(0),
|
|
cbSizes_(NULL),
|
|
numCb_(0),
|
|
rwAttributes_(false),
|
|
instructionCnt_(4) {
|
|
// UAV raw index will be detected
|
|
uavRaw_ = UavIdUndefined;
|
|
// CB index will be detected
|
|
cbId_ = UavIdUndefined;
|
|
// Printf index will be detected
|
|
printfId_ = UavIdUndefined;
|
|
}
|
|
|
|
NullKernel::~NullKernel() {
|
|
uint idx;
|
|
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
calRef_->release();
|
|
|
|
// Destroy all kernel arguments
|
|
for (idx = 0; idx < arguments_.size(); ++idx) {
|
|
delete arguments_[idx];
|
|
}
|
|
arguments_.clear();
|
|
|
|
// Destroy all sampler kernel arguments
|
|
for (idx = 0; idx < intSamplers_.size(); ++idx) {
|
|
delete intSamplers_[idx];
|
|
}
|
|
intSamplers_.clear();
|
|
}
|
|
|
|
|
|
static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) {
|
|
switch (dstComp) {
|
|
case SC_COMPONENT_X:
|
|
return 0;
|
|
case SC_COMPONENT_Y:
|
|
return 1;
|
|
case SC_COMPONENT_Z:
|
|
return 2;
|
|
case SC_COMPONENT_W:
|
|
return 3;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) {
|
|
uint count = shader->dep.NumIntrlIConstants;
|
|
encoding.litConstsCount = shader->dep.NumIntrlIConstants;
|
|
|
|
// only suppport loop consts (int consts)
|
|
if (count) {
|
|
AMUabiLiteralConst* allocatedconsts = encoding.litConsts;
|
|
memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst));
|
|
uint usedConsts = 0;
|
|
for (uint i = 0; i < count; ++i) {
|
|
uint currentConst;
|
|
for (currentConst = 0; currentConst < usedConsts; ++currentConst) {
|
|
if (allocatedconsts[currentConst].addr ==
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) {
|
|
break;
|
|
}
|
|
}
|
|
if (currentConst == usedConsts) {
|
|
usedConsts++;
|
|
assert(usedConsts <= count);
|
|
}
|
|
allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber;
|
|
allocatedconsts[currentConst].type = AMU_ABI_INT32;
|
|
allocatedconsts[currentConst].value.int32[scComponentToArrayIndex(
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] =
|
|
HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue;
|
|
}
|
|
encoding.litConstsCount = usedConsts;
|
|
}
|
|
}
|
|
|
|
bool NullKernel::create(const std::string& code, const std::string& metadata,
|
|
const void* binaryCode, size_t binarySize) {
|
|
std::auto_ptr<uint> uavRefCount(new uint[MaxUavArguments]);
|
|
if (NULL == uavRefCount.get()) {
|
|
return false;
|
|
}
|
|
|
|
// Set all ref counts to 0
|
|
memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments);
|
|
|
|
// parse the metadata fields
|
|
if (!parseArguments(metadata, uavRefCount.get())) {
|
|
return false;
|
|
}
|
|
|
|
CALimage calImage;
|
|
// Save source if DEBUG build
|
|
#if DEBUG
|
|
ilSource_ = code;
|
|
#endif // DEBUG
|
|
|
|
amd::option::Options* options = nullProg().getCompilerOptions();
|
|
internal_ = options->oVariables->clInternalKernel;
|
|
|
|
if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) {
|
|
acl_error err;
|
|
std::string arch = "amdil";
|
|
if (nullDev().settings().use64BitPtr_) {
|
|
arch += "64";
|
|
}
|
|
aclTargetInfo info = aclGetTargetInfo(arch.c_str(), nullDev().hwInfo()->targetName_, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclGetTargetInfo failed");
|
|
return false;
|
|
}
|
|
|
|
aclBinaryOptions binOpts = {0};
|
|
binOpts.struct_size = sizeof(binOpts);
|
|
binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32;
|
|
binOpts.bitness = ELFDATA2LSB;
|
|
binOpts.alloc = &::malloc;
|
|
binOpts.dealloc = &::free;
|
|
|
|
aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclBinaryInit failed");
|
|
return false;
|
|
}
|
|
|
|
if (ACL_SUCCESS !=
|
|
aclInsertSection(nullDev().amdilCompiler(), bin, code.data(), code.size(), aclSOURCE)) {
|
|
LogWarning("aclInsertSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
amd::option::Options* Opts = (amd::option::Options*)bin->options;
|
|
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (nullDev().settings().svmFineGrainSystem_) {
|
|
options->origOptionStr.append(" -sc-xnack-iommu");
|
|
}
|
|
// temporary solution to synchronize buildNo between runtime and complib
|
|
// until we move runtime inside complib
|
|
Opts->setBuildNo(options->getBuildNo());
|
|
|
|
// pass kernel name to compiler
|
|
Opts->setCurrKernelName(name().c_str());
|
|
|
|
err = aclCompile(nullDev().amdilCompiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT,
|
|
ACL_TYPE_ISA, NULL);
|
|
|
|
buildLog_ += aclGetCompilerLog(nullDev().amdilCompiler());
|
|
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclCompile failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
if (!options->oVariables->BinEXE) {
|
|
// Early exit if binary doesn't contain EXE
|
|
aclBinaryFini(bin);
|
|
return true;
|
|
}
|
|
size_t len;
|
|
const void* isa = aclExtractSection(nullDev().amdilCompiler(), bin, &len, aclTEXT, &err);
|
|
if (err != ACL_SUCCESS) {
|
|
LogWarning("aclExtractSection failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
uint calImageSize;
|
|
if (!createMultiBinary(&calImageSize, reinterpret_cast<void**>(&calImage), isa)) {
|
|
LogWarning("initSrcEncoding failed");
|
|
aclBinaryFini(bin);
|
|
return false;
|
|
}
|
|
|
|
aclBinaryFini(bin);
|
|
} else if ((binaryCode != NULL) && (binarySize != 0)) {
|
|
uint size = 0;
|
|
if (!amuABIMultiBinaryGetSize(&size, const_cast<void*>(binaryCode)) || size > binarySize) {
|
|
buildLog_ += "Invalid binary image";
|
|
LogError("amuABIMultiBinaryGetSize failed!");
|
|
return false;
|
|
}
|
|
|
|
calImage = static_cast<CALimage>(malloc(size));
|
|
::memcpy(calImage, binaryCode, size);
|
|
} else {
|
|
LogError("Incorrect initialization parameters!");
|
|
return false;
|
|
}
|
|
|
|
calRef_ = new CalImageReference(calImage);
|
|
if (calRef_ == NULL) {
|
|
LogError("Memory allocation failure!");
|
|
// Free CAL image
|
|
free(calImage);
|
|
return false;
|
|
}
|
|
|
|
CALfuncInfo calFuncInfo;
|
|
|
|
// Get kernel compiled information
|
|
getFuncInfoFromImage(calImage, &calFuncInfo);
|
|
if (calFuncInfo.maxScratchRegsNeeded > 0) {
|
|
LogPrintfInfo(
|
|
"%s kernel has register spilling."
|
|
"Lower performance is expected.",
|
|
name().c_str());
|
|
}
|
|
|
|
workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded;
|
|
workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD;
|
|
workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize;
|
|
workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable;
|
|
workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed;
|
|
workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable;
|
|
workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed;
|
|
workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable;
|
|
workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed;
|
|
workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable;
|
|
workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed;
|
|
workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable;
|
|
workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed;
|
|
|
|
device::Kernel::parameters_t params;
|
|
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
size_t NullKernel::getCalBinarySize() const {
|
|
CALuint imageSize;
|
|
if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) {
|
|
LogError("Failed to get the image size!");
|
|
return 0;
|
|
}
|
|
return static_cast<size_t>(imageSize);
|
|
}
|
|
|
|
bool NullKernel::getCalBinary(void* binary, size_t size) const {
|
|
uint calImageSize = 0;
|
|
if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) || size < calImageSize) {
|
|
LogError("CAL failed to save the kernel binary!");
|
|
return false;
|
|
}
|
|
::memcpy(binary, calImage(), calImageSize);
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::create(const std::string& code, const std::string& metadata, const void* binaryCode,
|
|
size_t binarySize) {
|
|
setPreferredSizeMultiple(dev().getAttribs().wavefrontSize);
|
|
|
|
if (!NullKernel::create(code, metadata, binaryCode, binarySize)) {
|
|
return false;
|
|
}
|
|
|
|
// initialize constant buffer sizes
|
|
if (!initConstBuffers()) {
|
|
return false;
|
|
}
|
|
|
|
// Initialize the kernel parameters
|
|
bool result = initParameters();
|
|
|
|
// Wave limiter needs to be initialized after kernel metadata is parsed
|
|
// Since it depends on it.
|
|
waveLimiter_.enable(dev().settings().ciPlus_);
|
|
|
|
if (result) {
|
|
buildError_ = CL_SUCCESS;
|
|
} else {
|
|
result = false;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
Kernel::Kernel(const std::string& name, const Device& gpuDev, const Program& prog,
|
|
const InitData* initData)
|
|
: NullKernel(name, gpuDev, prog) {
|
|
hwPrivateSize_ = 0;
|
|
if (NULL != initData) {
|
|
flags_ = initData->flags_;
|
|
hwPrivateSize_ = initData->hwPrivateSize_;
|
|
hwLocalSize_ = initData->hwLocalSize_;
|
|
}
|
|
// Workgroup info private memory size
|
|
workGroupInfo_.privateMemSize_ = hwPrivateSize_;
|
|
// Default wavesPerSimdHint_
|
|
workGroupInfo_.wavesPerSimdHint_ = ~0U;
|
|
}
|
|
|
|
Kernel::~Kernel() {
|
|
if (calRef_ == NULL) {
|
|
return;
|
|
}
|
|
|
|
{
|
|
Device::ScopedLockVgpus lock(dev());
|
|
|
|
// Release all virtual image objects on all virtual GPUs
|
|
for (uint idx = 0; idx < dev().vgpus().size(); ++idx) {
|
|
dev().vgpus()[idx]->releaseKernel(calImage());
|
|
}
|
|
}
|
|
|
|
if (0 != numCb_) {
|
|
delete[] cbSizes_;
|
|
}
|
|
}
|
|
|
|
const Device& Kernel::dev() const { return reinterpret_cast<const Device&>(gpuDev_); }
|
|
|
|
const Program& Kernel::prog() const { return reinterpret_cast<const Program&>(prog_); }
|
|
|
|
bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) {
|
|
const SC_HWSHADER* shader = reinterpret_cast<const SC_HWSHADER*>(isa);
|
|
|
|
bool result = false;
|
|
AMUabiAddEncoding encoding;
|
|
memset(&encoding, 0, sizeof(AMUabiAddEncoding));
|
|
|
|
size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments +
|
|
sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers +
|
|
sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants;
|
|
char* tmpMem = new char[allocSize];
|
|
if (tmpMem == NULL) {
|
|
LogError("Error allocating memory");
|
|
return false;
|
|
}
|
|
|
|
CalcPtr(encoding.inputs, tmpMem, 0, 0);
|
|
CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage);
|
|
CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments);
|
|
CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry),
|
|
MaxSamplers);
|
|
if (shader->dep.NumIntrlIConstants != 0) {
|
|
CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask),
|
|
MaxConstBuffers);
|
|
}
|
|
AMUabiMultiBinary amuBinary;
|
|
amuABIMultiBinaryCreate(&amuBinary);
|
|
|
|
result = siCreateHwInfo(shader, encoding);
|
|
if (!result) {
|
|
delete[] tmpMem;
|
|
LogWarning("Error Creating program info");
|
|
return false;
|
|
}
|
|
|
|
addLoopConst(shader, encoding);
|
|
|
|
unsigned int outputCount = 0, condOut = 0, earlyExit = 0, globalCount = 0, persistentCount = 0;
|
|
unsigned int symbolCount = 0;
|
|
CALOutputEntry* outputs = 0;
|
|
unsigned int* globalBuffers = 0;
|
|
unsigned int* persistentBuffers = 0;
|
|
AMUabiUserSymbol* symbols = 0;
|
|
|
|
CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps;
|
|
CALConstantBufferMask* constBuffers = encoding.constBuffers;
|
|
uint* inputResources = encoding.inputs;
|
|
CALUavEntry* uav = encoding.uav;
|
|
|
|
uint inputSamplerCount = samplerSize();
|
|
for (uint i = 0; i < inputSamplerCount; ++i) {
|
|
inputSamplers[i].resource = 0;
|
|
inputSamplers[i].sampler = sampler(i)->index_;
|
|
}
|
|
|
|
uint constBufferCount = 2;
|
|
|
|
constBuffers[0].index = 0;
|
|
constBuffers[1].index = 1;
|
|
|
|
uint inputResourceCount = 0;
|
|
|
|
uint uavCount = 0;
|
|
bool cbBound = false;
|
|
bool printfBound = false;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
constBuffers[constBufferCount++].index = arg->index_;
|
|
break;
|
|
case KernelArg::PointerGlobal:
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
if (!cbBound) {
|
|
uav[uavCount].offset = cbId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
cbBound = true;
|
|
break;
|
|
case KernelArg::PrintfBufId:
|
|
if (!printfBound) {
|
|
uav[uavCount].offset = printfId_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
printfBound = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
} else {
|
|
if (UavIdUndefined != uavRaw_) {
|
|
uav[uavCount].offset = uavRaw_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_BUFFER;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
}
|
|
break;
|
|
case KernelArg::Sampler:
|
|
inputSamplers[inputSamplerCount].resource = 0;
|
|
inputSamplers[inputSamplerCount].sampler = arg->index_;
|
|
inputSamplerCount++;
|
|
break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA:
|
|
if (arg->memory_.readOnly_) {
|
|
inputResources[inputResourceCount++] = arg->index_;
|
|
} else {
|
|
uav[uavCount].offset = arg->index_;
|
|
uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED;
|
|
uav[uavCount].dimension = AMU_ABI_DIM_2D;
|
|
uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS;
|
|
uavCount++;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < nullProg().glbCb().size(); ++i) {
|
|
constBuffers[constBufferCount++].index = nullProg().glbCb()[i];
|
|
}
|
|
|
|
encoding.machine = nullDev().hwInfo()->machine_;
|
|
encoding.type = ED_ATI_CAL_TYPE_COMPUTE;
|
|
encoding.inputCount = inputResourceCount;
|
|
encoding.outputCount = outputCount;
|
|
encoding.outputs = outputs;
|
|
encoding.condOut = condOut;
|
|
encoding.earlyExit = earlyExit;
|
|
encoding.globalBuffersCount = globalCount;
|
|
encoding.globalBuffers = globalBuffers;
|
|
encoding.persistentBuffersCount = persistentCount;
|
|
encoding.persistentBuffers = persistentBuffers;
|
|
encoding.constBuffersCount = constBufferCount;
|
|
encoding.inputSamplerMapCount = inputSamplerCount;
|
|
encoding.symbolsCount = symbolCount;
|
|
encoding.symbols = symbols;
|
|
encoding.uavCount = uavCount;
|
|
|
|
amuABIMultiBinaryAddEncoding(amuBinary, &encoding);
|
|
|
|
uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary);
|
|
|
|
amuABIMultiBinaryDestroy(amuBinary);
|
|
|
|
delete[] tmpMem;
|
|
delete[] encoding.progInfos;
|
|
|
|
return (success == 0) ? false : true;
|
|
}
|
|
|
|
void Kernel::findLocalWorkSize(size_t workDim, const amd::NDRange& gblWorkSize,
|
|
amd::NDRange& lclWorkSize) const {
|
|
// Initialize the default workgoup info
|
|
// Check if the kernel has the compiled sizes
|
|
if (workGroupInfo()->compileSize_[0] == 0) {
|
|
// Find the default local workgroup size, if it wasn't specified
|
|
if (lclWorkSize[0] == 0) {
|
|
if ((dev().settings().overrideLclSet & (1 << (workDim - 1))) == 0) {
|
|
// Find threads per group
|
|
size_t thrPerGrp = workGroupInfo()->size_;
|
|
|
|
// Check if kernel uses images
|
|
if ((flags() & ImageEnable) &&
|
|
// and thread group is a multiple value of wavefronts
|
|
((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) &&
|
|
// and it's 2 or 3-dimensional workload
|
|
(workDim > 1) && ((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)) {
|
|
// Use 8x8 workgroup size if kernel has image writes
|
|
if ((flags() & ImageWrite) || (thrPerGrp != nullDev().info().preferredWorkGroupSize_)) {
|
|
lclWorkSize[0] = 8;
|
|
lclWorkSize[1] = 8;
|
|
} else {
|
|
lclWorkSize[0] = 16;
|
|
lclWorkSize[1] = 16;
|
|
}
|
|
if (workDim == 3) {
|
|
lclWorkSize[2] = 1;
|
|
}
|
|
} else {
|
|
size_t tmp = thrPerGrp;
|
|
// Split the local workgroup into the most efficient way
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
size_t div = tmp;
|
|
for (; (gblWorkSize[d] % div) != 0; div--)
|
|
;
|
|
lclWorkSize[d] = div;
|
|
tmp /= div;
|
|
}
|
|
// Assuming DWORD access
|
|
const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2;
|
|
|
|
// Check if we couldn't find optimal workload
|
|
if (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) ||
|
|
// or size is too small for the cache line
|
|
(lclWorkSize[0] < cacheLineMatch)) {
|
|
size_t maxSize = 0;
|
|
size_t maxDim = 0;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (maxSize < gblWorkSize[d]) {
|
|
maxSize = gblWorkSize[d];
|
|
maxDim = d;
|
|
}
|
|
}
|
|
// Use X dimension as high priority. Runtime will assume that
|
|
// X dimension is more important for the address calculation
|
|
if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) {
|
|
lclWorkSize[0] = cacheLineMatch;
|
|
thrPerGrp /= cacheLineMatch;
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 1; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
} else {
|
|
// Check if a local workgroup has the most optimal size
|
|
if (thrPerGrp > maxSize) {
|
|
thrPerGrp = maxSize;
|
|
}
|
|
lclWorkSize[maxDim] = thrPerGrp;
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
if (d != maxDim) {
|
|
lclWorkSize[d] = 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
// Use overrides when app doesn't provide workgroup dimensions
|
|
if (workDim == 1) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE;
|
|
} else if (workDim == 2) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y;
|
|
} else if (workDim == 3) {
|
|
lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X;
|
|
lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y;
|
|
lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z;
|
|
} else {
|
|
assert(0 && "Invalid workDim!");
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
for (uint d = 0; d < workDim; ++d) {
|
|
lclWorkSize[d] = workGroupInfo()->compileSize_[d];
|
|
}
|
|
}
|
|
}
|
|
|
|
void Kernel::setupProgramGrid(VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset,
|
|
const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize,
|
|
const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg,
|
|
const amd::NDRange& glbWorkSizeOrg) const {
|
|
// ABI is always in CB0
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* pGlobalSize =
|
|
reinterpret_cast<uint*>(cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pLocalSize = reinterpret_cast<uint*>(cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize);
|
|
uint* pNumGroups = reinterpret_cast<uint*>(cbBuf + GroupsOffset * ConstBuffer::VectorSize);
|
|
uint* pGlobalOffset =
|
|
reinterpret_cast<uint*>(cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint* pGroupOffset =
|
|
reinterpret_cast<uint*>(cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
uint32_t* debugInfo = reinterpret_cast<uint*>(cbBuf + DebugOffset * ConstBuffer::VectorSize);
|
|
uint* pNDRangeGlobalOffset =
|
|
reinterpret_cast<uint*>(cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize);
|
|
|
|
// Check for 64-bit metadata
|
|
uint glbABIShift = (abi64Bit()) ? 1 : 0;
|
|
|
|
VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_;
|
|
|
|
// Finds local workgroup size
|
|
findLocalWorkSize(workDim, gblWorkSize, lclWorkSize);
|
|
|
|
// Initialize the execution grid block and size/offset
|
|
pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1;
|
|
pGlobalSize[3] = static_cast<uint>(workDim);
|
|
|
|
pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1;
|
|
pLocalSize[3] = 0;
|
|
|
|
pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1;
|
|
pNumGroups[3] = 0;
|
|
|
|
pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0;
|
|
pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0;
|
|
|
|
progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1;
|
|
|
|
progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1;
|
|
|
|
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 1;
|
|
|
|
bool partialGrid = false;
|
|
|
|
// Fill the right values, based on the application request
|
|
switch (workDim) {
|
|
case 3:
|
|
pLocalSize[2] = progGrid->gridBlock.depth = static_cast<CALuint>(lclWorkSize[2]);
|
|
|
|
pGlobalSize[2] = static_cast<CALuint>(glbWorkSizeOrg[2]);
|
|
progGrid->gridSize.depth = static_cast<CALuint>(gblWorkSize[2]);
|
|
progGrid->gridSize.depth /= progGrid->gridBlock.depth;
|
|
pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth;
|
|
|
|
pGlobalOffset[2] = glbWorkOffset[2];
|
|
pGroupOffset[2] = groupOffset[2];
|
|
pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2];
|
|
if (progGrid->partialGridBlock.depth != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.depth++;
|
|
pNumGroups[2]++;
|
|
} else {
|
|
progGrid->partialGridBlock.depth = lclWorkSize[2];
|
|
}
|
|
// Fall through to fill 2D and 1D dimensions...
|
|
case 2:
|
|
pLocalSize[1] = progGrid->gridBlock.height = static_cast<CALuint>(lclWorkSize[1]);
|
|
|
|
pGlobalSize[1] = static_cast<CALuint>(glbWorkSizeOrg[1]);
|
|
progGrid->gridSize.height = static_cast<CALuint>(gblWorkSize[1]);
|
|
progGrid->gridSize.height /= progGrid->gridBlock.height;
|
|
pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height;
|
|
|
|
pGlobalOffset[1] = glbWorkOffset[1];
|
|
pGroupOffset[1] = groupOffset[1];
|
|
pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1];
|
|
if (progGrid->partialGridBlock.height != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.height++;
|
|
pNumGroups[1]++;
|
|
} else {
|
|
progGrid->partialGridBlock.height = lclWorkSize[1];
|
|
}
|
|
// Fall through to fill 1D dimension...
|
|
case 1:
|
|
pLocalSize[0] = progGrid->gridBlock.width = static_cast<CALuint>(lclWorkSize[0]);
|
|
|
|
pGlobalSize[0] = static_cast<CALuint>(glbWorkSizeOrg[0]);
|
|
progGrid->gridSize.width = static_cast<CALuint>(gblWorkSize[0]);
|
|
progGrid->gridSize.width /= progGrid->gridBlock.width;
|
|
pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width;
|
|
|
|
pGlobalOffset[0] = glbWorkOffset[0];
|
|
pGroupOffset[0] = groupOffset[0];
|
|
pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0];
|
|
|
|
// Check if partial workgroup dispatch is required
|
|
progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0];
|
|
if (progGrid->partialGridBlock.width != 0) {
|
|
partialGrid = true;
|
|
// Increment the number of groups
|
|
progGrid->gridSize.width++;
|
|
pNumGroups[0]++;
|
|
} else {
|
|
progGrid->partialGridBlock.width = lclWorkSize[0];
|
|
}
|
|
break;
|
|
default:
|
|
LogWarning("Wrong dimensions. Force to 1x1x1!");
|
|
break;
|
|
}
|
|
|
|
if (!partialGrid) {
|
|
progGrid->partialGridBlock.width = progGrid->partialGridBlock.height =
|
|
progGrid->partialGridBlock.depth = 0;
|
|
}
|
|
|
|
// Calculate the total number of workitems and workgroups
|
|
pGlobalOffset[3] = pGroupOffset[3] = 1;
|
|
for (uint i = 0; i < workDim; ++i) {
|
|
pGlobalOffset[3] *= pGlobalOffset[i];
|
|
pGroupOffset[3] *= pGroupOffset[i];
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
if (abi64Bit()) {
|
|
// Setup the debug info in constant buffer
|
|
reinterpret_cast<uint64_t*>(debugInfo)[1] = gpu.printfDbg().bufOffset();
|
|
// Size in DWORDs
|
|
debugInfo[4] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[4] /= sizeof(uint32_t);
|
|
} else {
|
|
// Setup the debug info in constant buffer
|
|
debugInfo[1] = static_cast<uint32_t>(gpu.printfDbg().bufOffset());
|
|
// Size in DWORDs
|
|
debugInfo[2] = static_cast<uint32_t>(gpu.printfDbg().wiDbgSize());
|
|
debugInfo[2] /= sizeof(uint32_t);
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::initParameters() {
|
|
size_t offset = 0;
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
|
|
// Initialize the arguments for the abstraction layer
|
|
if (arg->isCbNeeded()) {
|
|
desc.name_ = arg->name_.data();
|
|
desc.type_ = arg->type();
|
|
desc.size_ = arg->size(false);
|
|
desc.addressQualifier_ = arg->addressQualifier();
|
|
desc.accessQualifier_ = arg->accessQualifier();
|
|
desc.typeName_ = arg->typeName();
|
|
desc.typeQualifier_ = arg->typeQualifier();
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
if (size == 0) {
|
|
// Local memory for CPU
|
|
size = sizeof(cl_mem);
|
|
}
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
}
|
|
}
|
|
|
|
// Report the allocated local memory size (emulated and hw)
|
|
if (hwLocalSize_ != 0) {
|
|
CondLog((dev().info().localMemSize_ < hwLocalSize_),
|
|
"Requested local size is bigger than reported");
|
|
workGroupInfo_.localMemSize_ = hwLocalSize_;
|
|
}
|
|
|
|
if (!createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0)) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::bindGlobalHwCb(VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const {
|
|
bool result = true;
|
|
|
|
// Bind HW constant buffers used for the global data store
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (const auto& it : gds) {
|
|
uint idx = it.first;
|
|
result = bindResource(gpu, *(it.second), idx, ConstantBuffer, idx);
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const {
|
|
bool result = true;
|
|
|
|
assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!");
|
|
|
|
// Upload the parameters to HW and bind all constant buffers
|
|
for (uint i = 0; i < numCb_; i++) {
|
|
ConstBuffer* cb = gpu.constBufs_[i];
|
|
result &= cb->uploadDataToHw(cbSizes_[i]) &&
|
|
bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset());
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
void Kernel::processMemObjects(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
|
|
bool nativeMem) const {
|
|
// Mark the tracker with a new kernel,
|
|
// so we can avoid checks of the aliased objects
|
|
gpu.memoryDependency().newKernel();
|
|
|
|
// Check all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
|
|
for (size_t i = 0; i < signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const KernelArg* arg = argument(i);
|
|
Memory* memory = NULL;
|
|
|
|
// Find if current argument is a buffer
|
|
if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) &&
|
|
(arg->type_ != KernelArg::PointerHwLocal)) {
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
memory = reinterpret_cast<Memory* const*>(memories)[index];
|
|
} else if (*reinterpret_cast<amd::Memory* const*>(params + desc.offset_) != NULL) {
|
|
memory = dev().getGpuMemory(memories[index]);
|
|
// Synchronize data with other memory instances if necessary
|
|
memory->syncCacheFromHost(gpu);
|
|
}
|
|
|
|
if (memory != NULL) {
|
|
// Validate memory for a dependency in the queue
|
|
gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::loadParameters(VirtualGPU& gpu, const amd::Kernel& kernel, const_address params,
|
|
bool nativeMem) const {
|
|
bool result = true;
|
|
uint i;
|
|
|
|
// Initialize local private ranges
|
|
if (!initLocalPrivateRanges(gpu)) {
|
|
return false;
|
|
}
|
|
|
|
if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) {
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
// Bind a buffer for a dummy read
|
|
result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_);
|
|
}
|
|
|
|
// Find all parameters for the current kernel
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
for (i = 0; i != signature.numParameters(); ++i) {
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
// Set current argument
|
|
if (!setArgument(gpu, kernel, i, params, desc, nativeMem)) {
|
|
result = false;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
// Update the ring ranges and math constant
|
|
setLocalPrivateRanges(gpu);
|
|
|
|
result = bindConstantBuffers(gpu);
|
|
|
|
if (flags() & PrivateFixed) {
|
|
result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_);
|
|
}
|
|
|
|
// Setup debug output buffer (if printf is active)
|
|
if (flags() & PrintfOutput) {
|
|
gpu.addVmMemory(gpu.printfDbg().dbgBuffer());
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun, bool lastDoppCmd,
|
|
bool pfpaDoppCmd) const {
|
|
const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal();
|
|
|
|
auto compProg = static_cast<gsl::ComputeProgramObject*>(gpu.gslKernelDesc()->func_);
|
|
compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu));
|
|
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock,
|
|
&dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_,
|
|
lastDoppCmd, pfpaDoppCmd);
|
|
|
|
gpu.flushCUCaches();
|
|
|
|
gpu.eventEnd(MainEngine, *calEvent);
|
|
|
|
// Unbind all resources
|
|
unbindResources(gpu, *calEvent, lastRun);
|
|
|
|
return true;
|
|
}
|
|
|
|
static size_t counter = 0;
|
|
void Kernel::debug(VirtualGPU& gpu) const {
|
|
std::fstream stubWrite;
|
|
address src = NULL;
|
|
|
|
std::cerr << "--- " << name_ << " ---" << std::endl;
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
const Memory* gpuMem = gpu.slots_[i].memory_;
|
|
std::stringstream fileName;
|
|
bool bufferObj =
|
|
((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) ||
|
|
(arg->type_ == KernelArg::PointerHwConst));
|
|
|
|
if ((src != NULL) && arg->isCbNeeded() && bufferObj) {
|
|
address memory = gpu.cb(arg->cbIdx_)->sysMemCopy();
|
|
std::cerr.setf(std::ios::hex);
|
|
uint* location =
|
|
reinterpret_cast<uint*>(src + *reinterpret_cast<uint*>(memory + arg->cbPos_));
|
|
std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl;
|
|
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
if (((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D)) ||
|
|
((src == NULL) && bufferObj)) {
|
|
//@todo Replace the current map
|
|
Memory* resource = const_cast<Memory*>(gpu.slots_[i].memory_);
|
|
void* memory = resource->map(&gpu);
|
|
uint* location = reinterpret_cast<uint*>(memory);
|
|
std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl;
|
|
// Dump the data
|
|
fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
|
|
// Write data to a file
|
|
if (stubWrite.is_open()) {
|
|
stubWrite.write(reinterpret_cast<char*>(location), gpuMem->size());
|
|
stubWrite.close();
|
|
}
|
|
resource->unmap(&gpu);
|
|
}
|
|
}
|
|
|
|
for (uint i = 0; i < gpu.constBufs_.size(); ++i) {
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory + gpu.cb(i)->wrtOffset()),
|
|
gpu.cb(i)->lastWrtSize());
|
|
gpu.constBufs_[i]->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
const Program::HwConstBuffers& gds = prog().glbHwCb();
|
|
for (const auto& it : gds) {
|
|
uint idx = it.first;
|
|
std::stringstream fileName;
|
|
fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin";
|
|
stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary));
|
|
if (stubWrite.is_open()) {
|
|
address memory = reinterpret_cast<address>(it.second->map(&gpu, Resource::ReadOnly));
|
|
// Check if we have OpenCL program
|
|
stubWrite.write(reinterpret_cast<char*>(memory), it.second->size());
|
|
it.second->unmap(&gpu);
|
|
stubWrite.close();
|
|
}
|
|
}
|
|
}
|
|
|
|
bool Kernel::initConstBuffers() {
|
|
bool result = true;
|
|
size_t i;
|
|
|
|
assert((numCb_ != 0) && "We have 0 constant buffers!");
|
|
|
|
// Allocate an array for CB sizes
|
|
cbSizes_ = new size_t[numCb_];
|
|
if (cbSizes_ == NULL) {
|
|
return false;
|
|
}
|
|
memset(cbSizes_, 0, sizeof(size_t) * numCb_);
|
|
|
|
// CB0 is reserved for ABI data
|
|
cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize;
|
|
|
|
// Find sizes of all constant buffers
|
|
for (i = 0; i < arguments_.size(); ++i) {
|
|
const KernelArg* arg = argument(i);
|
|
size_t size = arg->cbPos_ + arg->size(true);
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize;
|
|
}
|
|
// Do we need a CB?
|
|
if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) {
|
|
cbSizes_[arg->cbIdx_] = size;
|
|
}
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::setInternalSamplers(VirtualGPU& gpu) const {
|
|
for (uint i = 0; i < samplerSize(); ++i) {
|
|
const KernelArg* arg = sampler(i);
|
|
uint state = arg->cbPos_;
|
|
uint idx = arg->index_;
|
|
|
|
if (gpu.cal()->samplersState_[idx] != state) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool Kernel::setArgument(VirtualGPU& gpu, const amd::Kernel& kernel,
|
|
uint idx, const_address params,
|
|
const amd::KernelParameterDescriptor& desc,
|
|
bool nativeMem) const {
|
|
size_t size = desc.size_;
|
|
const void* param = params + desc.offset_;
|
|
bool result = true;
|
|
const KernelArg* arg;
|
|
address memory;
|
|
size_t argSize;
|
|
static const bool waitOnBusyEngine = true;
|
|
|
|
assert((idx < arguments_.size()) && "Param index is out of range!");
|
|
|
|
arg = argument(idx);
|
|
assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!");
|
|
memory = gpu.cb(1)->sysMemCopy();
|
|
argSize = arg->size(true);
|
|
|
|
// Bind the global heap for emulation mode
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
// Fall through ...
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerHwConst:
|
|
case KernelArg::PointerGlobal: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory*>(memories[index]);
|
|
} else if (memories[index] != nullptr) {
|
|
gpuMem = dev().getGpuMemory(memories[index]);
|
|
}
|
|
bool forceZeroOffset = false;
|
|
|
|
if (gpuMem == NULL) {
|
|
forceZeroOffset = true;
|
|
gpuMem = dev().getGpuMemory(dev().dummyPage());
|
|
}
|
|
uint64_t offset = gpuMem->pinOffset();
|
|
|
|
// Make sure the passed argument is a buffer object
|
|
if (!gpuMem->cal()->buffer_) {
|
|
LogError("The kernel buffer argument isn't a buffer object!");
|
|
return false;
|
|
}
|
|
|
|
if (arg->type_ == KernelArg::PointerHwConst) {
|
|
// Bind current memory object with the kernel
|
|
if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) {
|
|
return false;
|
|
}
|
|
assert((offset == 0) && "No offset for HW CB");
|
|
// Add a fake offset to make sure (ptr != NULL) is TRUE
|
|
offset = 1;
|
|
} else {
|
|
ResourceType type = ArgumentHeapBuffer;
|
|
|
|
// Check if kernel expects UAV binding
|
|
if (arg->memory_.uavBuf_) {
|
|
type = ArgumentBuffer;
|
|
} else {
|
|
// Bind global buffer to UAV this buffer is bound to
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind current memory object with the kernel
|
|
// Note: it's a fake binding, if the buffer is part of
|
|
// the global heap
|
|
if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) {
|
|
return false;
|
|
}
|
|
|
|
// Update offset only if we bind HeapBuffer or
|
|
// it's global address space in UAV setup on SI+
|
|
offset += gpuMem->hbOffset();
|
|
if (!forceZeroOffset) {
|
|
assert((offset != 0) && "Offset 0 with a real allocation!");
|
|
}
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
if (forceZeroOffset) {
|
|
offset = 0;
|
|
}
|
|
|
|
// Copy memory offset into the constant buffer
|
|
if (abi64Bit()) {
|
|
*(reinterpret_cast<uint64_t*>(memory + arg->cbPos_)) = offset;
|
|
} else {
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = static_cast<uint>(offset);
|
|
}
|
|
} break;
|
|
case KernelArg::Image1D:
|
|
case KernelArg::Image2D:
|
|
case KernelArg::Image3D:
|
|
case KernelArg::Image1DB:
|
|
case KernelArg::Image1DA:
|
|
case KernelArg::Image2DA: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(params + kernel.parameters().memoryObjOffset());
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory*>(memories[index]);
|
|
} else if (memories[index] != nullptr) {
|
|
gpuMem = dev().getGpuMemory(memories[index]);
|
|
}
|
|
|
|
if (gpuMem == NULL) {
|
|
return false;
|
|
}
|
|
// Make sure the passed argument is an image object
|
|
if (gpuMem->cal()->buffer_) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return false;
|
|
}
|
|
|
|
ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite;
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) {
|
|
return false;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Copy image constants into the constant buffer
|
|
if (gpuMem->owner() != NULL) {
|
|
copyImageConstants(gpuMem->owner()->asImage(),
|
|
reinterpret_cast<ImageConstants*>(memory + arg->cbPos_));
|
|
}
|
|
|
|
// Handle DOPP texture resource
|
|
gslMemObject gslMem = gpuMem->gslResource();
|
|
if (gslMem->getAttribs().isDOPPDesktopTexture) {
|
|
gpu.addVmMemory(gpuMem);
|
|
}
|
|
} break;
|
|
case KernelArg::Sampler: {
|
|
uint32_t index = desc.info_.arrayIndex_;
|
|
const amd::Sampler* amdSampler = reinterpret_cast<amd::Sampler* const*>(params +
|
|
kernel.parameters().samplerObjOffset())[index];
|
|
uint idx = arg->index_;
|
|
uint32_t state = amdSampler->state();
|
|
|
|
if (state != gpu.cal()->samplersState_[idx]) {
|
|
setSampler(gpu, state, idx);
|
|
gpu.cal_.samplersState_[idx] = state;
|
|
}
|
|
|
|
// Copy sampler state into the constant buffer
|
|
*(reinterpret_cast<uint32_t*>(memory + arg->cbPos_)) = state;
|
|
} break;
|
|
case KernelArg::Counter: {
|
|
gpu::Memory* gpuMem = NULL;
|
|
if (nativeMem) {
|
|
gpuMem = *reinterpret_cast<Memory* const*>(param);
|
|
} else if (*reinterpret_cast<amd::Memory* const*>(param) != NULL) {
|
|
gpuMem = dev().getGpuMemory(*reinterpret_cast<amd::Memory* const*>(param));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, waitOnBusyEngine);
|
|
|
|
// Bind current memory object with the shader.
|
|
if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) {
|
|
return false;
|
|
}
|
|
} break;
|
|
case KernelArg::PointerHwLocal: {
|
|
// Calculate current offset in the local ring
|
|
uint offset = gpu.cal_.localSize;
|
|
uint extra = amd::alignUp(offset, arg->alignment_) - offset;
|
|
|
|
offset = amd::alignUp(offset, arg->alignment_);
|
|
size_t memSize = *static_cast<const uintptr_t*>(param);
|
|
|
|
// Allocate new memory from the local ring
|
|
gpu.cal_.localSize += static_cast<uint>(memSize) + extra;
|
|
// Copy current local argument's offset into the CB
|
|
*(reinterpret_cast<uint*>(memory + arg->cbPos_)) = offset;
|
|
|
|
CondLog((gpu.cal_.localSize > dev().info().localMemSize_),
|
|
"Requested local size is bigger than reported!");
|
|
} break;
|
|
case KernelArg::Float:
|
|
case KernelArg::Double:
|
|
case KernelArg::Char:
|
|
case KernelArg::UChar:
|
|
case KernelArg::Short:
|
|
case KernelArg::UShort:
|
|
case KernelArg::Int:
|
|
case KernelArg::UInt:
|
|
case KernelArg::Long:
|
|
case KernelArg::ULong:
|
|
if (size != argSize) {
|
|
LogWarning("Parameter's sizes are unmatched!");
|
|
}
|
|
// Fall through ...
|
|
case KernelArg::Struct:
|
|
case KernelArg::Union: {
|
|
size_t specVec = arg->specialVector();
|
|
if (specVec != 0) {
|
|
uint iter = (arg->size_ / KernelArg::VectorSizeLimit);
|
|
for (uint i = 0; i < iter; ++i) {
|
|
amd::Os::fastMemcpy(
|
|
(memory + arg->cbPos_ + i * ConstBuffer::VectorSize),
|
|
reinterpret_cast<const char*>(param) + i * KernelArg::VectorSizeLimit * specVec,
|
|
specVec * KernelArg::VectorSizeLimit);
|
|
}
|
|
} else {
|
|
// Copy data into the CB
|
|
amd::Os::fastMemcpy((memory + arg->cbPos_), param, size);
|
|
}
|
|
} break;
|
|
default:
|
|
LogError("Unhandled argument's type!");
|
|
break;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const {
|
|
// Initialize HW local
|
|
gpu.cal_.localSize = hwLocalSize_;
|
|
|
|
// Bind the global buffer if emulated local or private memory
|
|
// was allocated by the kernel
|
|
if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) && (uavRaw_ != UavIdUndefined)) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the global buffer if emulated constant buffers are enabled
|
|
if (cbId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Bind the printf buffer
|
|
if (printfId_ != UavIdUndefined) {
|
|
if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) {
|
|
return false;
|
|
}
|
|
}
|
|
// Initialize the iterations count
|
|
gpu.cal_.iterations_ = 1;
|
|
|
|
return true;
|
|
}
|
|
|
|
void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const {
|
|
address cbBuf = gpu.cb(0)->sysMemCopy();
|
|
uint* data;
|
|
uint gridSize =
|
|
gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth;
|
|
uint blockSize =
|
|
gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * gpu.cal()->gridBlock.depth;
|
|
|
|
//! \todo validate if the compiler still generates PrivateFixed
|
|
if (flags() & PrivateFixed) {
|
|
// Update private ring
|
|
data = reinterpret_cast<uint*>(cbBuf + PrivateRingOffset * ConstBuffer::VectorSize);
|
|
Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage());
|
|
|
|
if (abi64Bit()) {
|
|
reinterpret_cast<uint64_t*>(data)[0] = gpuMemory->hbOffset();
|
|
data[2] = 0;
|
|
data[3] = 0;
|
|
} else {
|
|
data[0] = static_cast<uint>(gpuMemory->hbOffset());
|
|
data[1] = 0;
|
|
data[2] = data[3] = 0;
|
|
}
|
|
gpu.addVmMemory(gpuMemory);
|
|
}
|
|
|
|
// Copy the math lib constants
|
|
amd::Os::fastMemcpy((cbBuf + MathLibOffset * ConstBuffer::VectorSize), MathLibConst,
|
|
sizeof(MathLibConst));
|
|
|
|
// Update the offset to the global data
|
|
if (prog().glbData() != NULL) {
|
|
gpu.addVmMemory(prog().glbData());
|
|
uint64_t glbDataOffset = prog().glbData()->hbOffset();
|
|
if (abi64Bit()) {
|
|
*reinterpret_cast<uint64_t*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
|
|
glbDataOffset;
|
|
} else {
|
|
*reinterpret_cast<uint*>(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) =
|
|
static_cast<uint>(glbDataOffset);
|
|
}
|
|
}
|
|
|
|
// Split workload if it was requested
|
|
if ((gpu.cal_.iterations_ < 2) && gpu.dmaFlushMgmt().dispatchSplitSize() != 0) {
|
|
uint totalSize = gridSize * blockSize;
|
|
if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) {
|
|
gpu.cal_.iterations_ =
|
|
std::max(gpu.cal_.iterations_, (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize()));
|
|
}
|
|
}
|
|
|
|
// Initialize the number of iterations to the grid size
|
|
if (flags() & PrintfOutput) {
|
|
gpu.cal_.iterations_ = gridSize;
|
|
}
|
|
}
|
|
|
|
void Kernel::setSampler(VirtualGPU& gpu, uint32_t state, uint physUnit) const {
|
|
// All CAL sampler's parameters are in floats
|
|
float gslAddress = GSL_CLAMP_TO_BORDER;
|
|
float gslMinFilter = GSL_MIN_NEAREST;
|
|
float gslMagFilter = GSL_MAG_NEAREST;
|
|
|
|
state &= ~amd::Sampler::StateNormalizedCoordsMask;
|
|
|
|
// Program the sampler address mode
|
|
switch (state & amd::Sampler::StateAddressMask) {
|
|
case amd::Sampler::StateAddressRepeat:
|
|
gslAddress = GSL_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClampToEdge:
|
|
gslAddress = GSL_CLAMP_TO_EDGE;
|
|
break;
|
|
case amd::Sampler::StateAddressMirroredRepeat:
|
|
gslAddress = GSL_MIRRORED_REPEAT;
|
|
break;
|
|
case amd::Sampler::StateAddressClamp:
|
|
case amd::Sampler::StateAddressNone:
|
|
default:
|
|
break;
|
|
}
|
|
state &= ~amd::Sampler::StateAddressMask;
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress);
|
|
|
|
// Program texture filter mode
|
|
if (state == amd::Sampler::StateFilterLinear) {
|
|
gslMinFilter = GSL_MIN_LINEAR;
|
|
gslMagFilter = GSL_MAG_LINEAR;
|
|
}
|
|
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter);
|
|
gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter);
|
|
}
|
|
|
|
bool Kernel::bindResource(VirtualGPU& gpu, const Memory& memory, uint paramIdx, ResourceType type,
|
|
uint physUnit, size_t offset) const {
|
|
gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN;
|
|
|
|
// Find the original resource name from the IL program
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
if (gpu.state_.boundGlobal_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundGlobal_ = true;
|
|
physUnit = uavRaw_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentCbID:
|
|
if (gpu.state_.boundCb_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundCb_ = true;
|
|
physUnit = cbId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentPrintfID:
|
|
if (gpu.state_.boundPrintf_) {
|
|
return true;
|
|
}
|
|
gpu.state_.boundPrintf_ = true;
|
|
physUnit = printfId_;
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
break;
|
|
case ArgumentHeapBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageRead:
|
|
case ArgumentImageWrite:
|
|
case ArgumentConstBuffer:
|
|
case ArgumentCounter:
|
|
// Early exit if resource is bound already
|
|
if (gpu.slots_[paramIdx].state_.bound_) {
|
|
return true;
|
|
}
|
|
|
|
// Associate resource with the slot
|
|
gpu.slots_[paramIdx].memory_ = &memory;
|
|
|
|
// Mark resource as bound
|
|
gpu.slots_[paramIdx].state_.bound_ = true;
|
|
|
|
if (type == ArgumentCounter) {
|
|
GpuEvent calEvent;
|
|
|
|
// Bind memory with atomic counter
|
|
gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource());
|
|
|
|
// Copy the counter value into GDS
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false);
|
|
gpu.eventEnd(MainEngine, calEvent);
|
|
|
|
// Mark resource as busy
|
|
memory.setBusy(gpu, calEvent);
|
|
return true;
|
|
} else if (type == ArgumentHeapBuffer) {
|
|
// We return here, since we just have to bind the global heap
|
|
return true;
|
|
} else if (type == ArgumentConstBuffer) {
|
|
gpu.slots_[paramIdx].state_.constant_ = true;
|
|
}
|
|
break;
|
|
case ArgumentUavID:
|
|
case ConstantBuffer:
|
|
break;
|
|
default:
|
|
LogPrintfError("Unspecified argument type ()!", type);
|
|
return false;
|
|
}
|
|
|
|
gslMemObject gslMem = NULL;
|
|
// Use global address space on SI+ for UAV setup
|
|
if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) ||
|
|
(type == ArgumentPrintfID)) {
|
|
gslMem = dev().heap().resource().gslResource();
|
|
} else {
|
|
gslMem = memory.gslResource();
|
|
}
|
|
|
|
// Associate memory with the physical unit, the actual binding
|
|
bool result = true;
|
|
switch (type) {
|
|
case GlobalBuffer:
|
|
case ArgumentBuffer:
|
|
case ArgumentImageWrite:
|
|
case ArgumentUavID:
|
|
case ArgumentCbID:
|
|
case ArgumentPrintfID:
|
|
if (type == ArgumentImageWrite) {
|
|
uavType = GSL_UAV_TYPE_TYPED;
|
|
} else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) {
|
|
uavType = GSL_UAV_TYPE_TYPELESS;
|
|
}
|
|
if (gpu.cal_.uavs_[physUnit] != gslMem) {
|
|
result = gpu.setUAVBuffer(physUnit, gslMem, uavType);
|
|
gpu.setUAVChannelOrder(physUnit, gslMem);
|
|
gpu.cal_.uavs_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ConstantBuffer:
|
|
case ArgumentConstBuffer:
|
|
if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) {
|
|
result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize());
|
|
gpu.cal_.constBuffers_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
case ArgumentImageRead:
|
|
if (gpu.cal_.readImages_[physUnit] != gslMem) {
|
|
result = gpu.setInput(physUnit, gslMem);
|
|
gpu.cal_.readImages_[physUnit] = gslMem;
|
|
}
|
|
break;
|
|
default:
|
|
result = false;
|
|
assert(false);
|
|
break;
|
|
}
|
|
if (!result) {
|
|
LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem);
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
void Kernel::unbindResources(VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const {
|
|
// Make sure unbind occurs on the last run, in case the execution had a split
|
|
if (lastRun) {
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
if (gpu.slots_[i].state_.bound_) {
|
|
GpuEvent calEventTmp = calEvent;
|
|
|
|
if (KernelArg::Counter == argument(i)->type_) {
|
|
// Copy the counter value from GDS
|
|
gpu.eventBegin(MainEngine);
|
|
gpu.cs()->syncAtomicCounter(argument(i)->index_, true);
|
|
gpu.eventEnd(MainEngine, calEventTmp);
|
|
} else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) {
|
|
// Signal the abstraction layer that GPU memory is dirty
|
|
if (gpu.slots_[i].memory_->owner() != NULL) {
|
|
gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev());
|
|
}
|
|
}
|
|
// Mark resource as busy
|
|
gpu.slots_[i].memory_->setBusy(gpu, calEventTmp);
|
|
|
|
gpu.slots_[i].state_.value_ = 0;
|
|
}
|
|
}
|
|
|
|
// Unbind the global buffer
|
|
gpu.state_.boundGlobal_ = false;
|
|
|
|
// Unbind the constant buffer
|
|
gpu.state_.boundCb_ = false;
|
|
|
|
// Unbind the pritnf buffer
|
|
gpu.state_.boundPrintf_ = false;
|
|
}
|
|
|
|
// Mark CB busy
|
|
for (uint i = 0; i < numCb_; ++i) {
|
|
gpu.constBufs_[i]->setBusy(gpu, calEvent);
|
|
}
|
|
|
|
// Set the event object for the scratch buffer
|
|
if (workGroupInfo()->scratchRegs_ > 0) {
|
|
dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent);
|
|
}
|
|
}
|
|
|
|
void Kernel::copyImageConstants(const amd::Image* amdImage, ImageConstants* imageData) const {
|
|
imageData->width_ = static_cast<uint32_t>(amdImage->getWidth());
|
|
imageData->height_ = static_cast<uint32_t>(amdImage->getHeight());
|
|
imageData->depth_ = static_cast<uint32_t>(amdImage->getDepth());
|
|
imageData->dataType_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_data_type);
|
|
|
|
imageData->widthFloat_ = 1.f / static_cast<float>(amdImage->getWidth());
|
|
imageData->heightFloat_ = 1.f / static_cast<float>(amdImage->getHeight());
|
|
imageData->depthFloat_ = 1.f / static_cast<float>(amdImage->getDepth());
|
|
imageData->channelOrder_ = static_cast<uint32_t>(amdImage->getImageFormat().image_channel_order);
|
|
}
|
|
|
|
union MetadataVersion {
|
|
struct {
|
|
uint64_t revision_ : 16; //!< LLVM metadata revision
|
|
uint64_t minorVersion_ : 16; //!< LLVM metadata minor verison
|
|
uint64_t majorVersion_ : 16; //!< LLVM metadata major version
|
|
};
|
|
uint64_t value_;
|
|
MetadataVersion(uint mj, uint mi, uint rev) : value_(0) {
|
|
revision_ = rev;
|
|
minorVersion_ = mi;
|
|
majorVersion_ = mj;
|
|
}
|
|
MetadataVersion() : value_(0) {}
|
|
};
|
|
|
|
//! Version of metadata with buffer attributes
|
|
const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88);
|
|
|
|
//! Version of metadata with type qualifiers
|
|
const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103);
|
|
|
|
bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) {
|
|
// Initialize workgroup info
|
|
workGroupInfo_.size_ = nullDev().info().preferredWorkGroupSize_;
|
|
MetadataVersion mdVersion;
|
|
|
|
// Find first tag
|
|
size_t pos = metaData.find(";");
|
|
|
|
// Loop through all provided program arguments
|
|
while (pos != std::string::npos) {
|
|
KernelArg arg;
|
|
|
|
if (!expect(metaData, &pos, ";")) {
|
|
break;
|
|
}
|
|
|
|
arg.type_ = KernelArg::NoType;
|
|
|
|
// Loop through all available metadata types
|
|
for (uint i = 0; i < ArgStateTotal; ++i) {
|
|
uint tmpValue;
|
|
// Find the name tag
|
|
if (expect(metaData, &pos, ArgState[i].typeName_)) {
|
|
switch (ArgState[i].type_) {
|
|
case KernelArg::NoType:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Reflection: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
if (!getstring(metaData, &pos, &tmpArg->typeName_)) {
|
|
LogWarning("Couldn't get the argument type!");
|
|
return false;
|
|
}
|
|
}
|
|
continue;
|
|
case KernelArg::ConstArg: {
|
|
uint argIdx;
|
|
// Read the argument's index
|
|
if (!getuint(metaData, &pos, &argIdx)) {
|
|
LogWarning("Couldn't get the argument index!");
|
|
return false;
|
|
}
|
|
KernelArg* tmpArg = arguments_[argIdx];
|
|
tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
continue;
|
|
case KernelArg::Grouping:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSize_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WrkgrpSize: {
|
|
uint temp;
|
|
// Read the workgroup size
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the workgroup size!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.size_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::Wavefront:
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::UavId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
return false;
|
|
}
|
|
break;
|
|
case KernelArg::ConstBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &cbId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::PrintfBufId:
|
|
// Read index
|
|
if (!getuint(metaData, &pos, &printfId_)) {
|
|
return false;
|
|
}
|
|
continue;
|
|
case KernelArg::MetadataVersion:
|
|
// Read metadata version
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.majorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.minorVersion_ = tmpValue;
|
|
if (!getuint(metaData, &pos, &tmpValue)) {
|
|
return false;
|
|
}
|
|
mdVersion.revision_ = tmpValue;
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::GroupingHint:
|
|
for (uint j = 0; j < 3; ++j) {
|
|
uint temp;
|
|
// Read the compile workgroup size hint
|
|
if (!getuint(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile workgroup size hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileSizeHint_[j] = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::VecTypeHint: {
|
|
std::string temp;
|
|
// Read the compile vector type hint
|
|
if (!getstring(metaData, &pos, &temp)) {
|
|
LogWarning("Couldn't get the compile vector type hint!");
|
|
return false;
|
|
}
|
|
workGroupInfo_.compileVecTypeHint_ = temp;
|
|
}
|
|
// Process next ...
|
|
continue;
|
|
case KernelArg::WavesPerSimdHint: {
|
|
uint tmp;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
return false;
|
|
}
|
|
workGroupInfo_.wavesPerSimdHint_ = tmp;
|
|
}
|
|
continue;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
std::string argName;
|
|
// Save the argument type
|
|
arg.type_ = ArgState[i].type_;
|
|
|
|
// Check if we should expect the name
|
|
if (ArgState[i].name_) {
|
|
// Read the parameter's name
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.name_ = argName;
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.location_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the resource data type
|
|
if (ArgState[i].resType_) {
|
|
uint k;
|
|
// Search for the data type
|
|
for (k = 0; k < DataTypeTotal; k++) {
|
|
if (expect(metaData, &pos, DataType[k].tagName_)) {
|
|
arg.dataType_ = DataType[k].type_;
|
|
if (arg.type_ == KernelArg::Image) {
|
|
flags_ |= ImageEnable;
|
|
if (expect(metaData, &pos, "RO:")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
} else if (expect(metaData, &pos, "RW:")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
flags_ |= ImageWrite;
|
|
} else if (expect(metaData, &pos, "WO:")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
flags_ |= ImageWrite;
|
|
}
|
|
} else if (arg.type_ == KernelArg::Value) {
|
|
arg.type_ = DataType[k].type_;
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
if (k == DataTypeTotal) {
|
|
LogWarning("We couldn't find the argument's type.");
|
|
if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
//! @todo temporary condition
|
|
if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) {
|
|
assert(false);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect the data size
|
|
if (ArgState[i].size_) {
|
|
uint tmpData;
|
|
// Read the data size
|
|
if (!getuint(metaData, &pos, &tmpData)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
if (arg.type_ == KernelArg::Image) {
|
|
arg.type_ = arg.dataType_;
|
|
arg.index_ = tmpData;
|
|
} else {
|
|
arg.size_ = tmpData;
|
|
}
|
|
}
|
|
|
|
if (arg.type_ == KernelArg::Counter) {
|
|
// Read a counter index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a counter index!");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Check if we should expect a resource index
|
|
if (ArgState[i].cbIdx_) {
|
|
// Read resource index
|
|
if (!getuint(metaData, &pos, &arg.cbIdx_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
|
|
if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) {
|
|
numCb_ = arg.cbIdx_;
|
|
}
|
|
}
|
|
// Check if we should expect the CB offset
|
|
if (ArgState[i].cbPos_) {
|
|
// Read position in the constant buffer
|
|
if (!getuint(metaData, &pos, &arg.cbPos_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check if we should expect the buffer type
|
|
if (ArgState[i].buf_) {
|
|
// Read the buffer type
|
|
if (!getword(metaData, &pos, argName)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
arg.buf_ = argName;
|
|
|
|
for (uint k = 0; k < BufTypeTotal; ++k) {
|
|
if (0 == arg.buf_.compare(BufType[k].tagName_)) {
|
|
// Update the parameter type
|
|
arg.type_ = BufType[k].type_;
|
|
// Check if we should expect a buffer index
|
|
if (BufType[k].number_) {
|
|
// Read a buffer index
|
|
if (!getuint(metaData, &pos, &arg.index_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the required alignment
|
|
if (BufType[k].alignment_) {
|
|
// Read data alignment
|
|
if (!getuint(metaData, &pos, &arg.alignment_)) {
|
|
LogWarning("Couldn't get a kernel argument!");
|
|
return false;
|
|
}
|
|
}
|
|
// Check for the buffer's attribute
|
|
if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) {
|
|
if (expect(metaData, &pos, "RO")) {
|
|
arg.memory_.readOnly_ = 1;
|
|
} else if (expect(metaData, &pos, "RW")) {
|
|
arg.memory_.readWrite_ = 1;
|
|
} else if (expect(metaData, &pos, "WO")) {
|
|
arg.memory_.writeOnly_ = 1;
|
|
}
|
|
}
|
|
// Check for the type qualifier
|
|
if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) {
|
|
uint tmp;
|
|
pos += 1;
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get volatile type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (!getuint(metaData, &pos, &tmp)) {
|
|
LogWarning("Couldn't get restrict type!");
|
|
return false;
|
|
}
|
|
if (tmp == 1) {
|
|
arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// Find multiple UAV references
|
|
switch (arg.type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
case KernelArg::UavId:
|
|
uavRefCount[arg.index_]++;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// Check if this argument will be passed in constant buffer
|
|
if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) {
|
|
if (arg.type_ == KernelArg::Sampler) {
|
|
// Serach for the passed by value sampler
|
|
for (uint i = 0; i < argSize(); ++i) {
|
|
KernelArg* value = arguments_[i];
|
|
if (0 == value->name_.compare(arg.name_)) {
|
|
value->type_ = arg.type_;
|
|
value->index_ = arg.index_;
|
|
value->location_ = 0;
|
|
break;
|
|
}
|
|
}
|
|
} else {
|
|
KernelArg* argument = new KernelArg(arg);
|
|
if (argument != NULL) {
|
|
addArgument(argument);
|
|
} else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// Check if we have a pre-defined sampler
|
|
else if (arg.type_ == KernelArg::Sampler) {
|
|
KernelArg* sampler = new KernelArg(arg);
|
|
if (sampler != NULL) {
|
|
addSampler(sampler);
|
|
} else {
|
|
LogError("Couldn't allocate memory!");
|
|
return false;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Next argument
|
|
pos = metaData.find(";", pos);
|
|
}
|
|
|
|
// Find arguments that will require a reallocation
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
// Check if can't use a dedicated UAV,
|
|
// so realloc memory in the heap
|
|
arg->memory_.realloc_ = false;
|
|
arg->memory_.uavBuf_ = true;
|
|
break;
|
|
case KernelArg::PointerHwConst:
|
|
arg->memory_.realloc_ = true;
|
|
break;
|
|
case KernelArg::UavId:
|
|
uavRaw_ = arg->index_;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
// If argument marked with the const qualifier, then overwrite
|
|
// Read-Write attributes, since compiler doesn't mark it properly
|
|
if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) {
|
|
arg->memory_.readOnly_ = 1;
|
|
arg->memory_.readWrite_ = 0;
|
|
arg->memory_.writeOnly_ = 0;
|
|
}
|
|
}
|
|
|
|
if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) {
|
|
// Find if default UAV is already assigned to an argument
|
|
for (uint i = 0; i < arguments_.size(); ++i) {
|
|
KernelArg* arg = arguments_[i];
|
|
switch (arg->type_) {
|
|
case KernelArg::PointerGlobal:
|
|
case KernelArg::PointerConst:
|
|
case KernelArg::PointerLocal:
|
|
case KernelArg::PointerPrivate:
|
|
if (uavRaw_ == arg->index_) {
|
|
uavRaw_ = UavIdUndefined;
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// There is always 1 constant buffer, associated with the kernel
|
|
numCb_++;
|
|
assert((numCb_ <= MaxConstBuffersArguments) &&
|
|
"Runtime doesn't support more than max CBs for arguments!");
|
|
|
|
// Limit workgroup size if requested
|
|
if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) {
|
|
size_t temp = 1;
|
|
workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_;
|
|
for (uint j = 0; j < 3; ++j) {
|
|
if (workGroupInfo()->compileSize_[j] != 0) {
|
|
temp *= workGroupInfo_.compileSize_[j];
|
|
}
|
|
}
|
|
// Report a compilation error if requested compile size doesn't
|
|
// match the required workgroup size
|
|
if (workGroupInfo()->size_ < temp) {
|
|
char str[8];
|
|
intToStr(workGroupInfo_.size_, str, 8);
|
|
buildError_ = CL_OUT_OF_RESOURCES;
|
|
buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of ";
|
|
buildLog_ += str;
|
|
buildLog_ += " elements\n";
|
|
LogError(buildLog().c_str());
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Read/Write attributes are provided in metadata
|
|
if (mdVersion.value_ >= MetadataBufferAttributes.value_) {
|
|
rwAttributes_ = true;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return HSAIL_ARGTYPE_POINTER;
|
|
case ARG_TYPE_QUEUE:
|
|
return HSAIL_ARGTYPE_QUEUE;
|
|
case ARG_TYPE_VALUE:
|
|
return HSAIL_ARGTYPE_VALUE;
|
|
case ARG_TYPE_IMAGE:
|
|
return HSAIL_ARGTYPE_IMAGE;
|
|
case ARG_TYPE_SAMPLER:
|
|
return HSAIL_ARGTYPE_SAMPLER;
|
|
case ARG_TYPE_ERROR:
|
|
default:
|
|
return HSAIL_ARGTYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return argInfo->arg.pointer.align;
|
|
default:
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return HSAIL_ACCESS_TYPE_RO;
|
|
case ACCESS_TYPE_WO:
|
|
return HSAIL_ACCESS_TYPE_WO;
|
|
case ACCESS_TYPE_RW:
|
|
default:
|
|
return HSAIL_ACCESS_TYPE_RW;
|
|
}
|
|
}
|
|
return HSAIL_ACCESS_TYPE_NONE;
|
|
}
|
|
|
|
inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT_EMU:
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return HSAIL_ADDRESS_LOCAL;
|
|
case PTR_MT_SCRATCH_EMU:
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
case PTR_MT_ERROR:
|
|
default:
|
|
LogError("Unsupported address type");
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
} else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
} else if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return HSAIL_ADDRESS_GLOBAL;
|
|
}
|
|
return HSAIL_ADDRESS_ERROR;
|
|
}
|
|
|
|
/* f16 returns f32 - workaround due to comp lib */
|
|
inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) {
|
|
aclArgDataType dataType;
|
|
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
dataType = argInfo->arg.pointer.data;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
dataType = argInfo->arg.value.data;
|
|
} else {
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
switch (dataType) {
|
|
case DATATYPE_i1:
|
|
return HSAIL_DATATYPE_B1;
|
|
case DATATYPE_i8:
|
|
return HSAIL_DATATYPE_S8;
|
|
case DATATYPE_i16:
|
|
return HSAIL_DATATYPE_S16;
|
|
case DATATYPE_i32:
|
|
return HSAIL_DATATYPE_S32;
|
|
case DATATYPE_i64:
|
|
return HSAIL_DATATYPE_S64;
|
|
case DATATYPE_u8:
|
|
return HSAIL_DATATYPE_U8;
|
|
case DATATYPE_u16:
|
|
return HSAIL_DATATYPE_U16;
|
|
case DATATYPE_u32:
|
|
return HSAIL_DATATYPE_U32;
|
|
case DATATYPE_u64:
|
|
return HSAIL_DATATYPE_U64;
|
|
case DATATYPE_f16:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f32:
|
|
return HSAIL_DATATYPE_F32;
|
|
case DATATYPE_f64:
|
|
return HSAIL_DATATYPE_F64;
|
|
case DATATYPE_struct:
|
|
return HSAIL_DATATYPE_STRUCT;
|
|
case DATATYPE_opaque:
|
|
return HSAIL_DATATYPE_OPAQUE;
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return HSAIL_DATATYPE_ERROR;
|
|
}
|
|
}
|
|
|
|
inline static int GetHSAILArgSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_VALUE:
|
|
switch (GetHSAILDataType(argInfo)) {
|
|
case HSAIL_DATATYPE_B1:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B8:
|
|
case HSAIL_DATATYPE_S8:
|
|
case HSAIL_DATATYPE_U8:
|
|
return 1;
|
|
case HSAIL_DATATYPE_B16:
|
|
case HSAIL_DATATYPE_U16:
|
|
case HSAIL_DATATYPE_S16:
|
|
case HSAIL_DATATYPE_F16:
|
|
return 2;
|
|
case HSAIL_DATATYPE_B32:
|
|
case HSAIL_DATATYPE_U32:
|
|
case HSAIL_DATATYPE_S32:
|
|
case HSAIL_DATATYPE_F32:
|
|
return 4;
|
|
case HSAIL_DATATYPE_B64:
|
|
case HSAIL_DATATYPE_U64:
|
|
case HSAIL_DATATYPE_S64:
|
|
case HSAIL_DATATYPE_F64:
|
|
return 8;
|
|
case HSAIL_DATATYPE_STRUCT:
|
|
return argInfo->arg.value.numElements;
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_POINTER:
|
|
case ARG_TYPE_IMAGE:
|
|
case ARG_TYPE_SAMPLER:
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(void*);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
inline static clk_value_type_t GetOclType(const aclArgData* argInfo) {
|
|
static const clk_value_type_t ClkValueMapType[6][6] = {
|
|
{T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16},
|
|
{T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16},
|
|
{T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16},
|
|
{T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16},
|
|
{T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16},
|
|
{T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16},
|
|
};
|
|
|
|
uint sizeType;
|
|
if (argInfo->type == ARG_TYPE_QUEUE) {
|
|
return T_QUEUE;
|
|
}
|
|
if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) {
|
|
return T_POINTER;
|
|
} else if (argInfo->type == ARG_TYPE_VALUE) {
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
sizeType = 0;
|
|
break;
|
|
case DATATYPE_i16:
|
|
case DATATYPE_u16:
|
|
sizeType = 1;
|
|
break;
|
|
case DATATYPE_i32:
|
|
case DATATYPE_u32:
|
|
sizeType = 2;
|
|
break;
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
sizeType = 3;
|
|
break;
|
|
case DATATYPE_f16:
|
|
case DATATYPE_f32:
|
|
sizeType = 4;
|
|
break;
|
|
case DATATYPE_f64:
|
|
sizeType = 5;
|
|
break;
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
switch (argInfo->arg.value.numElements) {
|
|
case 1:
|
|
return ClkValueMapType[sizeType][0];
|
|
case 2:
|
|
return ClkValueMapType[sizeType][1];
|
|
case 3:
|
|
return ClkValueMapType[sizeType][2];
|
|
case 4:
|
|
return ClkValueMapType[sizeType][3];
|
|
case 8:
|
|
return ClkValueMapType[sizeType][4];
|
|
case 16:
|
|
return ClkValueMapType[sizeType][5];
|
|
default:
|
|
return T_VOID;
|
|
}
|
|
} else if (argInfo->type == ARG_TYPE_SAMPLER) {
|
|
return T_SAMPLER;
|
|
} else {
|
|
return T_VOID;
|
|
}
|
|
}
|
|
|
|
inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_UAV:
|
|
case PTR_MT_GLOBAL:
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
return CL_KERNEL_ARG_ADDRESS_CONSTANT;
|
|
case PTR_MT_LDS_EMU:
|
|
case PTR_MT_LDS:
|
|
return CL_KERNEL_ARG_ADDRESS_LOCAL;
|
|
default:
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
} else if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
return CL_KERNEL_ARG_ADDRESS_GLOBAL;
|
|
}
|
|
// default for all other cases
|
|
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) {
|
|
if (argInfo->type == ARG_TYPE_IMAGE) {
|
|
switch (argInfo->arg.image.type) {
|
|
case ACCESS_TYPE_RO:
|
|
return CL_KERNEL_ARG_ACCESS_READ_ONLY;
|
|
case ACCESS_TYPE_WO:
|
|
return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
|
|
case ACCESS_TYPE_RW:
|
|
return CL_KERNEL_ARG_ACCESS_READ_WRITE;
|
|
default:
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
}
|
|
return CL_KERNEL_ARG_ACCESS_NONE;
|
|
}
|
|
|
|
inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) {
|
|
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
|
|
if (argInfo->type == ARG_TYPE_POINTER) {
|
|
if (argInfo->arg.pointer.isVolatile) {
|
|
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
|
|
}
|
|
if (argInfo->arg.pointer.isRestrict) {
|
|
rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
|
|
}
|
|
if (argInfo->arg.pointer.isPipe) {
|
|
rv |= CL_KERNEL_ARG_TYPE_PIPE;
|
|
}
|
|
if (argInfo->isConst) {
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
}
|
|
switch (argInfo->arg.pointer.memory) {
|
|
case PTR_MT_CONSTANT:
|
|
case PTR_MT_UAV_CONSTANT:
|
|
case PTR_MT_CONSTANT_EMU:
|
|
rv |= CL_KERNEL_ARG_TYPE_CONST;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
return rv;
|
|
}
|
|
|
|
static int GetOclSize(const aclArgData* argInfo) {
|
|
switch (argInfo->type) {
|
|
case ARG_TYPE_POINTER:
|
|
return sizeof(void*);
|
|
case ARG_TYPE_VALUE:
|
|
//! \note OCL 6.1.5. For 3-component vector data types,
|
|
//! the size of the data type is 4 * sizeof(component).
|
|
switch (argInfo->arg.value.data) {
|
|
case DATATYPE_struct:
|
|
return 1 * argInfo->arg.value.numElements;
|
|
case DATATYPE_i8:
|
|
case DATATYPE_u8:
|
|
return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u16:
|
|
case DATATYPE_i16:
|
|
case DATATYPE_f16:
|
|
return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_u32:
|
|
case DATATYPE_i32:
|
|
case DATATYPE_f32:
|
|
return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_i64:
|
|
case DATATYPE_u64:
|
|
case DATATYPE_f64:
|
|
return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements);
|
|
case DATATYPE_ERROR:
|
|
default:
|
|
return -1;
|
|
}
|
|
case ARG_TYPE_IMAGE:
|
|
return sizeof(cl_mem);
|
|
case ARG_TYPE_SAMPLER:
|
|
return sizeof(cl_sampler);
|
|
case ARG_TYPE_QUEUE:
|
|
return sizeof(cl_command_queue);
|
|
default:
|
|
return -1;
|
|
}
|
|
}
|
|
|
|
void HSAILKernel::initArgList(const aclArgData* aclArg) {
|
|
// Initialize the hsail argument list too
|
|
initHsailArgs(aclArg);
|
|
|
|
// Iterate through the arguments and insert into parameterList
|
|
device::Kernel::parameters_t params;
|
|
amd::KernelParameterDescriptor desc;
|
|
size_t offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += MaxExtraArgumentsNum;
|
|
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
|
|
desc.name_ = arguments_[i]->name_.c_str();
|
|
desc.type_ = GetOclType(aclArg);
|
|
desc.addressQualifier_ = GetOclAddrQual(aclArg);
|
|
desc.accessQualifier_ = GetOclAccessQual(aclArg);
|
|
desc.typeQualifier_ = GetOclTypeQual(aclArg);
|
|
desc.typeName_ = arguments_[i]->typeName_.c_str();
|
|
|
|
// Make a check if it is local or global
|
|
if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
|
|
desc.size_ = sizeof(cl_mem);
|
|
} else {
|
|
desc.size_ = GetOclSize(aclArg);
|
|
}
|
|
|
|
// Make offset alignment to match CPU metadata, since
|
|
// in multidevice config abstraction layer has a single signature
|
|
// and CPU sends the paramaters as they are allocated in memory
|
|
size_t size = desc.size_;
|
|
|
|
offset = amd::alignUp(offset, std::min(size, size_t(16)));
|
|
desc.offset_ = offset;
|
|
offset += amd::alignUp(size, sizeof(uint32_t));
|
|
params.push_back(desc);
|
|
|
|
if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) {
|
|
flags_.imageEna_ = true;
|
|
if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) {
|
|
flags_.imageWriteEna_ = true;
|
|
}
|
|
}
|
|
}
|
|
|
|
createSignature(params, params.size(), amd::KernelSignature::ABIVersion_0);
|
|
}
|
|
|
|
void HSAILKernel::initHsailArgs(const aclArgData* aclArg) {
|
|
int offset = 0;
|
|
|
|
// Reserved arguments for HSAIL launch
|
|
aclArg += MaxExtraArgumentsNum;
|
|
|
|
// Iterate through the each kernel argument
|
|
for (; aclArg->struct_size != 0; aclArg++) {
|
|
Argument* arg = new Argument;
|
|
// Initialize HSAIL kernel argument
|
|
arg->name_ = aclArg->argStr;
|
|
arg->typeName_ = aclArg->typeStr;
|
|
arg->size_ = GetHSAILArgSize(aclArg);
|
|
arg->offset_ = offset;
|
|
arg->type_ = GetHSAILArgType(aclArg);
|
|
arg->addrQual_ = GetHSAILAddrQual(aclArg);
|
|
arg->dataType_ = GetHSAILDataType(aclArg);
|
|
// If vector of args we add additional arguments to flatten it out
|
|
arg->numElem_ =
|
|
((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct))
|
|
? aclArg->arg.value.numElements
|
|
: 1;
|
|
arg->alignment_ = GetHSAILArgAlignment(aclArg);
|
|
arg->access_ = GetHSAILArgAccessType(aclArg);
|
|
offset += GetHSAILArgSize(aclArg);
|
|
arguments_.push_back(arg);
|
|
}
|
|
}
|
|
|
|
HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions,
|
|
uint extraArgsNum)
|
|
: device::Kernel(prog->dev(), name, *prog),
|
|
compileOptions_(compileOptions),
|
|
index_(0),
|
|
code_(NULL),
|
|
codeSize_(0),
|
|
hwMetaData_(NULL),
|
|
extraArgumentsNum_(extraArgsNum) {
|
|
flags_.hsa_ = true;
|
|
}
|
|
|
|
HSAILKernel::~HSAILKernel() {
|
|
while (!arguments_.empty()) {
|
|
Argument* arg = arguments_.back();
|
|
delete arg;
|
|
arguments_.pop_back();
|
|
}
|
|
|
|
delete[] hwMetaData_;
|
|
|
|
delete code_;
|
|
}
|
|
|
|
bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) {
|
|
if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
|
|
LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
|
|
return false;
|
|
}
|
|
acl_error error = ACL_SUCCESS;
|
|
std::string openClKernelName = openclMangledName(name());
|
|
flags_.internalKernel_ =
|
|
(compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false;
|
|
// compile kernel down to ISA
|
|
if (finalize) {
|
|
std::string options(compileOptions_.c_str());
|
|
options.append(" -just-kernel=");
|
|
options.append(openClKernelName.c_str());
|
|
// Append an option so that we can selectively enable a SCOption on CZ
|
|
// whenever IOMMUv2 is enabled.
|
|
if (dev().settings().svmFineGrainSystem_) {
|
|
options.append(" -sc-xnack-iommu");
|
|
}
|
|
error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG,
|
|
ACL_TYPE_ISA, NULL);
|
|
buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
|
|
if (error != ACL_SUCCESS) {
|
|
LogError("Failed to finalize kernel");
|
|
return false;
|
|
}
|
|
}
|
|
|
|
aqlCreateHWInfo(sym);
|
|
|
|
// Pull out metadata from the ELF
|
|
size_t sizeOfArgList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), NULL, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
char* aclArgList = new char[sizeOfArgList];
|
|
if (NULL == aclArgList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY,
|
|
openClKernelName.c_str(), aclArgList, &sizeOfArgList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfWorkGroupSize;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Copy wavefront size
|
|
workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : dev().getAttribs().wavefrontSize;
|
|
|
|
// Find total workgroup size
|
|
if (workGroupInfo_.compileSize_[0] != 0) {
|
|
workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] *
|
|
workGroupInfo_.compileSize_[2];
|
|
} else {
|
|
workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_;
|
|
}
|
|
|
|
// Pull out printf metadata from the ELF
|
|
size_t sizeOfPrintfList;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), NULL, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Make sure kernel has any printf info
|
|
if (0 != sizeOfPrintfList) {
|
|
char* aclPrintfList = new char[sizeOfPrintfList];
|
|
if (NULL == aclPrintfList) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY,
|
|
openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
// Set the PrintfList
|
|
InitPrintf(reinterpret_cast<aclPrintfFmt*>(aclPrintfList));
|
|
delete[] aclPrintfList;
|
|
}
|
|
|
|
aclMetadata md;
|
|
md.enqueue_kernel = false;
|
|
size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE,
|
|
openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
flags_.dynamicParallelism_ = md.enqueue_kernel;
|
|
|
|
md.kernel_index = -1;
|
|
size_t sizeOfIndex = sizeof(md.kernel_index);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX,
|
|
openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
index_ = md.kernel_index;
|
|
|
|
size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT,
|
|
openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_,
|
|
&sizeOfWavesPerSimdHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
waveLimiter_.enable(dev().settings().ciPlus_);
|
|
|
|
size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_);
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT,
|
|
openClKernelName.c_str(), workGroupInfo_.compileSizeHint_,
|
|
&sizeOfWorkGroupSizeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
size_t sizeOfVecTypeHint;
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), NULL, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
|
|
if (0 != sizeOfVecTypeHint) {
|
|
char* VecTypeHint = new char[sizeOfVecTypeHint + 1];
|
|
if (NULL == VecTypeHint) {
|
|
return false;
|
|
}
|
|
error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT,
|
|
openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint);
|
|
if (error != ACL_SUCCESS) {
|
|
return false;
|
|
}
|
|
VecTypeHint[sizeOfVecTypeHint] = '\0';
|
|
workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint);
|
|
delete[] VecTypeHint;
|
|
}
|
|
|
|
// Set the argList
|
|
initArgList(reinterpret_cast<const aclArgData*>(aclArgList));
|
|
delete[] aclArgList;
|
|
|
|
return true;
|
|
}
|
|
|
|
const Device& HSAILKernel::dev() const { return reinterpret_cast<const Device&>(dev_); }
|
|
|
|
const HSAILProgram& HSAILKernel::prog() const {
|
|
return reinterpret_cast<const HSAILProgram&>(prog_);
|
|
}
|
|
|
|
inline static void WriteAqlArg(
|
|
unsigned char** dst, //!< The write pointer to the buffer
|
|
const void* src, //!< The source pointer
|
|
uint size, //!< The size in bytes to copy
|
|
uint alignment = 0 //!< The alignment to follow while writing to the buffer
|
|
) {
|
|
if (alignment == 0) {
|
|
*dst = amd::alignUp(*dst, size);
|
|
} else {
|
|
*dst = amd::alignUp(*dst, alignment);
|
|
}
|
|
memcpy(*dst, src, size);
|
|
*dst += size;
|
|
}
|
|
|
|
const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
|
|
(1 << HSA_PACKET_HEADER_BARRIER) |
|
|
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
|
|
(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
|
|
|
|
hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(
|
|
VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes,
|
|
const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap,
|
|
std::vector<const Memory*>& memList) const {
|
|
static const bool WaitOnBusyEngine = true;
|
|
uint64_t ldsAddress = ldsSize();
|
|
address aqlArgBuf = gpu.cb(0)->sysMemCopy();
|
|
address aqlStruct = gpu.cb(1)->sysMemCopy();
|
|
bool srdResource = false;
|
|
|
|
if (extraArgumentsNum_ > 0) {
|
|
assert(MaxExtraArgumentsNum >= 6 &&
|
|
"MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
|
|
size_t extraArgs[MaxExtraArgumentsNum] = {0, 0, 0, 0, 0, 0};
|
|
// The HLC generates up to 3 additional arguments for the global offsets
|
|
for (uint i = 0; i < sizes.dimensions(); ++i) {
|
|
extraArgs[i] = sizes.offset()[i];
|
|
}
|
|
// Check if the kernel may have printf output
|
|
if ((printfInfo().size() > 0) &&
|
|
// and printf buffer was allocated
|
|
(gpu.printfDbgHSA().dbgBuffer() != NULL)) {
|
|
// and set the fourth argument as the printf_buffer pointer
|
|
extraArgs[3] = static_cast<size_t>(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
|
|
memList.push_back(gpu.printfDbgHSA().dbgBuffer());
|
|
}
|
|
if (dynamicParallelism()) {
|
|
// Provide the host parent AQL wrap object to the kernel
|
|
AmdAqlWrap* wrap = reinterpret_cast<AmdAqlWrap*>(aqlStruct);
|
|
memset(wrap, 0, sizeof(AmdAqlWrap));
|
|
wrap->state = AQL_WRAP_BUSY;
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(sizeof(AmdAqlWrap));
|
|
*vmParentWrap = cb->vmAddress() + cb->wrtOffset();
|
|
// and set 5th & 6th arguments
|
|
extraArgs[4] = vmDefQueue;
|
|
extraArgs[5] = *vmParentWrap;
|
|
memList.push_back(cb);
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t) * extraArgumentsNum_, sizeof(size_t));
|
|
}
|
|
|
|
const amd::KernelSignature& signature = kernel.signature();
|
|
const amd::KernelParameters& kernelParams = kernel.parameters();
|
|
|
|
amd::Memory* const* memories =
|
|
reinterpret_cast<amd::Memory* const*>(parameters + kernelParams.memoryObjOffset());
|
|
|
|
// Find all parameters for the current kernel
|
|
for (uint i = 0; i != signature.numParameters(); ++i) {
|
|
const HSAILKernel::Argument* arg = argument(i);
|
|
const amd::KernelParameterDescriptor& desc = signature.at(i);
|
|
const_address paramaddr = parameters + desc.offset_;
|
|
|
|
switch (arg->type_) {
|
|
case HSAIL_ARGTYPE_POINTER:
|
|
// If it is a global pointer
|
|
if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) {
|
|
Memory* gpuMem = NULL;
|
|
amd::Memory* mem = NULL;
|
|
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
gpuMem = reinterpret_cast<Memory* const*>(memories)[index];
|
|
if (nullptr != gpuMem) {
|
|
mem = gpuMem->owner();
|
|
}
|
|
} else {
|
|
mem = memories[index];
|
|
if (mem != nullptr) {
|
|
gpuMem = dev().getGpuMemory(mem);
|
|
}
|
|
}
|
|
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr), sizeof(paramaddr));
|
|
if (gpuMem == nullptr) {
|
|
break;
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
gpuMem->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
memList.push_back(gpuMem);
|
|
|
|
// save the memory object pointer to allow global memory access
|
|
if (NULL != dev().hwDebugMgr()) {
|
|
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
|
|
}
|
|
}
|
|
// If it is a local pointer
|
|
else {
|
|
assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type");
|
|
ldsAddress = amd::alignUp(ldsAddress, arg->alignment_);
|
|
WriteAqlArg(&aqlArgBuf, &ldsAddress, desc.size_);
|
|
if (desc.size_ == 8) {
|
|
ldsAddress += *reinterpret_cast<const uint64_t*>(paramaddr);
|
|
} else {
|
|
ldsAddress += *reinterpret_cast<const uint32_t*>(paramaddr);
|
|
}
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_VALUE:
|
|
// Special case for structrues
|
|
if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, paramaddr, arg->size_);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(arg->size_);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*));
|
|
memList.push_back(cb);
|
|
} else {
|
|
WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_);
|
|
}
|
|
break;
|
|
case HSAIL_ARGTYPE_IMAGE: {
|
|
Image* image = nullptr;
|
|
amd::Memory* mem = nullptr;
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
if (nativeMem) {
|
|
image = reinterpret_cast<Image* const*>(memories)[index];
|
|
if (nullptr != image) {
|
|
mem = image->owner();
|
|
}
|
|
} else {
|
|
mem = memories[index];
|
|
if (mem == NULL) {
|
|
LogError("The kernel image argument isn't an image object!");
|
|
return nullptr;
|
|
}
|
|
image = static_cast<Image*>(dev().getGpuMemory(mem));
|
|
}
|
|
|
|
// Wait for resource if it was used on an inactive engine
|
|
//! \note syncCache may call DRM transfer
|
|
image->wait(gpu, WaitOnBusyEngine);
|
|
|
|
//! \note Special case for the image views.
|
|
//! Copy SRD to CB1, so blit manager will be able to release
|
|
//! this view without a wait for SRD resource.
|
|
if (image->memoryType() == Resource::ImageView) {
|
|
// Copy the current structre into CB1
|
|
memcpy(aqlStruct, image->hwState(), HsaImageObjectSize);
|
|
ConstBuffer* cb = gpu.constBufs_[1];
|
|
cb->uploadDataToHw(HsaImageObjectSize);
|
|
// Then use a pointer in aqlArgBuffer to CB1
|
|
uint64_t srd = cb->vmAddress() + cb->wrtOffset();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
memList.push_back(cb);
|
|
} else {
|
|
uint64_t srd = image->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
}
|
|
|
|
//! @todo Compiler has to return read/write attributes
|
|
if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) {
|
|
mem->signalWrite(&dev());
|
|
}
|
|
|
|
memList.push_back(image);
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_SAMPLER: {
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
const amd::Sampler* sampler = reinterpret_cast<amd::Sampler* const*>(parameters +
|
|
kernelParams.samplerObjOffset())[index];
|
|
const Sampler* gpuSampler = static_cast<Sampler*>(sampler->getDeviceSampler(dev()));
|
|
uint64_t srd = gpuSampler->hwSrd();
|
|
WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd));
|
|
srdResource = true;
|
|
break;
|
|
}
|
|
case HSAIL_ARGTYPE_QUEUE: {
|
|
uint32_t index = signature.at(i).info_.arrayIndex_;
|
|
const amd::DeviceQueue* queue = reinterpret_cast<amd::DeviceQueue* const*>(
|
|
parameters + kernelParams.queueObjOffset())[index];
|
|
VirtualGPU* gpuQueue = static_cast<VirtualGPU*>(queue->vDev());
|
|
uint64_t vmQueue;
|
|
if (dev().settings().useDeviceQueue_) {
|
|
vmQueue = gpuQueue->vQueue()->vmAddress();
|
|
} else {
|
|
if (!gpu.createVirtualQueue(queue->size())) {
|
|
LogError("Virtual queue creation failed!");
|
|
return nullptr;
|
|
}
|
|
vmQueue = gpu.vQueue()->vmAddress();
|
|
}
|
|
WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*));
|
|
break;
|
|
}
|
|
default:
|
|
LogError(" Unsupported address type ");
|
|
return NULL;
|
|
}
|
|
}
|
|
|
|
if (ldsAddress > dev().info().localMemSize_) {
|
|
LogError("No local memory available\n");
|
|
return NULL;
|
|
}
|
|
|
|
// HSAIL kernarg segment size is rounded up to multiple of 16.
|
|
aqlArgBuf = amd::alignUp(aqlArgBuf, 16);
|
|
assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) &&
|
|
"Size and the number of arguments don't match!");
|
|
hsa_kernel_dispatch_packet_t* hsaDisp =
|
|
reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlArgBuf);
|
|
|
|
amd::NDRange local(sizes.local());
|
|
const amd::NDRange& global = sizes.global();
|
|
|
|
// Check if runtime has to find local workgroup size
|
|
FindLocalWorkSize(sizes.dimensions(), sizes.global(), local);
|
|
|
|
hsaDisp->header = kDispatchPacketHeader;
|
|
hsaDisp->setup = sizes.dimensions();
|
|
|
|
hsaDisp->workgroup_size_x = local[0];
|
|
hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1;
|
|
hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1;
|
|
|
|
hsaDisp->grid_size_x = global[0];
|
|
hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1;
|
|
hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1;
|
|
hsaDisp->reserved2 = 0;
|
|
|
|
// Initialize kernel ISA and execution buffer requirements
|
|
hsaDisp->private_segment_size = spillSegSize();
|
|
hsaDisp->group_segment_size = ldsAddress;
|
|
hsaDisp->kernel_object = gpuAqlCode()->vmAddress();
|
|
|
|
ConstBuffer* cb = gpu.constBufs_[0];
|
|
cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t));
|
|
uint64_t argList = cb->vmAddress() + cb->wrtOffset();
|
|
|
|
hsaDisp->kernarg_address = reinterpret_cast<void*>(argList);
|
|
hsaDisp->reserved2 = 0;
|
|
hsaDisp->completion_signal.handle = 0;
|
|
|
|
memList.push_back(cb);
|
|
memList.push_back(gpuAqlCode());
|
|
for (gpu::Memory* mem : prog().globalStores()) {
|
|
memList.push_back(mem);
|
|
}
|
|
if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties,
|
|
AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) {
|
|
memList.push_back(gpu.hsaQueueMem());
|
|
}
|
|
|
|
if (srdResource || prog().isStaticSampler()) {
|
|
dev().srds().fillResourceList(memList);
|
|
}
|
|
|
|
return hsaDisp;
|
|
}
|
|
|
|
} // namespace gpu
|