// // Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved. // #include "device/gpu/gpukernel.hpp" #include "device/gpu/gpuprogram.hpp" #include "device/gpu/gpublit.hpp" #include "device/gpu/gpuconstbuf.hpp" #include "device/gpu/gpusched.hpp" #include "platform/commandqueue.hpp" #include "shader/ComputeProgramObject.h" #include "utils/options.hpp" #include "acl.h" #include "SCShadersR678XXCommon.h" #include #include #include #include #include #include #include namespace gpu { const MetaDataConst ArgState[ArgStateTotal] = { // Note: the order is important // Name Type Properties // Kernel description (special properties) { "memory:compilerwrite", KernelArg::PrivateFixed, { 0, 0, 0, 0, 0, 0, 0 } }, { "uniqueid:", KernelArg::None, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:private:", KernelArg::PrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:local:", KernelArg::LocalSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:hwprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:uavprivate:", KernelArg::HwPrivateSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:hwlocal:", KernelArg::HwLocalSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "memory:64bitABI", KernelArg::ABI64Bit, { 0, 0, 0, 0, 0, 0, 0 } }, { "limitgroupsize", KernelArg::Wavefront, { 0, 0, 0, 0, 0, 0, 0 } }, { "function:", KernelArg::None, { 1, 1, 0, 0, 0, 0, 0 } }, { "intrinsic:", KernelArg::None, { 1, 0, 0, 0, 0, 0, 0 } }, { "error:", KernelArg::ErrorMessage, { 0, 0, 0, 0, 0, 0, 0 } }, { "warning:", KernelArg::WarningMessage, { 0, 0, 0, 0, 0, 0, 0 } }, { "printf_fmt:", KernelArg::PrintfFormatStr, { 0, 0, 0, 0, 0, 0, 0 } }, { "version:", KernelArg::MetadataVersion, { 0, 0, 0, 0, 0, 0, 0 } }, // Kernel basic types { "pointer:", KernelArg::PointerGlobal, { 1, 1, 1, 1, 1, 1, 0 } }, { "value:", KernelArg::Value, { 1, 1, 1, 1, 1, 0, 0 } }, { "image:", KernelArg::Image, { 1, 1, 1, 1, 1, 0, 0 } }, { "sampler:", KernelArg::Sampler, { 0, 1, 0, 0, 0, 0, 0 } }, { "counter:", KernelArg::Counter, { 1, 1, 0, 1, 1, 0, 0 } }, { "cws:", KernelArg::Grouping, { 0, 0, 0, 0, 0, 0, 0 } }, { "lws:", KernelArg::WrkgrpSize, { 0, 0, 0, 0, 0, 0, 0 } }, { "uavid:", KernelArg::UavId, { 0, 0, 0, 0, 0, 0, 0 } }, { "reflection:", KernelArg::Reflection, { 0, 0, 0, 0, 0, 0, 0 } }, { "constarg:", KernelArg::ConstArg, { 0, 0, 0, 0, 0, 0, 0 } }, { "cbid:", KernelArg::ConstBufId, { 0, 0, 0, 0, 0, 0, 0 } }, { "printfid:", KernelArg::PrintfBufId, { 0, 0, 0, 0, 0, 0, 0 } }, { "wsh:", KernelArg::GroupingHint, { 0, 0, 0, 0, 0, 0, 0 } }, { "vth:", KernelArg::VecTypeHint, { 0, 0, 0, 0, 0, 0, 0 } }, { "WavesPerSimdHint:", KernelArg::WavesPerSimdHint,{ 0, 0, 0, 0, 0, 0, 0 } }, }; const DataTypeConst DataType[] = { { "i8:", KernelArg::Char, }, { "i16:", KernelArg::Short, }, { "i32:", KernelArg::Int, }, { "i64:", KernelArg::Long, }, { "u8:", KernelArg::UChar, }, { "u16:", KernelArg::UShort, }, { "u32:", KernelArg::UInt, }, { "u64:", KernelArg::ULong, }, { "float:", KernelArg::Float, }, { "double:", KernelArg::Double, }, { "struct:", KernelArg::Struct, }, { "union:", KernelArg::Union, }, { "1D:", KernelArg::Image1D, }, { "2D:", KernelArg::Image2D, }, { "3D:", KernelArg::Image3D, }, { "1DB:", KernelArg::Image1DB, }, { "1DA:", KernelArg::Image1DA, }, { "2DA:", KernelArg::Image2DA, }, { "opaque:", KernelArg::Opaque, }, { "event:", KernelArg::Event, }, { "sampler:", KernelArg::Sampler, }, { "half:", KernelArg::Half, }, }; const uint DataTypeTotal = sizeof(DataType) / sizeof(DataTypeConst); struct BufDataConst { const char* tagName_; //!< buffer's name KernelArg::ArgumentType type_; //!< type of argument struct { uint number_ : 1; //!< buffer's number uint alignment_ : 1; //!< buffer's alignment uint attribute_ : 1; //!< buffer's read/write attribute uint reserved : 29; //!< reserved }; }; static const BufDataConst BufType[] = { { "g", KernelArg::PointerGlobal, { 1, 0, 0, 0 } }, { "p", KernelArg::PointerPrivate, { 1, 1, 1, 0 } }, { "l", KernelArg::PointerLocal, { 1, 1, 1, 0 } }, { "uav", KernelArg::PointerGlobal, { 1, 1, 1, 0 } }, { "c", KernelArg::PointerConst, { 1, 1, 1, 0 } }, { "hl", KernelArg::PointerHwLocal, { 1, 1, 1, 0 } }, { "hp", KernelArg::PointerHwPrivate,{ 1, 1, 1, 0 } }, { "hc", KernelArg::PointerHwConst, { 1, 1, 1, 0 } } }; static const uint BufTypeTotal = sizeof(BufType) / sizeof(BufDataConst); //! The mathlib constants for each kernel execution static const float MathLibConst[4] = { 0.0f, 0.5f, 1.0f, 2.0f }; bool expect(const std::string& str, size_t* pos, const std::string& sym) { bool result = true; uint i; if (*pos == std::string::npos) { return false; } // Check if we have expected symbols for (i = 0; i < sym.size(); ++i) { char deb = str[*pos + i]; if (deb != sym[i]) { result = false; break; } } if (result) *pos += i; return result; } bool getword(const std::string& str, size_t* pos, std::string& sym) { if (*pos == std::string::npos) { return false; } *pos = str.find_first_not_of(" \n\r", *pos); size_t posEnd = str.find_first_of(": \n\r;", *pos); size_t count = posEnd - *pos; if (count != 0) { sym = str.substr(*pos, count); } sym[count] = 0; *pos = posEnd + 1; return true; } bool getstring(const std::string& str, size_t* pos, std::string* out) { if (*pos == std::string::npos) { return false; } *pos = str.find_first_not_of(" \n\r", *pos); size_t posEnd = str.find_first_of(":\n\r;", *pos); size_t count = posEnd - *pos; char* sym = new char[count + 1]; if (count != 0) { if (!str.copy(sym, count, *pos)) { return false; } } sym[count] = 0; *out = sym; delete [] sym; *pos = posEnd + 1; return true; } bool getuint(const std::string& str, size_t* pos, uint* val) { if (*pos == std::string::npos) { return false; } char sym[16]; *pos = str.find_first_not_of(" \n\r", *pos); size_t posEnd = str.find_first_of(": \n\r;)", *pos); if (!str.copy(sym, posEnd - *pos, *pos)) { return false; } *val = 0; for (size_t i = 0; i < (posEnd - *pos); ++i) { *val = (*val * 10) + (sym[i] - 0x30); } *pos = posEnd + 1; return true; } bool getuintHex(const std::string& str, size_t* pos, uint* val) { if (*pos == std::string::npos) { return false; } char sym[16]; *pos = str.find_first_not_of(" \n\r", *pos); size_t posEnd = str.find_first_of(": \n\r;)", *pos); if (!str.copy(sym, posEnd - *pos, *pos)) { return false; } *val = 0; for (size_t i = 0; i < (posEnd - *pos); ++i) { if (sym[i] >= '0' && sym[i] <= 'F') { *val = (*val * 16) + (sym[i] - '0'); } else if (sym[i] >= 'a' && sym[i] <= 'f') { *val = (*val * 16) + (sym[i] - 'a' + 10); } else { return false; } } *pos = posEnd + 1; return true; } bool getuint64Hex(const std::string& str, size_t* pos, uint64_t* val) { if (*pos == std::string::npos) { return false; } char sym[16]; *pos = str.find_first_not_of(" \n\r", *pos); size_t posEnd = str.find_first_of(": \n\r;)", *pos); if (!str.copy(sym, posEnd - *pos, *pos)) { return false; } *val = 0; for (size_t i = 0; i < (posEnd - *pos); ++i) { if (sym[i] >= '0' && sym[i] <= 'F') { *val = (*val * 16) + (sym[i] - '0'); } else if (sym[i] >= 'a' && sym[i] <= 'f') { *val = (*val * 16) + (sym[i] - 'a' + 10); } else { return false; } } *pos = posEnd + 1; return true; } void intToStr(size_t value, char* str, size_t size) { static const uint MaxDigits32bit = 10; char result[MaxDigits32bit]; uint idx = MaxDigits32bit; do { idx--; result[idx] = static_cast((value % 10) + '0'); value /= 10; } while ((value != 0) && (idx > 0)); size_t len = MaxDigits32bit - idx; size_t n = std::min(len, size-1); memcpy(str, &result[idx], n); str[n] = '\0'; } //! Default destructor CalImageReference::~CalImageReference() { // Free CAL image free(image_); } KernelArg::KernelArg() : type_(KernelArg::None) , size_(0) , cbIdx_(0) , cbPos_(0) , index_(0) , alignment_(1) , dataType_(KernelArg::None) { name_ = ""; buf_ = ""; memory_.value_ = 0; typeQualifier_ = CL_KERNEL_ARG_TYPE_NONE; } KernelArg::KernelArg(const KernelArg& data) { // Fill the new object *this = data; } KernelArg& KernelArg::operator=(const KernelArg& data) { // Fill the fields of the current object name_ = data.name_; typeName_ = data.typeName_; typeQualifier_ = data.typeQualifier_; type_ = data.type_; size_ = data.size_; cbIdx_ = data.cbIdx_; cbPos_ = data.cbPos_; buf_ = data.buf_; index_ = data.index_; alignment_ = data.alignment_; dataType_ = data.dataType_; memory_.value_ = data.memory_.value_; return *this; } bool KernelArg::isCbNeeded() const { //! \note not a safe way bool result = ((type_ > None) && (type_ < Sampler)) ? true : false; if ((type_ == Sampler) && (location_ == 0)) { // Sampler is defined outside the kernel result = true; } return result; } size_t KernelArg::size(bool gpuLayer)const { switch (type_) { case None: return 0; case PointerConst: case PointerHwConst: case PointerGlobal: return (gpuLayer) ? sizeof(uint32_t) * size_ : sizeof(cl_mem); case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: return (gpuLayer) ? sizeof(ImageConstants) : sizeof(cl_mem); case Sampler: return (gpuLayer) ? 2 * sizeof(uint32_t) : sizeof(cl_sampler); case Counter: return (gpuLayer) ? 0 : sizeof(cl_mem); case PointerLocal: case PointerHwLocal: return (gpuLayer) ? sizeof(uint32_t) * size_ : 0; case PointerPrivate: case PointerHwPrivate: return (gpuLayer) ? sizeof(uint32_t) * size_ : 0; case Float: return sizeof(cl_float) * amd::nextPowerOfTwo(size_); case Double: return sizeof(cl_double) * amd::nextPowerOfTwo(size_); case Char: case UChar: return sizeof(cl_char) * amd::nextPowerOfTwo(size_); case Short: case UShort: return sizeof(cl_short) * amd::nextPowerOfTwo(size_); case Int: case UInt: return sizeof(cl_uint) * amd::nextPowerOfTwo(size_); case Long: case ULong: return sizeof(cl_ulong) * amd::nextPowerOfTwo(size_); case Struct: case Union: return (gpuLayer) ? amd::alignUp(size_, 16) : size_; default: return 0; } } cl_kernel_arg_address_qualifier KernelArg::addressQualifier() const { switch (type_) { case PointerGlobal: case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: return CL_KERNEL_ARG_ADDRESS_GLOBAL; case PointerLocal: case PointerHwLocal: return CL_KERNEL_ARG_ADDRESS_LOCAL; case PointerConst: case PointerHwConst: return CL_KERNEL_ARG_ADDRESS_CONSTANT; default: return CL_KERNEL_ARG_ADDRESS_PRIVATE; } } cl_kernel_arg_access_qualifier KernelArg::accessQualifier() const { switch (type_) { case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: if (memory_.readOnly_) { return CL_KERNEL_ARG_ACCESS_READ_ONLY; } else if (memory_.writeOnly_) { return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; } else if (memory_.readWrite_) { return CL_KERNEL_ARG_ACCESS_READ_WRITE; } // Fall through ... default: return CL_KERNEL_ARG_ACCESS_NONE; } } //! temporary solution for the vectors handling in compiler size_t KernelArg::specialVector() const { if (size_ > VectorSizeLimit) { switch (type_) { case Char: case UChar: return sizeof(cl_char); case Short: case UShort: return sizeof(cl_short); default: return 0; } } return 0; } clk_value_type_t KernelArg::type()const { switch (type_) { case PointerGlobal: case PointerLocal: case PointerHwLocal: case PointerConst: case PointerHwConst: case Image1D: case Image2D: case Image3D: case Image1DB: case Image1DA: case Image2DA: case Counter: return T_POINTER; case Float: return T_FLOAT; case Double: return T_DOUBLE; case Char: case UChar: return T_CHAR; case Short: case UShort: return T_SHORT; case Int: return T_INT; case UInt: //! \note No UINT type return T_INT; case Long: return T_LONG; case ULong: //! \note No ULONG type return T_LONG; case Struct: case Union: //! @todo What should we report? return T_CHAR; case Sampler: return T_SAMPLER; case PointerPrivate: case PointerHwPrivate: case None: default: return T_VOID; } } NullKernel::NullKernel( const std::string& name, const NullDevice& gpuNullDev, const NullProgram& nullprog) : device::Kernel(name) , buildError_(CL_BUILD_PROGRAM_FAILURE) , gpuDev_(gpuNullDev) , prog_(nullprog) , calRef_(NULL) , internal_(false) , flags_(0) , cbSizes_(NULL) , numCb_(0) , rwAttributes_(false) , instructionCnt_(4) { // UAV raw index will be detected uavRaw_ = UavIdUndefined; // CB index will be detected cbId_ = UavIdUndefined; // Printf index will be detected printfId_ = UavIdUndefined; } NullKernel::~NullKernel() { uint idx; if (calRef_ == NULL) { return; } calRef_->release(); // Destroy all kernel arguments for (idx = 0; idx < arguments_.size(); ++idx) { delete arguments_[idx]; } arguments_.clear(); // Destroy all sampler kernel arguments for (idx = 0; idx < intSamplers_.size(); ++idx) { delete intSamplers_[idx]; } intSamplers_.clear(); } static int scComponentToArrayIndex(E_SC_COMPONENT dstComp) { switch (dstComp) { case SC_COMPONENT_X: return 0; case SC_COMPONENT_Y: return 1; case SC_COMPONENT_Z: return 2; case SC_COMPONENT_W: return 3; } return 0; } static void addLoopConst(const SC_HWSHADER* shader, AMUabiAddEncoding& encoding) { uint count = shader->dep.NumIntrlIConstants; encoding.litConstsCount = shader->dep.NumIntrlIConstants; // only suppport loop consts (int consts) if (count) { AMUabiLiteralConst* allocatedconsts = encoding.litConsts; memset(allocatedconsts, 0, count * sizeof(AMUabiLiteralConst)); uint usedConsts = 0; for (uint i = 0; i < count; ++i) { uint currentConst; for (currentConst = 0; currentConst < usedConsts; ++currentConst) { if (allocatedconsts[currentConst].addr == HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber) { break; } } if (currentConst == usedConsts) { usedConsts++; assert(usedConsts <= count); } allocatedconsts[currentConst].addr = HWSHADER_Get(shader, dep.IntrlIConstants)[i].uDstNumber; allocatedconsts[currentConst].type = AMU_ABI_INT32; allocatedconsts[currentConst].value. int32[scComponentToArrayIndex(HWSHADER_Get(shader, dep.IntrlIConstants)[i].eDstComp)] = HWSHADER_Get(shader, dep.IntrlIConstants)[i].iValue; } encoding.litConstsCount = usedConsts; } } bool NullKernel::create( const std::string& code, const std::string& metadata, const void* binaryCode, size_t binarySize) { std::auto_ptr uavRefCount (new uint[MaxUavArguments]); if (NULL == uavRefCount.get()) { return false; } // Set all ref counts to 0 memset(uavRefCount.get(), 0, sizeof(uavRefCount.get()[0]) * MaxUavArguments); // parse the metadata fields if (!parseArguments(metadata, uavRefCount.get())) { return false; } CALimage calImage; // Save source if DEBUG build #if DEBUG ilSource_ = code; #endif // DEBUG amd::option::Options *options = nullProg().getCompilerOptions(); internal_ = options->oVariables->clInternalKernel; if ((binaryCode == NULL) && (binarySize == 0) && !code.empty()) { acl_error err; std::string arch = GPU_TARGET_INFO_ARCH; if (nullDev().settings().use64BitPtr_) { arch += "64"; } aclTargetInfo info = aclGetTargetInfo( arch.c_str(), nullDev().hwInfo()->targetName_, &err); if (err != ACL_SUCCESS) { LogWarning("aclGetTargetInfo failed"); return false; } aclBinaryOptions binOpts = {0}; binOpts.struct_size = sizeof(binOpts); binOpts.elfclass = info.arch_id == aclAMDIL64 ? ELFCLASS64 : ELFCLASS32; binOpts.bitness = ELFDATA2LSB; binOpts.alloc = &::malloc; binOpts.dealloc = &::free; aclBinary* bin = aclBinaryInit(sizeof(aclBinary), &info, &binOpts, &err); if (err != ACL_SUCCESS) { LogWarning("aclBinaryInit failed"); return false; } if (ACL_SUCCESS != aclInsertSection(nullDev().compiler(), bin, code.data(), code.size(), aclSOURCE)) { LogWarning("aclInsertSection failed"); aclBinaryFini(bin); return false; } amd::option::Options* Opts = (amd::option::Options*)bin->options; // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. if (nullDev().settings().svmFineGrainSystem_) { options->origOptionStr.append(" -sc-xnack-iommu"); } // temporary solution to synchronize buildNo between runtime and complib // until we move runtime inside complib Opts->setBuildNo(options->getBuildNo()); // pass kernel name to compiler Opts->setCurrKernelName(name().c_str()); err = aclCompile(nullDev().compiler(), bin, options->origOptionStr.c_str(), ACL_TYPE_AMDIL_TEXT, ACL_TYPE_ISA, NULL); buildLog_ += aclGetCompilerLog(nullDev().compiler()); if (err != ACL_SUCCESS) { LogWarning("aclCompile failed"); aclBinaryFini(bin); return false; } if (!options->oVariables->BinEXE) { // Early exit if binary doesn't contain EXE aclBinaryFini(bin); return true; } size_t len; const void* isa = aclExtractSection(nullDev().compiler(), bin, &len, aclTEXT, &err); if (err != ACL_SUCCESS) { LogWarning("aclExtractSection failed"); aclBinaryFini(bin); return false; } uint calImageSize; if (!createMultiBinary( &calImageSize, reinterpret_cast(&calImage), isa)) { LogWarning("initSrcEncoding failed"); aclBinaryFini(bin); return false; } aclBinaryFini(bin); } else if ((binaryCode != NULL) && (binarySize != 0)) { uint size = 0; if (!amuABIMultiBinaryGetSize(&size, const_cast(binaryCode)) || size > binarySize) { buildLog_ += "Invalid binary image"; LogError("amuABIMultiBinaryGetSize failed!"); return false; } calImage = static_cast(malloc(size)); ::memcpy(calImage, binaryCode, size); } else { LogError("Incorrect initialization parameters!"); return false; } calRef_ = new CalImageReference(calImage); if (calRef_ == NULL) { LogError("Memory allocation failure!"); // Free CAL image free(calImage); return false; } CALfuncInfo calFuncInfo; // Get kernel compiled information getFuncInfoFromImage(calImage, &calFuncInfo); if (calFuncInfo.maxScratchRegsNeeded > 0) { LogPrintfInfo("%s kernel has register spilling." "Lower performance is expected.", name().c_str()); } workGroupInfo_.scratchRegs_ = calFuncInfo.maxScratchRegsNeeded; workGroupInfo_.wavefrontPerSIMD_ = calFuncInfo.numWavefrontPerSIMD; workGroupInfo_.wavefrontSize_ = calFuncInfo.wavefrontSize; workGroupInfo_.availableGPRs_ = calFuncInfo.numGPRsAvailable; workGroupInfo_.usedGPRs_ = calFuncInfo.numGPRsUsed; workGroupInfo_.availableSGPRs_ = calFuncInfo.numSGPRsAvailable; workGroupInfo_.usedSGPRs_ = calFuncInfo.numSGPRsUsed; workGroupInfo_.availableVGPRs_ = calFuncInfo.numVGPRsAvailable; workGroupInfo_.usedVGPRs_ = calFuncInfo.numVGPRsUsed; workGroupInfo_.availableLDSSize_ = calFuncInfo.LDSSizeAvailable; workGroupInfo_.usedLDSSize_ = calFuncInfo.LDSSizeUsed; workGroupInfo_.availableStackSize_ = calFuncInfo.stackSizeAvailable; workGroupInfo_.usedStackSize_ = calFuncInfo.stackSizeUsed; device::Kernel::parameters_t params; if (!createSignature(params)) { return false; } return true; } size_t NullKernel::getCalBinarySize() const { CALuint imageSize; if (!amuABIMultiBinaryGetSize(&imageSize, calImage())) { LogError("Failed to get the image size!"); return 0; } return static_cast(imageSize); } bool NullKernel::getCalBinary(void* binary, size_t size) const { uint calImageSize = 0; if (!amuABIMultiBinaryGetSize(&calImageSize, calImage()) || size < calImageSize) { LogError("CAL failed to save the kernel binary!"); return false; } ::memcpy(binary, calImage(), calImageSize); return true; } bool Kernel::create( const std::string& code, const std::string& metadata, const void* binaryCode, size_t binarySize) { setPreferredSizeMultiple(dev().getAttribs().wavefrontSize); if (!NullKernel::create(code, metadata, binaryCode, binarySize)) { return false; } // initialize constant buffer sizes if (!initConstBuffers()) { return false; } // Initialize the kernel parameters bool result = initParameters(); // Wave limiter needs to be initialized after kernel metadata is parsed // Since it depends on it. waveLimiter_.enable(dev().settings().ciPlus_); if (result) { buildError_ = CL_SUCCESS; } else { result = false; } return result; } Kernel::Kernel( const std::string& name, const Device& gpuDev, const Program& prog, const InitData* initData) : NullKernel(name, gpuDev, prog) , waveLimiter_(this, dev().getAttribs().numberOfCUsperShaderArray * dev().hwInfo()->simdPerCU_) { hwPrivateSize_ = 0; if (NULL != initData) { flags_ = initData->flags_; hwPrivateSize_ = initData->hwPrivateSize_; hwLocalSize_ = initData->hwLocalSize_; } // Workgroup info private memory size workGroupInfo_.privateMemSize_ = hwPrivateSize_; // Default wavesPerSimdHint_ workGroupInfo_.wavesPerSimdHint_ = ~0U; hsa_ = false; } Kernel::~Kernel() { if (calRef_ == NULL) { return; } { Device::ScopedLockVgpus lock(dev()); // Release all virtual image objects on all virtual GPUs for (uint idx = 0; idx < dev().vgpus().size(); ++idx) { dev().vgpus()[idx]->releaseKernel(calImage()); } } if (0 != numCb_) { delete [] cbSizes_; } } const Device& Kernel::dev() const { return reinterpret_cast(gpuDev_); } const Program& Kernel::prog() const { return reinterpret_cast(prog_); } bool NullKernel::createMultiBinary(uint* imageSize, void** image, const void* isa) { const SC_HWSHADER* shader = reinterpret_cast(isa); bool result = false; AMUabiAddEncoding encoding; memset(&encoding, 0, sizeof(AMUabiAddEncoding)); size_t allocSize = sizeof(uint) * MaxReadImage + sizeof(CALUavEntry) * MaxUavArguments + sizeof(CALSamplerMapEntry) * MaxSamplers + sizeof(CALConstantBufferMask) * MaxConstBuffers + sizeof(AMUabiLiteralConst) * shader->dep.NumIntrlIConstants; char* tmpMem = new char[allocSize]; if (tmpMem == NULL) { LogError("Error allocating memory"); return false; } CalcPtr(encoding.inputs, tmpMem, 0, 0); CalcPtr(encoding.uav, encoding.inputs, sizeof(uint), MaxReadImage); CalcPtr(encoding.inputSamplerMaps, encoding.uav, sizeof(CALUavEntry), MaxUavArguments); CalcPtr(encoding.constBuffers, encoding.inputSamplerMaps, sizeof(CALSamplerMapEntry), MaxSamplers); if (shader->dep.NumIntrlIConstants != 0) { CalcPtr(encoding.litConsts, encoding.constBuffers, sizeof(CALConstantBufferMask), MaxConstBuffers); } AMUabiMultiBinary amuBinary; amuABIMultiBinaryCreate(&amuBinary); result = siCreateHwInfo(shader, encoding); if (!result) { delete [] tmpMem; LogWarning("Error Creating program info"); return false; } addLoopConst(shader, encoding); unsigned int outputCount=0, condOut=0, earlyExit=0, globalCount=0, persistentCount=0; unsigned int symbolCount=0; CALOutputEntry* outputs=0; unsigned int* globalBuffers=0; unsigned int* persistentBuffers=0; AMUabiUserSymbol* symbols=0; CALSamplerMapEntry* inputSamplers = encoding.inputSamplerMaps; CALConstantBufferMask* constBuffers = encoding.constBuffers; uint* inputResources = encoding.inputs; CALUavEntry* uav = encoding.uav; uint inputSamplerCount = samplerSize(); for (uint i = 0; i < inputSamplerCount; ++i) { inputSamplers[i].resource = 0; inputSamplers[i].sampler = sampler(i)->index_; } uint constBufferCount = 2; constBuffers[0].index = 0; constBuffers[1].index = 1; uint inputResourceCount = 0; uint uavCount = 0; bool cbBound = false; bool printfBound = false; for (uint i = 0; i < arguments_.size(); ++i) { const KernelArg* arg = argument(i); switch (arg->type_) { case KernelArg::PointerConst: case KernelArg::PointerHwConst: constBuffers[constBufferCount++].index = arg->index_; break; case KernelArg::PointerGlobal: uav[uavCount].offset = arg->index_; uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; break; case KernelArg::ConstBufId: if (!cbBound) { uav[uavCount].offset = cbId_; uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; } cbBound = true; break; case KernelArg::PrintfBufId: if (!printfBound) { uav[uavCount].offset = printfId_; uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; } printfBound = true; break; case KernelArg::UavId: if ((UavIdUndefined != uavRaw_) && !(flags() & PrintfOutput)) { uav[uavCount].offset = arg->index_; uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPELESS; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; } else { if (UavIdUndefined != uavRaw_) { uav[uavCount].offset = uavRaw_; uav[uavCount].type = AMU_ABI_UAV_TYPE_RAW; uav[uavCount].dimension = AMU_ABI_DIM_BUFFER; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; } } break; case KernelArg::Sampler: inputSamplers[inputSamplerCount].resource = 0; inputSamplers[inputSamplerCount].sampler = arg->index_; inputSamplerCount++; break; case KernelArg::Image1D: case KernelArg::Image2D: case KernelArg::Image3D: case KernelArg::Image1DB: case KernelArg::Image1DA: case KernelArg::Image2DA: if (arg->memory_.readOnly_) { inputResources[inputResourceCount++] = arg->index_; } else { uav[uavCount].offset = arg->index_; uav[uavCount].type = AMU_ABI_UAV_TYPE_TYPED; uav[uavCount].dimension = AMU_ABI_DIM_2D; uav[uavCount].format = AMU_ABI_UAV_FORMAT_TYPELESS; uavCount++; } break; default: break; } } for (uint i = 0; i < nullProg().glbCb().size(); ++i) { constBuffers[constBufferCount++].index = nullProg().glbCb()[i]; } encoding.machine = nullDev().hwInfo()->machine_; encoding.type = ED_ATI_CAL_TYPE_COMPUTE; encoding.inputCount = inputResourceCount; encoding.outputCount = outputCount; encoding.outputs = outputs; encoding.condOut = condOut; encoding.earlyExit = earlyExit; encoding.globalBuffersCount = globalCount; encoding.globalBuffers = globalBuffers; encoding.persistentBuffersCount = persistentCount; encoding.persistentBuffers = persistentBuffers; encoding.constBuffersCount = constBufferCount; encoding.inputSamplerMapCount = inputSamplerCount; encoding.symbolsCount = symbolCount; encoding.symbols = symbols; encoding.uavCount = uavCount; amuABIMultiBinaryAddEncoding(amuBinary, &encoding); uint success = amuABIMultiBinaryPack(imageSize, image, amuBinary); amuABIMultiBinaryDestroy(amuBinary); delete [] tmpMem; delete [] encoding.progInfos; return (success == 0) ? false : true; } void Kernel::findLocalWorkSize( size_t workDim, const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize) const { // Initialize the default workgoup info // Check if the kernel has the compiled sizes if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { size_t thrPerGrp; bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || ((workDim == 3) && b3DOverrideSet); if (!overrideSet) { // Find threads per group thrPerGrp = workGroupInfo()->size_; // Check if kernel uses images if ((flags() & ImageEnable) && // and thread group is a multiple value of wavefronts ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && // and it's 2 or 3-dimensional workload (workDim > 1) && ((dev().settings().partialDispatch_) || (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { // Use 8x8 workgroup size if kernel has image writes if ((flags() & ImageWrite) || (thrPerGrp != nullDev().info().maxWorkGroupSize_)) { lclWorkSize[0] = 8; lclWorkSize[1] = 8; } else { lclWorkSize[0] = 16; lclWorkSize[1] = 16; } if (workDim == 3) { lclWorkSize[2] = 1; } } else { size_t tmp = thrPerGrp; // Split the local workgroup into the most efficient way for (uint d = 0; d < workDim; ++d) { size_t div = tmp; for (; (gblWorkSize[d] % div) != 0; div--); lclWorkSize[d] = div; tmp /= div; } // Assuming DWORD access const uint cacheLineMatch = dev().settings().cacheLineSize_ >> 2; // Check if partial dispatch is enabled and if (dev().settings().partialDispatch_ && // we couldn't find optimal workload (((lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) || // or size is too small for the cache line (lclWorkSize[0] < cacheLineMatch))) { size_t maxSize = 0; size_t maxDim = 0; for (uint d = 0; d < workDim; ++d) { if (maxSize < gblWorkSize[d]) { maxSize = gblWorkSize[d]; maxDim = d; } } // Use X dimension as high priority. Runtime will assume that // X dimension is more important for the address calculation if ((maxDim != 0) && (gblWorkSize[0] >= (cacheLineMatch / 2))) { lclWorkSize[0] = cacheLineMatch; thrPerGrp /= cacheLineMatch; lclWorkSize[maxDim] = thrPerGrp; for (uint d = 1; d < workDim; ++d) { if (d != maxDim) { lclWorkSize[d] = 1; } } } else { // Check if a local workgroup has the most optimal size if (thrPerGrp > maxSize) { thrPerGrp = maxSize; } lclWorkSize[maxDim] = thrPerGrp; for (uint d = 0; d < workDim; ++d) { if (d != maxDim) { lclWorkSize[d] = 1; } } } } } } else { // Use overrides when app doesn't provide workgroup dimensions if (workDim == 1) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; } else if (workDim == 2) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; } else if (workDim == 3) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; } else { assert(0 && "Invalid workDim!"); } } } } else { for (uint d = 0; d < workDim; ++d) { lclWorkSize[d] = workGroupInfo()->compileSize_[d]; } } } void Kernel::setupProgramGrid( VirtualGPU& gpu, size_t workDim, const amd::NDRange& glbWorkOffset, const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize, const amd::NDRange& groupOffset, const amd::NDRange& glbWorkOffsetOrg, const amd::NDRange& glbWorkSizeOrg ) const { // ABI is always in CB0 address cbBuf = gpu.cb(0)->sysMemCopy(); uint* pGlobalSize = reinterpret_cast (cbBuf + GlobalWorkitemOffset * ConstBuffer::VectorSize); uint* pLocalSize = reinterpret_cast (cbBuf + LocalWorkitemOffset * ConstBuffer::VectorSize); uint* pNumGroups = reinterpret_cast (cbBuf + GroupsOffset * ConstBuffer::VectorSize); uint* pGlobalOffset = reinterpret_cast (cbBuf + GlobalWorkOffsetOffset * ConstBuffer::VectorSize); uint* pGroupOffset = reinterpret_cast (cbBuf + GroupWorkOffsetOffset * ConstBuffer::VectorSize); uint32_t* debugInfo = reinterpret_cast (cbBuf + DebugOffset * ConstBuffer::VectorSize); uint* pNDRangeGlobalOffset = reinterpret_cast (cbBuf + NDRangeGlobalWorkOffsetOffset * ConstBuffer::VectorSize); // Check for 64-bit metadata uint glbABIShift = (abi64Bit()) ? 1 : 0; VirtualGPU::CalVirtualDesc* progGrid = &gpu.cal_; // Finds local workgroup size findLocalWorkSize(workDim, gblWorkSize, lclWorkSize); // Initialize the execution grid block and size/offset pGlobalSize[0] = pGlobalSize[1] = pGlobalSize[2] = 1; pGlobalSize[3] = static_cast(workDim); pLocalSize[0] = pLocalSize[1] = pLocalSize[2] = 1; pLocalSize[3] = 0; pNumGroups[0] = pNumGroups[1] = pNumGroups[2] = 1; pNumGroups[3] = 0; pGlobalOffset[2] = pGlobalOffset[1] = pGlobalOffset[0] = 0; pGroupOffset[2] = pGroupOffset[1] = pGroupOffset[0] = 0; progGrid->gridBlock.width = progGrid->gridBlock.height = progGrid->gridBlock.depth = 1; progGrid->gridSize.width = progGrid->gridSize.height = progGrid->gridSize.depth = 1; progGrid->partialGridBlock.width = progGrid->partialGridBlock.height = progGrid->partialGridBlock.depth = 1; bool partialGrid = false; // Fill the right values, based on the application request switch (workDim) { case 3: pLocalSize[2] = progGrid->gridBlock.depth = static_cast(lclWorkSize[2]); pGlobalSize[2] = static_cast(glbWorkSizeOrg[2]); progGrid->gridSize.depth = static_cast(gblWorkSize[2]); progGrid->gridSize.depth /= progGrid->gridBlock.depth; pNumGroups[2] = pGlobalSize[2] / progGrid->gridBlock.depth; pGlobalOffset[2] = glbWorkOffset[2]; pGroupOffset[2] = groupOffset[2]; pNDRangeGlobalOffset[2 + glbABIShift] = glbWorkOffsetOrg[2]; if (dev().settings().partialDispatch_) { // Check if partial workgroup dispatch is required progGrid->partialGridBlock.depth = gblWorkSize[2] % lclWorkSize[2]; if (progGrid->partialGridBlock.depth != 0) { partialGrid = true; // Increment the number of groups progGrid->gridSize.depth++; pNumGroups[2]++; } else { progGrid->partialGridBlock.depth = lclWorkSize[2]; } } // Fall through to fill 2D and 1D dimensions... case 2: pLocalSize[1] = progGrid->gridBlock.height = static_cast(lclWorkSize[1]); pGlobalSize[1] = static_cast(glbWorkSizeOrg[1]); progGrid->gridSize.height = static_cast(gblWorkSize[1]); progGrid->gridSize.height /= progGrid->gridBlock.height; pNumGroups[1] = pGlobalSize[1] / progGrid->gridBlock.height; pGlobalOffset[1] = glbWorkOffset[1]; pGroupOffset[1] = groupOffset[1]; pNDRangeGlobalOffset[1 + glbABIShift] = glbWorkOffsetOrg[1]; if (dev().settings().partialDispatch_) { // Check if partial workgroup dispatch is required progGrid->partialGridBlock.height = gblWorkSize[1] % lclWorkSize[1]; if (progGrid->partialGridBlock.height != 0) { partialGrid = true; // Increment the number of groups progGrid->gridSize.height++; pNumGroups[1]++; } else { progGrid->partialGridBlock.height = lclWorkSize[1]; } } // Fall through to fill 1D dimension... case 1: pLocalSize[0] = progGrid->gridBlock.width = static_cast(lclWorkSize[0]); pGlobalSize[0] = static_cast(glbWorkSizeOrg[0]); progGrid->gridSize.width = static_cast(gblWorkSize[0]); progGrid->gridSize.width /= progGrid->gridBlock.width; pNumGroups[0] = pGlobalSize[0] / progGrid->gridBlock.width; pGlobalOffset[0] = glbWorkOffset[0]; pGroupOffset[0] = groupOffset[0]; pNDRangeGlobalOffset[0 + glbABIShift] = glbWorkOffsetOrg[0]; if (dev().settings().partialDispatch_) { // Check if partial workgroup dispatch is required progGrid->partialGridBlock.width = gblWorkSize[0] % lclWorkSize[0]; if (progGrid->partialGridBlock.width != 0) { partialGrid = true; // Increment the number of groups progGrid->gridSize.width++; pNumGroups[0]++; } else { progGrid->partialGridBlock.width = lclWorkSize[0]; } } break; default: LogWarning("Wrong dimensions. Force to 1x1x1!"); break; } if (!partialGrid) { progGrid->partialGridBlock.width = progGrid->partialGridBlock.height = progGrid->partialGridBlock.depth = 0; } // Calculate the total number of workitems and workgroups pGlobalOffset[3] = pGroupOffset[3] = 1; for (uint i = 0; i < workDim; ++i) { pGlobalOffset[3] *= pGlobalOffset[i]; pGroupOffset[3] *= pGroupOffset[i]; } // Setup debug output buffer (if printf is active) if (flags() & PrintfOutput) { if (abi64Bit()) { // Setup the debug info in constant buffer reinterpret_cast(debugInfo)[1] = gpu.printfDbg().bufOffset(); // Size in DWORDs debugInfo[4] = static_cast(gpu.printfDbg().wiDbgSize()); debugInfo[4] /= sizeof(uint32_t); } else { // Setup the debug info in constant buffer debugInfo[1] = static_cast(gpu.printfDbg().bufOffset()); // Size in DWORDs debugInfo[2] = static_cast(gpu.printfDbg().wiDbgSize()); debugInfo[2] /= sizeof(uint32_t); } } } bool Kernel::initParameters() { size_t offset = 0; device::Kernel::parameters_t params; amd::KernelParameterDescriptor desc; for (uint i = 0; i < arguments_.size(); ++i) { const KernelArg* arg = argument(i); // Initialize the arguments for the abstraction layer if (arg->isCbNeeded()) { desc.name_ = arg->name_.data(); desc.type_ = arg->type(); desc.size_ = arg->size(false); desc.addressQualifier_ = arg->addressQualifier(); desc.accessQualifier_ = arg->accessQualifier(); desc.typeName_ = arg->typeName(); desc.typeQualifier_ = arg->typeQualifier(); // Make offset alignment to match CPU metadata, since // in multidevice config abstraction layer has a single signature // and CPU sends the paramaters as they are allocated in memory size_t size = desc.size_; if (size == 0) { // Local memory for CPU size = sizeof(cl_mem); } offset = amd::alignUp(offset, std::min(size, size_t(16))); desc.offset_ = offset; offset += amd::alignUp(size, sizeof(uint32_t)); params.push_back(desc); } } // Report the allocated local memory size (emulated and hw) if (hwLocalSize_ != 0) { CondLog((dev().info().localMemSize_ < hwLocalSize_), "Requested local size is bigger than reported"); workGroupInfo_.localMemSize_ = hwLocalSize_; } if (!createSignature(params)) { return false; } return true; } bool Kernel::bindGlobalHwCb( VirtualGPU& gpu, VirtualGPU::GslKernelDesc* desc) const { bool result = true; // Bind HW constant buffers used for the global data store const Program::HwConstBuffers& gds = prog().glbHwCb(); for (Program::HwConstBuffers::const_iterator it = gds.begin(); (it != gds.end() && result); ++it) { uint idx = it->first; result = bindResource(gpu, *(it->second), idx, ConstantBuffer, idx); } return result; } bool Kernel::bindConstantBuffers(VirtualGPU& gpu) const { bool result = true; assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more CBs for arguments!"); // Upload the parameters to HW and bind all constant buffers for (uint i = 0; i < numCb_; i++) { ConstBuffer* cb = gpu.constBufs_[i]; result &= cb->uploadDataToHw(cbSizes_[i]) && bindResource(gpu, *cb, i, ConstantBuffer, i, cb->wrtOffset()); } return result; } void Kernel::processMemObjects( VirtualGPU& gpu, const amd::Kernel& kernel, const_address params, bool nativeMem) const { VirtualGPU::MemoryDependency& dependecy = gpu.memoryDependency(); // Mark the tracker with a new kernel, // so we can avoid checks of the aliased objects gpu.memoryDependency().newKernel(); // Check all parameters for the current kernel const amd::KernelSignature& signature = kernel.signature(); for (size_t i = 0; i < signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); const KernelArg* arg = argument(i); Memory* memory = NULL; // Find if current argument is a buffer if ((desc.type_ == T_POINTER) && (arg->type_ != KernelArg::PointerLocal) && (arg->type_ != KernelArg::PointerHwLocal)) { if (nativeMem) { memory = *reinterpret_cast(params + desc.offset_); } else if (*reinterpret_cast (params + desc.offset_) != NULL) { memory = dev().getGpuMemory(*reinterpret_cast (params + desc.offset_)); // Synchronize data with other memory instances if necessary memory->syncCacheFromHost(gpu); } if (memory != NULL) { // Validate memory for a dependency in the queue gpu.memoryDependency().validate(gpu, memory, arg->memory_.readOnly_); } } } } bool Kernel::loadParameters( VirtualGPU& gpu, const amd::Kernel& kernel, const_address params, bool nativeMem) const { bool result = true; uint i; // Initialize local private ranges if (!initLocalPrivateRanges(gpu)) { return false; } if ((UavIdUndefined != uavRaw_) && (!(flags() & PrintfOutput) || (printfId_ != UavIdUndefined))) { Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); // Bind a buffer for a dummy read result = bindResource(gpu, *gpuMemory, 0, ArgumentUavID, uavRaw_); } // Find all parameters for the current kernel const amd::KernelSignature& signature = kernel.signature(); for (i = 0; i != signature.numParameters(); ++i) { const amd::KernelParameterDescriptor& desc = signature.at(i); // Set current argument if (!setArgument(gpu, i, params + desc.offset_, desc.size_, nativeMem)) { result = false; break; } } if (result) { // Update the ring ranges and math constant setLocalPrivateRanges(gpu); result = bindConstantBuffers(gpu); if (flags() & PrivateFixed) { result &= bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_); } // Setup debug output buffer (if printf is active) if (flags() & PrintfOutput) { gpu.addVmMemory(gpu.printfDbg().dbgBuffer()); } } return result; } bool Kernel::run(VirtualGPU& gpu, GpuEvent* calEvent, bool lastRun) const { const VirtualGPU::CalVirtualDesc* dispatch = gpu.cal(); auto compProg = static_cast(gpu.gslKernelDesc()->func_); compProg->setWavesPerSH(waveLimiter_.getWavesPerSH(&gpu)); gpu.eventBegin(MainEngine); gpu.rs()->Dispatch(gpu.cs(), &dispatch->gridBlock, &dispatch->partialGridBlock, &dispatch->gridSize, dispatch->localSize, gpu.vmMems(), dispatch->memCount_); gpu.eventEnd(MainEngine, *calEvent); // Unbind all resources unbindResources(gpu, *calEvent, lastRun); return true; } static size_t counter = 0; void Kernel::debug(VirtualGPU& gpu) const { std::fstream stubWrite; address src = NULL; std::cerr << "--- " << name_ << " ---" << std::endl; for (uint i = 0; i < arguments_.size(); ++i) { const KernelArg* arg = argument(i); const Memory* gpuMem = gpu.slots_[i].memory_; std::stringstream fileName; bool bufferObj = ((arg->type_ == KernelArg::PointerGlobal) || (arg->type_ == KernelArg::PointerConst) || (arg->type_ == KernelArg::PointerHwConst)); if ((src != NULL) && arg->isCbNeeded() && bufferObj) { address memory = gpu.cb(arg->cbIdx_)->sysMemCopy(); std::cerr.setf(std::ios::hex); uint* location = reinterpret_cast (src + *reinterpret_cast(memory + arg->cbPos_)); std::cerr << " > " << arg->name_ << ": 0x" << location << std::endl; // Dump the data fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin"; stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); // Write data to a file if (stubWrite.is_open()) { stubWrite.write( reinterpret_cast(location), gpuMem->size()); stubWrite.close(); } } if (((arg->type_ >= KernelArg::Image1D) && (arg->type_ <= KernelArg::Image3D)) || ((src == NULL) && bufferObj)) { //@todo Replace the current map Memory* resource = const_cast(gpu.slots_[i].memory_); void* memory = resource->map(&gpu); uint* location = reinterpret_cast(memory); std::cerr << " > " << arg->name_ << (bufferObj ? ": buffer" : ": image") << std::endl; // Dump the data fileName << counter << "_kernel_" << name() << "_" << arg->name_ << "_" << location << ".bin"; stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); // Write data to a file if (stubWrite.is_open()) { stubWrite.write( reinterpret_cast(location), gpuMem->size()); stubWrite.close(); } resource->unmap(&gpu); } } for (uint i = 0; i < gpu.constBufs_.size(); ++i) { std::stringstream fileName; fileName << counter++ << "_kernel_" << name() << "_const" << i << ".bin"; stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); if (stubWrite.is_open()) { address memory = reinterpret_cast
(gpu.constBufs_[i]->map(&gpu, Resource::ReadOnly)); // Check if we have OpenCL program stubWrite.write(reinterpret_cast(memory+gpu.cb(i)->wrtOffset()), gpu.cb(i)->lastWrtSize()); gpu.constBufs_[i]->unmap(&gpu); stubWrite.close(); } } const Program::HwConstBuffers& gds = prog().glbHwCb(); for (Program::HwConstBuffers::const_iterator it = gds.begin(); it != gds.end(); ++it) { uint idx = it->first; std::stringstream fileName; fileName << counter++ << "_kernel_" << name() << "_const" << idx << ".bin"; stubWrite.open(fileName.str().c_str(), (std::fstream::out | std::fstream::binary)); if (stubWrite.is_open()) { address memory = reinterpret_cast
((it->second)->map(&gpu, Resource::ReadOnly)); // Check if we have OpenCL program stubWrite.write(reinterpret_cast(memory), (it->second)->size()); (it->second)->unmap(&gpu); stubWrite.close(); } } } bool Kernel::initConstBuffers() { bool result = true; size_t i; assert((numCb_ != 0) && "We have 0 constant buffers!"); // Allocate an array for CB sizes cbSizes_ = new size_t[numCb_]; if (cbSizes_ == NULL) { return false; } memset(cbSizes_, 0, sizeof(size_t) * numCb_); // CB0 is reserved for ABI data cbSizes_[0] = TotalABIVectors * ConstBuffer::VectorSize; // Find sizes of all constant buffers for (i = 0; i < arguments_.size(); ++i) { const KernelArg* arg = argument(i); size_t size = arg->cbPos_ + arg->size(true); size_t specVec = arg->specialVector(); if (specVec != 0) { size = arg->cbPos_ + (arg->size_ / KernelArg::VectorSizeLimit) * ConstBuffer::VectorSize; } // Do we need a CB? if (arg->isCbNeeded() && (cbSizes_[arg->cbIdx_] < size)) { cbSizes_[arg->cbIdx_] = size; } } return result; } bool Kernel::setInternalSamplers(VirtualGPU& gpu) const { for (uint i = 0; i < samplerSize(); ++i) { const KernelArg* arg = sampler(i); uint state = arg->cbPos_; uint idx = arg->index_; if (gpu.cal()->samplersState_[idx] != state) { setSampler(gpu, state, idx); gpu.cal_.samplersState_[idx] = state; } } return true; } bool Kernel::setArgument( VirtualGPU& gpu, uint idx, const void* param, size_t size, bool nativeMem) const { bool result = true; const KernelArg* arg; address memory; size_t argSize; static const bool waitOnBusyEngine = true; assert((idx < arguments_.size()) && "Param index is out of range!"); arg = argument(idx); assert((arg->cbIdx_ == 1) && "Runtime supports CB1 only for the arguments buffer!"); memory = gpu.cb(1)->sysMemCopy(); argSize = arg->size(true); // Bind the global heap for emulation mode switch (arg->type_) { case KernelArg::PointerLocal: case KernelArg::PointerPrivate: if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { return false; } // Fall through ... default: break; } switch (arg->type_) { case KernelArg::PointerConst: case KernelArg::PointerHwConst: case KernelArg::PointerGlobal: { gpu::Memory* gpuMem = NULL; if (nativeMem) { gpuMem = *reinterpret_cast(param); } else if (*reinterpret_cast(param) != NULL) { gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); } bool forceZeroOffset = false; if (gpuMem == NULL) { forceZeroOffset = true; gpuMem = dev().getGpuMemory(dev().dummyPage()); } uint64_t offset = gpuMem->pinOffset(); // Make sure the passed argument is a buffer object if (!gpuMem->cal()->buffer_) { LogError("The kernel buffer argument isn't a buffer object!"); return false; } if (arg->type_ == KernelArg::PointerHwConst) { // Bind current memory object with the kernel if (!bindResource(gpu, *gpuMem, idx, ArgumentConstBuffer, arg->index_)) { return false; } assert((offset == 0) && "No offset for HW CB"); // Add a fake offset to make sure (ptr != NULL) is TRUE offset = 1; } else { ResourceType type = ArgumentHeapBuffer; // Check if kernel expects UAV binding if (arg->memory_.uavBuf_) { type = ArgumentBuffer; } else { // Bind global buffer to UAV this buffer is bound to if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { return false; } } // Bind current memory object with the kernel // Note: it's a fake binding, if the buffer is part of // the global heap if (!bindResource(gpu, *gpuMem, idx, type, arg->index_)) { return false; } // Update offset only if we bind HeapBuffer or // it's global address space in UAV setup on SI+ offset += gpuMem->hbOffset(); if (!forceZeroOffset) { assert((offset != 0) && "Offset 0 with a real allocation!"); } gpu.addVmMemory(gpuMem); } // Wait for resource if it was used on an inactive engine //! \note syncCache may call DRM transfer gpuMem->wait(gpu, waitOnBusyEngine); if (forceZeroOffset) { offset = 0; } // Copy memory offset into the constant buffer if (abi64Bit()) { *(reinterpret_cast(memory + arg->cbPos_)) = offset; } else { *(reinterpret_cast(memory + arg->cbPos_)) = static_cast(offset); } } break; case KernelArg::Image1D: case KernelArg::Image2D: case KernelArg::Image3D: case KernelArg::Image1DB: case KernelArg::Image1DA: case KernelArg::Image2DA: { gpu::Memory* gpuMem = NULL; if (nativeMem) { gpuMem = *reinterpret_cast(param); } else if (*reinterpret_cast(param) != NULL) { gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); } if (gpuMem == NULL) { return false; } // Make sure the passed argument is an image object if (gpuMem->cal()->buffer_) { LogError("The kernel image argument isn't an image object!"); return false; } ResourceType resType = arg->memory_.readOnly_ ? ArgumentImageRead : ArgumentImageWrite; // Bind current memory object with the shader. if (!bindResource(gpu, *gpuMem, idx, resType, arg->index_)) { return false; } // Wait for resource if it was used on an inactive engine //! \note syncCache may call DRM transfer gpuMem->wait(gpu, waitOnBusyEngine); // Copy image constants into the constant buffer if (gpuMem->owner() != NULL) { copyImageConstants(gpuMem->owner()->asImage(), reinterpret_cast(memory + arg->cbPos_)); } } break; case KernelArg::Sampler: { amd::Sampler* amdSampler = *reinterpret_cast(param); uint idx = arg->index_; uint32_t state = amdSampler->state(); if (state != gpu.cal()->samplersState_[idx]) { setSampler(gpu, state, idx); gpu.cal_.samplersState_[idx] = state; } // Copy sampler state into the constant buffer *(reinterpret_cast(memory + arg->cbPos_)) = state; } break; case KernelArg::Counter: { gpu::Memory* gpuMem = NULL; if (nativeMem) { gpuMem = *reinterpret_cast(param); } else if (*reinterpret_cast(param) != NULL) { gpuMem = dev().getGpuMemory(*reinterpret_cast(param)); } // Wait for resource if it was used on an inactive engine //! \note syncCache may call DRM transfer gpuMem->wait(gpu, waitOnBusyEngine); // Bind current memory object with the shader. if (!bindResource(gpu, *gpuMem, idx, ArgumentCounter, idx)) { return false; } } break; case KernelArg::PointerHwLocal: { // Calculate current offset in the local ring uint offset = gpu.cal_.localSize; uint extra = amd::alignUp(offset, arg->alignment_) - offset; offset = amd::alignUp(offset, arg->alignment_); size_t memSize = *static_cast(param); // Allocate new memory from the local ring gpu.cal_.localSize += static_cast(memSize) + extra; // Copy current local argument's offset into the CB *(reinterpret_cast(memory + arg->cbPos_)) = offset; CondLog((gpu.cal_.localSize > dev().info().localMemSize_), "Requested local size is bigger than reported!"); } break; case KernelArg::Float: case KernelArg::Double: case KernelArg::Char: case KernelArg::UChar: case KernelArg::Short: case KernelArg::UShort: case KernelArg::Int: case KernelArg::UInt: case KernelArg::Long: case KernelArg::ULong: if (size != argSize) { LogWarning("Parameter's sizes are unmatched!"); } // Fall through ... case KernelArg::Struct: case KernelArg::Union: { size_t specVec = arg->specialVector(); if (specVec != 0) { uint iter = (arg->size_ / KernelArg::VectorSizeLimit); for (uint i = 0; i < iter; ++i) { amd::Os::fastMemcpy((memory + arg->cbPos_ + i * ConstBuffer::VectorSize), reinterpret_cast(param) + i * KernelArg::VectorSizeLimit * specVec, specVec * KernelArg::VectorSizeLimit); } } else { // Copy data into the CB amd::Os::fastMemcpy((memory + arg->cbPos_), param, size); } } break; default: LogError("Unhandled argument's type!"); break; } return result; } bool Kernel::initLocalPrivateRanges(VirtualGPU& gpu) const { // Initialize HW local gpu.cal_.localSize = hwLocalSize_; // Bind the global buffer if emulated local or private memory // was allocated by the kernel if ((flags() & PrintfOutput && (printfId_ == UavIdUndefined)) && (uavRaw_ != UavIdUndefined)) { if (!bindResource(gpu, dev().globalMem(), 0, GlobalBuffer, uavRaw_)) { return false; } } // Bind the global buffer if emulated constant buffers are enabled if (cbId_ != UavIdUndefined) { if (!bindResource(gpu, dev().globalMem(), 0, ArgumentCbID, cbId_)) { return false; } } // Bind the printf buffer if (printfId_ != UavIdUndefined) { if (!bindResource(gpu, dev().globalMem(), 0, ArgumentPrintfID, printfId_)) { return false; } } // Initialize the iterations count gpu.cal_.iterations_ = 1; return true; } void Kernel::setLocalPrivateRanges(VirtualGPU& gpu) const { address cbBuf = gpu.cb(0)->sysMemCopy(); uint* data; uint gridSize = gpu.cal()->gridSize.width * gpu.cal()->gridSize.height * gpu.cal()->gridSize.depth; uint blockSize = gpu.cal()->gridBlock.width * gpu.cal()->gridBlock.height * gpu.cal()->gridBlock.depth; //! \todo validate if the compiler still generates PrivateFixed if (flags() & PrivateFixed) { // Update private ring data = reinterpret_cast (cbBuf + PrivateRingOffset * ConstBuffer::VectorSize); Memory* gpuMemory = dev().getGpuMemory(dev().dummyPage()); if (abi64Bit()) { reinterpret_cast(data)[0] = gpuMemory->hbOffset(); data[2] = 0; data[3] = 0; } else { data[0] = static_cast(gpuMemory->hbOffset()); data[1] = 0; data[2] = data[3] = 0; } gpu.addVmMemory(gpuMemory); } // Copy the math lib constants amd::Os::fastMemcpy( (cbBuf + MathLibOffset * ConstBuffer::VectorSize), MathLibConst, sizeof(MathLibConst)); // Update the offset to the global data if (prog().glbData() != NULL) { gpu.addVmMemory(prog().glbData()); uint64_t glbDataOffset = prog().glbData()->hbOffset(); if (abi64Bit()) { *reinterpret_cast(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) = glbDataOffset; } else { *reinterpret_cast(cbBuf + GlobalDataStoreOffset * ConstBuffer::VectorSize) = static_cast(glbDataOffset); } } // Split workload if it was requested if ((gpu.cal_.iterations_ < 2) && gpu.dmaFlushMgmt().dispatchSplitSize() != 0) { uint totalSize = gridSize * blockSize; if (totalSize > gpu.dmaFlushMgmt().dispatchSplitSize()) { gpu.cal_.iterations_ = std::max(gpu.cal_.iterations_, (totalSize / gpu.dmaFlushMgmt().dispatchSplitSize())); } } // Initialize the number of iterations to the grid size if (flags() & PrintfOutput) { gpu.cal_.iterations_ = gridSize; } } void Kernel::setSampler( VirtualGPU& gpu, uint32_t state, uint physUnit ) const { // All CAL sampler's parameters are in floats float gslAddress = GSL_CLAMP_TO_BORDER; float gslMinFilter = GSL_MIN_NEAREST; float gslMagFilter = GSL_MAG_NEAREST; state &= ~amd::Sampler::StateNormalizedCoordsMask; // Program the sampler address mode switch (state & amd::Sampler::StateAddressMask) { case amd::Sampler::StateAddressRepeat: gslAddress = GSL_REPEAT; break; case amd::Sampler::StateAddressClampToEdge: gslAddress = GSL_CLAMP_TO_EDGE; break; case amd::Sampler::StateAddressMirroredRepeat: gslAddress = GSL_MIRRORED_REPEAT; break; case amd::Sampler::StateAddressClamp: case amd::Sampler::StateAddressNone: default: break; } state &= ~amd::Sampler::StateAddressMask; gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_S, &gslAddress); gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_T, &gslAddress); gpu.setSamplerParameter(physUnit, GSL_TEXTURE_WRAP_R, &gslAddress); // Program texture filter mode if (state == amd::Sampler::StateFilterLinear) { gslMinFilter = GSL_MIN_LINEAR; gslMagFilter = GSL_MAG_LINEAR; } gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MIN_FILTER, &gslMinFilter); gpu.setSamplerParameter(physUnit, GSL_TEXTURE_MAG_FILTER, &gslMagFilter); } bool Kernel::bindResource( VirtualGPU& gpu, const Memory& memory, uint paramIdx, ResourceType type, uint physUnit, size_t offset) const { gslUAVType uavType = GSL_UAV_TYPE_UNKNOWN; // Find the original resource name from the IL program switch (type) { case GlobalBuffer: if (gpu.state_.boundGlobal_) { return true; } gpu.state_.boundGlobal_ = true; physUnit = uavRaw_; uavType = GSL_UAV_TYPE_TYPELESS; break; case ArgumentCbID: if (gpu.state_.boundCb_) { return true; } gpu.state_.boundCb_ = true; physUnit = cbId_; uavType = GSL_UAV_TYPE_TYPELESS; break; case ArgumentPrintfID: if (gpu.state_.boundPrintf_) { return true; } gpu.state_.boundPrintf_ = true; physUnit = printfId_; uavType = GSL_UAV_TYPE_TYPELESS; break; case ArgumentHeapBuffer: case ArgumentBuffer: case ArgumentImageRead: case ArgumentImageWrite: case ArgumentConstBuffer: case ArgumentCounter: // Early exit if resource is bound already if (gpu.slots_[paramIdx].state_.bound_) { return true; } // Associate resource with the slot gpu.slots_[paramIdx].memory_ = &memory; // Mark resource as bound gpu.slots_[paramIdx].state_.bound_ = true; if (type == ArgumentCounter) { GpuEvent calEvent; // Bind memory with atomic counter gpu.cs()->bindAtomicCounter(argument(paramIdx)->index_, memory.gslResource()); // Copy the counter value into GDS gpu.eventBegin(MainEngine); gpu.cs()->syncAtomicCounter(argument(paramIdx)->index_, false); gpu.eventEnd(MainEngine, calEvent); // Mark resource as busy memory.setBusy(gpu, calEvent); return true; } else if (type == ArgumentHeapBuffer) { // We return here, since we just have to bind the global heap return true; } else if (type == ArgumentConstBuffer) { gpu.slots_[paramIdx].state_.constant_ = true; } break; case ArgumentUavID: case ConstantBuffer: break; default: LogPrintfError("Unspecified argument type ()!", type); return false; } gslMemObject gslMem = NULL; // Use global address space on SI+ for UAV setup if ((type == ArgumentBuffer) || (type == ArgumentCbID) || (type == ArgumentUavID) || (type == ArgumentPrintfID)) { gslMem = dev().heap().resource().gslResource(); } else { gslMem = memory.gslResource(); } // Associate memory with the physical unit, the actual binding bool result = true; switch (type) { case GlobalBuffer: case ArgumentBuffer: case ArgumentImageWrite: case ArgumentUavID: case ArgumentCbID: case ArgumentPrintfID: if (type == ArgumentImageWrite) { uavType = GSL_UAV_TYPE_TYPED; } else if ((type == ArgumentBuffer) || (type == ArgumentUavID)) { uavType = GSL_UAV_TYPE_TYPELESS; } if (gpu.cal_.uavs_[physUnit] != gslMem) { result = gpu.setUAVBuffer(physUnit, gslMem, uavType); gpu.setUAVChannelOrder(physUnit, gslMem); gpu.cal_.uavs_[physUnit] = gslMem; } break; case ConstantBuffer: case ArgumentConstBuffer: if ((gpu.cal_.constBuffers_[physUnit] != gslMem) || (offset != 0)) { result = gpu.setConstantBuffer(physUnit, gslMem, offset, memory.hbSize()); gpu.cal_.constBuffers_[physUnit] = gslMem; } break; case ArgumentImageRead: if (gpu.cal_.readImages_[physUnit] != gslMem) { result = gpu.setInput(physUnit, gslMem); gpu.cal_.readImages_[physUnit] = gslMem; } break; default: result = false; assert(false); break; } if (!result) { LogPrintfError("setMem failed unit:%d mem:0x%08x!", physUnit, gslMem); return false; } return true; } void Kernel::unbindResources( VirtualGPU& gpu, GpuEvent calEvent, bool lastRun) const { // Make sure unbind occurs on the last run, in case the execution had a split if (lastRun) { for (uint i = 0; i < arguments_.size(); ++i) { if (gpu.slots_[i].state_.bound_) { GpuEvent calEventTmp = calEvent; if (KernelArg::Counter == argument(i)->type_) { // Copy the counter value from GDS gpu.eventBegin(MainEngine); gpu.cs()->syncAtomicCounter(argument(i)->index_, true); gpu.eventEnd(MainEngine, calEventTmp); } else if (!(gpu.slots_[i].state_.constant_ || argument(i)->memory_.readOnly_)) { // Signal the abstraction layer that GPU memory is dirty if (gpu.slots_[i].memory_->owner() != NULL) { gpu.slots_[i].memory_->owner()->signalWrite(&gpu.dev()); } } // Mark resource as busy gpu.slots_[i].memory_->setBusy(gpu, calEventTmp); gpu.slots_[i].state_.value_ = 0; } } // Unbind the global buffer gpu.state_.boundGlobal_ = false; // Unbind the constant buffer gpu.state_.boundCb_ = false; // Unbind the pritnf buffer gpu.state_.boundPrintf_ = false; } // Mark CB busy for (uint i = 0; i < numCb_; ++i) { gpu.constBufs_[i]->setBusy(gpu, calEvent); } // Set the event object for the scratch buffer if (workGroupInfo()->scratchRegs_ > 0) { dev().scratch(gpu.hwRing())->memObj_->setBusy(gpu, calEvent); } } void Kernel::copyImageConstants( const amd::Image* amdImage, ImageConstants* imageData ) const { imageData->width_ = static_cast(amdImage->getWidth()); imageData->height_ = static_cast(amdImage->getHeight()); imageData->depth_ = static_cast(amdImage->getDepth()); imageData->dataType_ = static_cast(amdImage->getImageFormat().image_channel_data_type); imageData->widthFloat_ = 1.f / static_cast(amdImage->getWidth()); imageData->heightFloat_ = 1.f / static_cast(amdImage->getHeight()); imageData->depthFloat_ = 1.f / static_cast(amdImage->getDepth()); imageData->channelOrder_ = static_cast(amdImage->getImageFormat().image_channel_order); } union MetadataVersion { struct { uint64_t revision_: 16; //!< LLVM metadata revision uint64_t minorVersion_: 16; //!< LLVM metadata minor verison uint64_t majorVersion_: 16; //!< LLVM metadata major version }; uint64_t value_; MetadataVersion(uint mj, uint mi, uint rev): value_(0) { revision_ = rev; minorVersion_ = mi; majorVersion_ = mj; } MetadataVersion(): value_(0) {} }; //! Version of metadata with buffer attributes const MetadataVersion MetadataBufferAttributes = MetadataVersion(2, 0, 88); //! Version of metadata with type qualifiers const MetadataVersion MetadataTypeQualifiers = MetadataVersion(3, 1, 103); bool NullKernel::parseArguments(const std::string& metaData, uint* uavRefCount) { // Initialize workgroup info workGroupInfo_.size_ = nullDev().info().maxWorkGroupSize_; MetadataVersion mdVersion; // Find first tag size_t pos = metaData.find(";"); // Loop through all provided program arguments while (pos != std::string::npos) { KernelArg arg; if (!expect(metaData, &pos, ";")) { break; } arg.type_ = KernelArg::None; // Loop through all available metadata types for (uint i = 0; i < ArgStateTotal; ++i) { uint tmpValue; // Find the name tag if (expect(metaData, &pos, ArgState[i].typeName_)) { switch (ArgState[i].type_) { case KernelArg::None: // Process next ... continue; case KernelArg::Reflection: { uint argIdx; // Read the argument's index if (!getuint(metaData, &pos, &argIdx)) { LogWarning("Couldn't get the argument index!"); return false; } KernelArg* tmpArg = arguments_[argIdx]; if (!getstring(metaData, &pos, &tmpArg->typeName_)) { LogWarning("Couldn't get the argument type!"); return false; } } continue; case KernelArg::ConstArg: { uint argIdx; // Read the argument's index if (!getuint(metaData, &pos, &argIdx)) { LogWarning("Couldn't get the argument index!"); return false; } KernelArg* tmpArg = arguments_[argIdx]; tmpArg->typeQualifier_ |= CL_KERNEL_ARG_TYPE_CONST; } continue; case KernelArg::Grouping: for (uint j = 0; j < 3; ++j) { uint temp; // Read the compile workgroup size if (!getuint(metaData, &pos, &temp)) { LogWarning("Couldn't get the compile workgroup size!"); return false; } workGroupInfo_.compileSize_[j] = temp; } // Process next ... continue; case KernelArg::WrkgrpSize: { uint temp; // Read the workgroup size if (!getuint(metaData, &pos, &temp)) { LogWarning("Couldn't get the workgroup size!"); return false; } workGroupInfo_.size_ = temp; } // Process next ... continue; case KernelArg::Wavefront: // Process next ... continue; case KernelArg::UavId: // Read index if (!getuint(metaData, &pos, &arg.index_)) { return false; } break; case KernelArg::ConstBufId: // Read index if (!getuint(metaData, &pos, &cbId_)) { return false; } continue; case KernelArg::PrintfBufId: // Read index if (!getuint(metaData, &pos, &printfId_)) { return false; } continue; case KernelArg::MetadataVersion: // Read metadata version if (!getuint(metaData, &pos, &tmpValue)) { return false; } mdVersion.majorVersion_ = tmpValue; if (!getuint(metaData, &pos, &tmpValue)) { return false; } mdVersion.minorVersion_ = tmpValue; if (!getuint(metaData, &pos, &tmpValue)) { return false; } mdVersion.revision_ = tmpValue; // Process next ... continue; case KernelArg::GroupingHint: for (uint j = 0; j < 3; ++j) { uint temp; // Read the compile workgroup size hint if (!getuint(metaData, &pos, &temp)) { LogWarning("Couldn't get the compile workgroup size hint!"); return false; } workGroupInfo_.compileSizeHint_[j] = temp; } // Process next ... continue; case KernelArg::VecTypeHint: { std::string temp; // Read the compile vector type hint if (!getstring(metaData, &pos, &temp)) { LogWarning("Couldn't get the compile vector type hint!"); return false; } workGroupInfo_.compileVecTypeHint_ = temp; } // Process next ... continue; case KernelArg::WavesPerSimdHint: { uint tmp; if (!getuint(metaData, &pos, &tmp)) { return false; } workGroupInfo_.wavesPerSimdHint_ = tmp; } continue; default: break; } std::string argName; // Save the argument type arg.type_ = ArgState[i].type_; // Check if we should expect the name if (ArgState[i].name_) { // Read the parameter's name if (!getword(metaData, &pos, argName)) { LogWarning("Couldn't get a kernel argument!"); return false; } arg.name_ = argName; } if (arg.type_ == KernelArg::Sampler) { if (!getuint(metaData, &pos, &arg.index_)) { LogWarning("Couldn't get a kernel argument!"); return false; } if (!getuint(metaData, &pos, &arg.location_)) { LogWarning("Couldn't get a kernel argument!"); return false; } if (!getuint(metaData, &pos, &arg.cbPos_)) { LogWarning("Couldn't get a kernel argument!"); return false; } } // Check if we should expect the resource data type if (ArgState[i].resType_) { uint k; // Search for the data type for (k = 0; k < DataTypeTotal; k++) { if (expect(metaData, &pos, DataType[k].tagName_)) { arg.dataType_ = DataType[k].type_; if (arg.type_ == KernelArg::Image) { flags_ |= ImageEnable; if (expect(metaData, &pos, "RO:")) { arg.memory_.readOnly_ = 1; } else if (expect(metaData, &pos, "RW:")) { arg.memory_.readWrite_ = 1; flags_ |= ImageWrite; } else if (expect(metaData, &pos, "WO:")) { arg.memory_.writeOnly_ = 1; flags_ |= ImageWrite; } } else if (arg.type_ == KernelArg::Value) { arg.type_ = DataType[k].type_; } break; } } if (k == DataTypeTotal) { LogWarning("We couldn't find the argument's type."); if ((arg.type_ == KernelArg::Value) || !getword(metaData, &pos, argName)) { LogWarning("Couldn't get a kernel argument!"); return false; } } //! @todo temporary condition if ((arg.type_ == KernelArg::Opaque) || (arg.type_ == KernelArg::Sampler)) { assert(false); continue; } } // Check if we should expect the data size if (ArgState[i].size_) { uint tmpData; // Read the data size if (!getuint(metaData, &pos, &tmpData)) { LogWarning("Couldn't get a kernel argument!"); return false; } if (arg.type_ == KernelArg::Image) { arg.type_ = arg.dataType_; arg.index_ = tmpData; } else { arg.size_ = tmpData; } } if (arg.type_ == KernelArg::Counter) { // Read a counter index if (!getuint(metaData, &pos, &arg.index_)) { LogWarning("Couldn't get a counter index!"); return false; } } // Check if we should expect a resource index if (ArgState[i].cbIdx_) { // Read resource index if (!getuint(metaData, &pos, &arg.cbIdx_)) { LogWarning("Couldn't get a kernel argument!"); return false; } if (arg.isCbNeeded() && (numCb_ < arg.cbIdx_)) { numCb_ = arg.cbIdx_; } } // Check if we should expect the CB offset if (ArgState[i].cbPos_) { // Read position in the constant buffer if (!getuint(metaData, &pos, &arg.cbPos_)) { LogWarning("Couldn't get a kernel argument!"); return false; } } // Check if we should expect the buffer type if (ArgState[i].buf_) { // Read the buffer type if (!getword(metaData, &pos, argName)) { LogWarning("Couldn't get a kernel argument!"); return false; } arg.buf_ = argName; for (uint k = 0; k < BufTypeTotal; ++k) { if (0 == arg.buf_.compare(BufType[k].tagName_)) { // Update the parameter type arg.type_ = BufType[k].type_; // Check if we should expect a buffer index if (BufType[k].number_) { // Read a buffer index if (!getuint(metaData, &pos, &arg.index_)) { LogWarning("Couldn't get a kernel argument!"); return false; } } // Check for the required alignment if (BufType[k].alignment_) { // Read data alignment if (!getuint(metaData, &pos, &arg.alignment_)) { LogWarning("Couldn't get a kernel argument!"); return false; } } // Check for the buffer's attribute if ((mdVersion.value_ >= MetadataBufferAttributes.value_) && BufType[k].attribute_) { if (expect(metaData, &pos, "RO")) { arg.memory_.readOnly_ = 1; } else if (expect(metaData, &pos, "RW")) { arg.memory_.readWrite_ = 1; } else if (expect(metaData, &pos, "WO")) { arg.memory_.writeOnly_ = 1; } } // Check for the type qualifier if ((mdVersion.value_ >= MetadataTypeQualifiers.value_) && BufType[k].attribute_) { uint tmp; pos += 1; if (!getuint(metaData, &pos, &tmp)) { LogWarning("Couldn't get volatile type!"); return false; } if (tmp == 1) { arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_VOLATILE; } if (!getuint(metaData, &pos, &tmp)) { LogWarning("Couldn't get restrict type!"); return false; } if (tmp == 1) { arg.typeQualifier_ |= CL_KERNEL_ARG_TYPE_RESTRICT; } } } } } // Find multiple UAV references switch (arg.type_) { case KernelArg::PointerGlobal: case KernelArg::PointerConst: case KernelArg::PointerLocal: case KernelArg::PointerPrivate: case KernelArg::UavId: uavRefCount[arg.index_]++; break; default: break; } // Check if this argument will be passed in constant buffer if (arg.isCbNeeded() || (arg.type_ == KernelArg::UavId)) { if (arg.type_ == KernelArg::Sampler) { // Serach for the passed by value sampler for (uint i = 0; i < argSize(); ++i) { KernelArg* value = arguments_[i]; if (0 == value->name_.compare(arg.name_)) { value->type_ = arg.type_; value->index_ = arg.index_; value->location_ = 0; break; } } } else { KernelArg* argument = new KernelArg(arg); if (argument != NULL) { addArgument(argument); } else { LogError("Couldn't allocate memory!"); return false; } } } // Check if we have a pre-defined sampler else if (arg.type_ == KernelArg::Sampler) { KernelArg* sampler = new KernelArg(arg); if (sampler != NULL) { addSampler(sampler); } else { LogError("Couldn't allocate memory!"); return false; } } break; } } // Next argument pos = metaData.find(";", pos); } // Find arguments that will require a reallocation for (uint i = 0; i < arguments_.size(); ++i) { KernelArg* arg = arguments_[i]; switch (arg->type_) { case KernelArg::PointerGlobal: case KernelArg::PointerConst: case KernelArg::PointerLocal: case KernelArg::PointerPrivate: // Check if can't use a dedicated UAV, // so realloc memory in the heap arg->memory_.realloc_ = false; arg->memory_.uavBuf_ = true; break; case KernelArg::PointerHwConst: arg->memory_.realloc_ = true; break; case KernelArg::UavId: uavRaw_ = arg->index_; break; default: break; } // If argument marked with the const qualifier, then overwrite // Read-Write attributes, since compiler doesn't mark it properly if (arg->typeQualifier() & CL_KERNEL_ARG_TYPE_CONST) { arg->memory_.readOnly_ = 1; arg->memory_.readWrite_ = 0; arg->memory_.writeOnly_ = 0; } } if ((uavRaw_ != UavIdUndefined) && !(flags() & PrintfOutput)) { // Find if default UAV is already assigned to an argument for (uint i = 0; i < arguments_.size(); ++i) { KernelArg* arg = arguments_[i]; switch (arg->type_) { case KernelArg::PointerGlobal: case KernelArg::PointerConst: case KernelArg::PointerLocal: case KernelArg::PointerPrivate: if (uavRaw_ == arg->index_) { uavRaw_ = UavIdUndefined; } break; default: break; } } } // There is always 1 constant buffer, associated with the kernel numCb_++; assert((numCb_ <= MaxConstBuffersArguments) && "Runtime doesn't support more than max CBs for arguments!"); // Limit workgroup size if requested if ((flags() & LimitWorkgroup) && (GPU_MAX_WORKGROUP_SIZE == 0)) { size_t temp = 1; workGroupInfo_.size_ = workGroupInfo()->wavefrontSize_; for (uint j = 0; j < 3; ++j) { if (workGroupInfo()->compileSize_[j] != 0) { temp *= workGroupInfo_.compileSize_[j]; } } // Report a compilation error if requested compile size doesn't // match the required workgroup size if (workGroupInfo()->size_ < temp) { char str[8]; intToStr(workGroupInfo_.size_, str, 8); buildError_ = CL_OUT_OF_RESOURCES; buildLog_ += "Error: Requested compile size is bigger than the required workgroup size of "; buildLog_ += str; buildLog_ += " elements\n"; LogError(buildLog().c_str()); return false; } } // Read/Write attributes are provided in metadata if (mdVersion.value_ >= MetadataBufferAttributes.value_) { rwAttributes_ = true; } return true; } bool Kernel::validateMemory(uint idx, amd::Memory* amdMem) const { // Check if memory doesn't require reallocation bool noRealloc = (!argument(idx)->memory_.realloc_ || amdMem->reallocedDeviceMemory(&dev())); return noRealloc; } inline static HSAIL_ARG_TYPE GetHSAILArgType(const aclArgData* argInfo) { switch (argInfo->type) { case ARG_TYPE_POINTER: return HSAIL_ARGTYPE_POINTER; case ARG_TYPE_QUEUE: return HSAIL_ARGTYPE_QUEUE; case ARG_TYPE_VALUE: return HSAIL_ARGTYPE_VALUE; case ARG_TYPE_IMAGE: return HSAIL_ARGTYPE_IMAGE; case ARG_TYPE_SAMPLER: return HSAIL_ARGTYPE_SAMPLER; case ARG_TYPE_ERROR: default: return HSAIL_ARGTYPE_ERROR; } } inline static size_t GetHSAILArgAlignment(const aclArgData* argInfo) { switch (argInfo->type) { case ARG_TYPE_POINTER: return argInfo->arg.pointer.align; default: return 1; } } inline static HSAIL_ACCESS_TYPE GetHSAILArgAccessType(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { switch (argInfo->arg.pointer.type) { case ACCESS_TYPE_RO: return HSAIL_ACCESS_TYPE_RO; case ACCESS_TYPE_WO: return HSAIL_ACCESS_TYPE_WO; case ACCESS_TYPE_RW: default: return HSAIL_ACCESS_TYPE_RW; } } return HSAIL_ACCESS_TYPE_NONE; } inline static HSAIL_ADDRESS_QUALIFIER GetHSAILAddrQual(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { switch (argInfo->arg.pointer.memory) { case PTR_MT_CONSTANT_EMU: case PTR_MT_CONSTANT: case PTR_MT_UAV: case PTR_MT_GLOBAL: return HSAIL_ADDRESS_GLOBAL; case PTR_MT_LDS_EMU: case PTR_MT_LDS: return HSAIL_ADDRESS_LOCAL; case PTR_MT_SCRATCH_EMU: return HSAIL_ADDRESS_GLOBAL; case PTR_MT_ERROR: default: LogError("Unsupported address type"); return HSAIL_ADDRESS_ERROR; } } else if ((argInfo->type == ARG_TYPE_IMAGE) || (argInfo->type == ARG_TYPE_SAMPLER)) { return HSAIL_ADDRESS_GLOBAL; } else if (argInfo->type == ARG_TYPE_QUEUE) { return HSAIL_ADDRESS_GLOBAL; } return HSAIL_ADDRESS_ERROR; } /* f16 returns f32 - workaround due to comp lib */ inline static HSAIL_DATA_TYPE GetHSAILDataType(const aclArgData* argInfo) { aclArgDataType dataType; if (argInfo->type == ARG_TYPE_POINTER) { dataType = argInfo->arg.pointer.data; } else if (argInfo->type == ARG_TYPE_VALUE) { dataType = argInfo->arg.value.data; } else { return HSAIL_DATATYPE_ERROR; } switch (dataType) { case DATATYPE_i1: return HSAIL_DATATYPE_B1; case DATATYPE_i8: return HSAIL_DATATYPE_S8; case DATATYPE_i16: return HSAIL_DATATYPE_S16; case DATATYPE_i32: return HSAIL_DATATYPE_S32; case DATATYPE_i64: return HSAIL_DATATYPE_S64; case DATATYPE_u8: return HSAIL_DATATYPE_U8; case DATATYPE_u16: return HSAIL_DATATYPE_U16; case DATATYPE_u32: return HSAIL_DATATYPE_U32; case DATATYPE_u64: return HSAIL_DATATYPE_U64; case DATATYPE_f16: return HSAIL_DATATYPE_F32; case DATATYPE_f32: return HSAIL_DATATYPE_F32; case DATATYPE_f64: return HSAIL_DATATYPE_F64; case DATATYPE_struct: return HSAIL_DATATYPE_STRUCT; case DATATYPE_opaque: return HSAIL_DATATYPE_OPAQUE; case DATATYPE_ERROR: default: return HSAIL_DATATYPE_ERROR; } } inline static int GetHSAILArgSize(const aclArgData *argInfo) { switch (argInfo->type) { case ARG_TYPE_VALUE: switch (GetHSAILDataType(argInfo)) { case HSAIL_DATATYPE_B1: return 1; case HSAIL_DATATYPE_B8: case HSAIL_DATATYPE_S8: case HSAIL_DATATYPE_U8: return 1; case HSAIL_DATATYPE_B16: case HSAIL_DATATYPE_U16: case HSAIL_DATATYPE_S16: case HSAIL_DATATYPE_F16: return 2; case HSAIL_DATATYPE_B32: case HSAIL_DATATYPE_U32: case HSAIL_DATATYPE_S32: case HSAIL_DATATYPE_F32: return 4; case HSAIL_DATATYPE_B64: case HSAIL_DATATYPE_U64: case HSAIL_DATATYPE_S64: case HSAIL_DATATYPE_F64: return 8; case HSAIL_DATATYPE_STRUCT: return argInfo->arg.value.numElements; default: return -1; } case ARG_TYPE_POINTER: case ARG_TYPE_IMAGE: case ARG_TYPE_SAMPLER: case ARG_TYPE_QUEUE: return sizeof(void*); default: return -1; } } inline static clk_value_type_t GetOclType(const aclArgData* argInfo) { static const clk_value_type_t ClkValueMapType[6][6] = { { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 }, { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 }, { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 }, { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 }, { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 }, { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 }, }; uint sizeType; if (argInfo->type == ARG_TYPE_QUEUE) { return T_QUEUE; } if ((argInfo->type == ARG_TYPE_POINTER) || (argInfo->type == ARG_TYPE_IMAGE)) { return T_POINTER; } else if (argInfo->type == ARG_TYPE_VALUE) { switch (argInfo->arg.value.data) { case DATATYPE_i8: case DATATYPE_u8: sizeType = 0; break; case DATATYPE_i16: case DATATYPE_u16: sizeType = 1; break; case DATATYPE_i32: case DATATYPE_u32: sizeType = 2; break; case DATATYPE_i64: case DATATYPE_u64: sizeType = 3; break; case DATATYPE_f16: case DATATYPE_f32: sizeType = 4; break; case DATATYPE_f64: sizeType = 5; break; default: return T_VOID; } switch (argInfo->arg.value.numElements) { case 1: return ClkValueMapType[sizeType][0]; case 2: return ClkValueMapType[sizeType][1]; case 3: return ClkValueMapType[sizeType][2]; case 4: return ClkValueMapType[sizeType][3]; case 8: return ClkValueMapType[sizeType][4]; case 16: return ClkValueMapType[sizeType][5]; default: return T_VOID; } } else if (argInfo->type == ARG_TYPE_SAMPLER) { return T_SAMPLER; } else { return T_VOID; } } inline static cl_kernel_arg_address_qualifier GetOclAddrQual(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_POINTER) { switch (argInfo->arg.pointer.memory) { case PTR_MT_UAV: case PTR_MT_GLOBAL: return CL_KERNEL_ARG_ADDRESS_GLOBAL; case PTR_MT_CONSTANT: case PTR_MT_UAV_CONSTANT: case PTR_MT_CONSTANT_EMU: return CL_KERNEL_ARG_ADDRESS_CONSTANT; case PTR_MT_LDS_EMU: case PTR_MT_LDS: return CL_KERNEL_ARG_ADDRESS_LOCAL; default: return CL_KERNEL_ARG_ADDRESS_PRIVATE; } } else if (argInfo->type == ARG_TYPE_IMAGE) { return CL_KERNEL_ARG_ADDRESS_GLOBAL; } //default for all other cases return CL_KERNEL_ARG_ADDRESS_PRIVATE; } inline static cl_kernel_arg_access_qualifier GetOclAccessQual(const aclArgData* argInfo) { if (argInfo->type == ARG_TYPE_IMAGE) { switch (argInfo->arg.image.type) { case ACCESS_TYPE_RO: return CL_KERNEL_ARG_ACCESS_READ_ONLY; case ACCESS_TYPE_WO: return CL_KERNEL_ARG_ACCESS_WRITE_ONLY; case ACCESS_TYPE_RW: return CL_KERNEL_ARG_ACCESS_READ_WRITE; default: return CL_KERNEL_ARG_ACCESS_NONE; } } return CL_KERNEL_ARG_ACCESS_NONE; } inline static cl_kernel_arg_type_qualifier GetOclTypeQual(const aclArgData* argInfo) { cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE; if (argInfo->type == ARG_TYPE_POINTER) { if (argInfo->arg.pointer.isVolatile) { rv |= CL_KERNEL_ARG_TYPE_VOLATILE; } if (argInfo->arg.pointer.isRestrict) { rv |= CL_KERNEL_ARG_TYPE_RESTRICT; } if (argInfo->arg.pointer.isPipe) { rv |= CL_KERNEL_ARG_TYPE_PIPE; } if (argInfo->isConst) { rv |= CL_KERNEL_ARG_TYPE_CONST; } switch (argInfo->arg.pointer.memory) { case PTR_MT_CONSTANT: case PTR_MT_UAV_CONSTANT: case PTR_MT_CONSTANT_EMU: rv |= CL_KERNEL_ARG_TYPE_CONST; break; default: break; } } return rv; } static int GetOclSize(const aclArgData* argInfo) { switch (argInfo->type) { case ARG_TYPE_POINTER: return sizeof(void *); case ARG_TYPE_VALUE: //! \note OCL 6.1.5. For 3-component vector data types, //! the size of the data type is 4 * sizeof(component). switch (argInfo->arg.value.data) { case DATATYPE_struct: return 1 * argInfo->arg.value.numElements; case DATATYPE_i8: case DATATYPE_u8: return 1 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); case DATATYPE_u16: case DATATYPE_i16: case DATATYPE_f16: return 2 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); case DATATYPE_u32: case DATATYPE_i32: case DATATYPE_f32: return 4 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); case DATATYPE_i64: case DATATYPE_u64: case DATATYPE_f64: return 8 * amd::nextPowerOfTwo(argInfo->arg.value.numElements); case DATATYPE_ERROR: default: return -1; } case ARG_TYPE_IMAGE: return sizeof(cl_mem); case ARG_TYPE_SAMPLER: return sizeof(cl_sampler); case ARG_TYPE_QUEUE: return sizeof(cl_command_queue); default: return -1; } } void HSAILKernel::initArgList(const aclArgData* aclArg) { // Initialize the hsail argument list too initHsailArgs(aclArg); // Iterate through the arguments and insert into parameterList device::Kernel::parameters_t params; amd::KernelParameterDescriptor desc; size_t offset = 0; // Reserved arguments for HSAIL launch aclArg += MaxExtraArgumentsNum; for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { desc.name_ = arguments_[i]->name_.c_str(); desc.type_ = GetOclType(aclArg); desc.addressQualifier_ = GetOclAddrQual(aclArg); desc.accessQualifier_ = GetOclAccessQual(aclArg); desc.typeQualifier_ = GetOclTypeQual(aclArg); desc.typeName_ = arguments_[i]->typeName_.c_str(); // Make a check if it is local or global if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) { desc.size_ = 0; } else { desc.size_ = GetOclSize(aclArg); } // Make offset alignment to match CPU metadata, since // in multidevice config abstraction layer has a single signature // and CPU sends the paramaters as they are allocated in memory size_t size = desc.size_; if (size == 0) { // Local memory for CPU size = sizeof(cl_mem); } offset = amd::alignUp(offset, std::min(size, size_t(16))); desc.offset_ = offset; offset += amd::alignUp(size, sizeof(uint32_t)); params.push_back(desc); if (arguments_[i]->type_ == HSAIL_ARGTYPE_IMAGE) { flags_.imageEna_ = true; if (desc.accessQualifier_ != CL_KERNEL_ARG_ACCESS_READ_ONLY) { flags_.imageWriteEna_ = true; } } } createSignature(params); } void HSAILKernel::initHsailArgs(const aclArgData* aclArg) { int offset = 0; // Reserved arguments for HSAIL launch aclArg += MaxExtraArgumentsNum; // Iterate through the each kernel argument for (; aclArg->struct_size != 0; aclArg++) { Argument* arg = new Argument; // Initialize HSAIL kernel argument arg->name_ = aclArg->argStr; arg->typeName_ = aclArg->typeStr; arg->size_ = GetHSAILArgSize(aclArg); arg->offset_ = offset; arg->type_ = GetHSAILArgType(aclArg); arg->addrQual_ = GetHSAILAddrQual(aclArg); arg->dataType_ = GetHSAILDataType(aclArg); // If vector of args we add additional arguments to flatten it out arg->numElem_ = ((aclArg->type == ARG_TYPE_VALUE) && (aclArg->arg.value.data != DATATYPE_struct)) ? aclArg->arg.value.numElements : 1; arg->alignment_ = GetHSAILArgAlignment(aclArg); arg->access_ = GetHSAILArgAccessType(aclArg); offset += GetHSAILArgSize(aclArg); arguments_.push_back(arg); } } void HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) { PrintfInfo info; uint index = 0; for (; aclPrintf->struct_size != 0; aclPrintf++) { index = aclPrintf->ID; if (printf_.size() <= index) { printf_.resize(index + 1); } std::string pfmt = aclPrintf->fmtStr; info.fmtString_.clear(); size_t pos = 0; bool need_nl = true; for (size_t pos = 0; pos < pfmt.size(); ++pos) { char symbol = pfmt[pos]; need_nl = true; if (symbol == '\\') { // Rest of the C escape sequences (e.g. \') are handled correctly // by the MDParser, we are not sure exactly how! switch (pfmt[pos+1]) { case 'a': pos++; symbol = '\a'; break; case 'b': pos++; symbol = '\b'; break; case 'f': pos++; symbol = '\f'; break; case 'n': pos++; symbol = '\n'; need_nl = false; break; case 'r': pos++; symbol = '\r'; break; case 'v': pos++; symbol = '\v'; break; case '7': if (pfmt[pos+2] == '2') { pos += 2; symbol = '\72'; } break; default: break; } } info.fmtString_.push_back(symbol); } if (need_nl) { info.fmtString_ += "\n"; } uint32_t *tmp_ptr = const_cast(aclPrintf->argSizes); for (uint i = 0; i < aclPrintf->numSizes; i++ , tmp_ptr++) { info.arguments_.push_back(*tmp_ptr); } printf_[index] = info; info.arguments_.clear(); } } HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions, uint extraArgsNum) : device::Kernel(name) , compileOptions_(compileOptions) , dev_(prog->dev()) , prog_(*prog) , index_(0) , code_(NULL) , codeSize_(0) , hwMetaData_(NULL) , extraArgumentsNum_(extraArgsNum) , waveLimiter_(this, (prog->isNull() ? 1 : dev().getAttribs().numberOfCUsperShaderArray) * dev().hwInfo()->simdPerCU_) { hsa_ = true; } HSAILKernel::~HSAILKernel() { while (!arguments_.empty()) { Argument* arg = arguments_.back(); delete arg; arguments_.pop_back(); } delete [] hwMetaData_; delete code_; } bool HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) { if (extraArgumentsNum_ > MaxExtraArgumentsNum) { LogError("Failed to initialize kernel: extra arguments number is bigger than is supported"); return false; } acl_error error = ACL_SUCCESS; std::string openClKernelName = openclMangledName(name()); flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true: false; //compile kernel down to ISA if (finalize) { std::string options(compileOptions_.c_str()); options.append(" -just-kernel="); options.append(openClKernelName.c_str()); // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. if (dev().settings().svmFineGrainSystem_) { options.append(" -sc-xnack-iommu"); } error = aclCompile(dev().hsaCompiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); if (error != ACL_SUCCESS) { LogError("Failed to finalize kernel"); return false; } } aqlCreateHWInfo(sym); // Pull out metadata from the ELF size_t sizeOfArgList; error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), NULL, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; } char* aclArgList = new char[sizeOfArgList]; if (NULL == aclArgList) { return false; } error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; } size_t sizeOfWorkGroupSize; error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), NULL, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } // Copy wavefront size workGroupInfo_.wavefrontSize_ = prog().isNull() ? 64 : dev().getAttribs().wavefrontSize; // Find total workgroup size if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * workGroupInfo_.compileSize_[2]; } else { workGroupInfo_.size_ = dev().info().maxWorkGroupSize_; } // Pull out printf metadata from the ELF size_t sizeOfPrintfList; error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), NULL, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; } // Make sure kernel has any printf info if (0 != sizeOfPrintfList) { char* aclPrintfList = new char[sizeOfPrintfList]; if (NULL == aclPrintfList) { return false; } error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; } // Set the PrintfList initPrintf(reinterpret_cast(aclPrintfList)); delete [] aclPrintfList; } aclMetadata md; md.enqueue_kernel = false; size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue); if (error != ACL_SUCCESS) { return false; } flags_.dynamicParallelism_ = md.enqueue_kernel; md.kernel_index = -1; size_t sizeOfIndex = sizeof(md.kernel_index); error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_KERNEL_INDEX, openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex); if (error != ACL_SUCCESS) { return false; } index_ = md.kernel_index; size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); if (error != ACL_SUCCESS) { return false; } waveLimiter_.enable(dev().settings().ciPlus_); size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, &sizeOfWorkGroupSizeHint); if (error != ACL_SUCCESS) { return false; } size_t sizeOfVecTypeHint; error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, openClKernelName.c_str(), NULL, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; } if (0 != sizeOfVecTypeHint) { char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; if (NULL == VecTypeHint) { return false; } error = aclQueryInfo(dev().hsaCompiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; } VecTypeHint[sizeOfVecTypeHint] = '\0'; workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); delete [] VecTypeHint; } // Set the argList initArgList(reinterpret_cast(aclArgList)); delete [] aclArgList; return true; } bool HSAILKernel::validateMemory(uint idx, amd::Memory* amdMem) const { // Check if memory doesn't require reallocation bool noRealloc = true; //amdMem->reallocedDeviceMemory(&dev())); return noRealloc; } const Device& HSAILKernel::dev() const { return reinterpret_cast(dev_); } const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } void HSAILKernel::findLocalWorkSize( size_t workDim, const amd::NDRange& gblWorkSize, amd::NDRange& lclWorkSize) const { // Initialize the default workgoup info // Check if the kernel has the compiled sizes if (workGroupInfo()->compileSize_[0] == 0) { // Find the default local workgroup size, if it wasn't specified if (lclWorkSize[0] == 0) { size_t thrPerGrp; bool b1DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE); bool b2DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_2D_Y); bool b3DOverrideSet = !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_X) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Y) || !flagIsDefault(GPU_MAX_WORKGROUP_SIZE_3D_Z); bool overrideSet = ((workDim == 1) && b1DOverrideSet) || ((workDim == 2) && b2DOverrideSet) || ((workDim == 3) && b3DOverrideSet); if (!overrideSet) { // Find threads per group thrPerGrp = workGroupInfo()->size_; // Check if kernel uses images if (flags_.imageEna_ && // and thread group is a multiple value of wavefronts ((thrPerGrp % workGroupInfo()->wavefrontSize_) == 0) && // and it's 2 or 3-dimensional workload (workDim > 1) && ((dev().settings().partialDispatch_) || (((gblWorkSize[0] % 16) == 0) && ((gblWorkSize[1] % 16) == 0)))) { // Use 8x8 workgroup size if kernel has image writes if (flags_.imageWriteEna_ || (thrPerGrp != dev().info().maxWorkGroupSize_)) { lclWorkSize[0] = 8; lclWorkSize[1] = 8; } else { lclWorkSize[0] = 16; lclWorkSize[1] = 16; } if (workDim == 3) { lclWorkSize[2] = 1; } } else { size_t tmp = thrPerGrp; // Split the local workgroup into the most efficient way for (uint d = 0; d < workDim; ++d) { size_t div = tmp; for (; (gblWorkSize[d] % div) != 0; div--); lclWorkSize[d] = div; tmp /= div; } // Check if partial dispatch is enabled and if (dev().settings().partialDispatch_ && // we couldn't find optimal workload (lclWorkSize.product() % workGroupInfo()->wavefrontSize_) != 0) { size_t maxSize = 0; size_t maxDim = 0; for (uint d = 0; d < workDim; ++d) { if (maxSize < gblWorkSize[d]) { maxSize = gblWorkSize[d]; maxDim = d; } } // Check if a local workgroup has the most optimal size if (thrPerGrp > maxSize) { thrPerGrp = maxSize; } lclWorkSize[maxDim] = thrPerGrp; for (uint d = 0; d < workDim; ++d) { if (d != maxDim) { lclWorkSize[d] = 1; } } } } } else { // Use overrides when app doesn't provide workgroup dimensions if (workDim == 1) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE; } else if (workDim == 2) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_2D_X; lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_2D_Y; } else if (workDim == 3) { lclWorkSize[0] = GPU_MAX_WORKGROUP_SIZE_3D_X; lclWorkSize[1] = GPU_MAX_WORKGROUP_SIZE_3D_Y; lclWorkSize[2] = GPU_MAX_WORKGROUP_SIZE_3D_Z; } else { assert(0 && "Invalid workDim!"); } } } } else { for (uint d = 0; d < workDim; ++d) { lclWorkSize[d] = workGroupInfo()->compileSize_[d]; } } } inline static void WriteAqlArg( unsigned char** dst,//!< The write pointer to the buffer const void* src, //!< The source pointer uint size, //!< The size in bytes to copy uint alignment = 0 //!< The alignment to follow while writing to the buffer ) { if (alignment == 0) { *dst = amd::alignUp(*dst, size); } else { *dst = amd::alignUp(*dst, alignment); } memcpy(*dst, src, size); *dst += size; } const uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, const_address parameters, bool nativeMem, uint64_t vmDefQueue, uint64_t* vmParentWrap, std::vector& memList) const { static const bool WaitOnBusyEngine = true; uint64_t ldsAddress = ldsSize(); address aqlArgBuf = gpu.cb(0)->sysMemCopy(); address aqlStruct = gpu.cb(1)->sysMemCopy(); bool srdResource = false; if (extraArgumentsNum_ > 0) { assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 }; // The HLC generates up to 3 additional arguments for the global offsets for (uint i = 0; i < sizes.dimensions(); ++i) { extraArgs[i] = sizes.offset()[i]; } // Check if the kernel may have printf output if ((printfInfo().size() > 0) && // and printf buffer was allocated (gpu.printfDbgHSA().dbgBuffer() != NULL)) { // and set the fourth argument as the printf_buffer pointer extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); memList.push_back(gpu.printfDbgHSA().dbgBuffer()); } if (dynamicParallelism()) { // Provide the host parent AQL wrap object to the kernel AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); memset(wrap, 0, sizeof(AmdAqlWrap)); wrap->state = AQL_WRAP_BUSY; ConstBuffer* cb = gpu.constBufs_[1]; cb->uploadDataToHw(sizeof(AmdAqlWrap)); *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); // and set 5th & 6th arguments extraArgs[4] = vmDefQueue; extraArgs[5] = *vmParentWrap; memList.push_back(cb); } WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t)); } const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& kernelParams = kernel.parameters(); // Find all parameters for the current kernel for (uint i = 0; i != signature.numParameters(); ++i) { const HSAILKernel::Argument* arg = argument(i); const amd::KernelParameterDescriptor& desc = signature.at(i); const_address paramaddr = parameters + desc.offset_; switch (arg->type_) { case HSAIL_ARGTYPE_POINTER: // If it is a global pointer if (arg->addrQual_ == HSAIL_ADDRESS_GLOBAL) { Memory* gpuMem = NULL; amd::Memory* mem = NULL; if (kernelParams.boundToSvmPointer(dev(), parameters, i)) { WriteAqlArg(&aqlArgBuf, paramaddr, sizeof(paramaddr)); mem = amd::SvmManager::FindSvmBuffer(*reinterpret_cast(paramaddr)); if (mem != NULL) { gpuMem = dev().getGpuMemory(mem); gpuMem->wait(gpu, WaitOnBusyEngine); if ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0) { mem->signalWrite(&dev()); } memList.push_back(gpuMem); } // If finegrainsystem is present then the pointer can be malloced by the app and // passed to kernel directly. If so copy the pointer location to aqlArgBuf else if (!dev().isFineGrainedSystem(true)) { return NULL; } break; } if (nativeMem) { gpuMem = *reinterpret_cast(paramaddr); if (NULL != gpuMem) { mem = gpuMem->owner(); } } else { mem = *reinterpret_cast(paramaddr); if (mem != NULL) { gpuMem = dev().getGpuMemory(mem); } } if (gpuMem == NULL) { WriteAqlArg(&aqlArgBuf, &gpuMem, sizeof(void*)); break; } //! @todo 64 bit isn't supported with 32 bit binary uint64_t globalAddress = gpuMem->vmAddress() + gpuMem->pinOffset(); WriteAqlArg(&aqlArgBuf, &globalAddress, sizeof(void*)); // Wait for resource if it was used on an inactive engine //! \note syncCache may call DRM transfer gpuMem->wait(gpu, WaitOnBusyEngine); //! @todo Compiler has to return read/write attributes if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { mem->signalWrite(&dev()); } memList.push_back(gpuMem); // save the memory object pointer to allow global memory access if (NULL != dev().hwDebugMgr()) { dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); } } // If it is a local pointer else { assert((arg->addrQual_ == HSAIL_ADDRESS_LOCAL) && "Unsupported address type"); ldsAddress = amd::alignUp(ldsAddress, arg->alignment_); WriteAqlArg(&aqlArgBuf, &ldsAddress, sizeof(size_t)); ldsAddress += *reinterpret_cast(paramaddr); } break; case HSAIL_ARGTYPE_VALUE: // Special case for structrues if (arg->dataType_ == HSAIL_DATATYPE_STRUCT) { // Copy the current structre into CB1 memcpy(aqlStruct, paramaddr, arg->size_); ConstBuffer* cb = gpu.constBufs_[1]; cb->uploadDataToHw(arg->size_); // Then use a pointer in aqlArgBuffer to CB1 uint64_t gpuPtr = cb->vmAddress() + cb->wrtOffset(); WriteAqlArg(&aqlArgBuf, &gpuPtr, sizeof(void*)); memList.push_back(cb); } else { WriteAqlArg(&aqlArgBuf, paramaddr, arg->numElem_ * arg->size_, arg->size_); } break; case HSAIL_ARGTYPE_IMAGE: { Image* image = NULL; amd::Memory* mem = NULL; if (nativeMem) { image = static_cast(*reinterpret_cast(paramaddr)); } else { mem = *reinterpret_cast(paramaddr); if (mem == NULL) { LogError( "The kernel image argument isn't an image object!"); return nullptr; } image = static_cast(dev().getGpuMemory(mem)); } // Wait for resource if it was used on an inactive engine //! \note syncCache may call DRM transfer image->wait(gpu, WaitOnBusyEngine); //! \note Special case for the image views. //! Copy SRD to CB1, so blit manager will be able to release //! this view without a wait for SRD resource. if (image->memoryType() == Resource::ImageView) { // Copy the current structre into CB1 memcpy(aqlStruct, image->hwState(), HsaImageObjectSize); ConstBuffer* cb = gpu.constBufs_[1]; cb->uploadDataToHw(HsaImageObjectSize); // Then use a pointer in aqlArgBuffer to CB1 uint64_t srd = cb->vmAddress() + cb->wrtOffset(); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); memList.push_back(cb); } else { uint64_t srd = image->hwSrd(); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); srdResource = true; } //! @todo Compiler has to return read/write attributes if ((NULL != mem) && ((mem->getMemFlags() & CL_MEM_READ_ONLY) == 0)) { mem->signalWrite(&dev()); } memList.push_back(image); break; } case HSAIL_ARGTYPE_SAMPLER: { const amd::Sampler* sampler = *reinterpret_cast(paramaddr); const Sampler* gpuSampler = static_cast (sampler->getDeviceSampler(dev())); uint64_t srd = gpuSampler->hwSrd(); WriteAqlArg(&aqlArgBuf, &srd, sizeof(srd)); srdResource = true; break; } case HSAIL_ARGTYPE_QUEUE: { const amd::DeviceQueue* queue = *reinterpret_cast(paramaddr); VirtualGPU* gpuQueue = static_cast(queue->vDev()); uint64_t vmQueue; if (dev().settings().useDeviceQueue_) { vmQueue = gpuQueue->vQueue()->vmAddress(); } else { if (!gpu.createVirtualQueue(queue->size())) { LogError("Virtual queue creation failed!"); return nullptr; } vmQueue = gpu.vQueue()->vmAddress(); } WriteAqlArg(&aqlArgBuf, &vmQueue, sizeof(void*)); break; } default: LogError(" Unsupported address type "); return NULL; } } if (ldsAddress > dev().info().localMemSize_) { LogError("No local memory available\n"); return NULL; } // HSAIL kernarg segment size is rounded up to multiple of 16. aqlArgBuf = amd::alignUp(aqlArgBuf, 16); assert((aqlArgBuf == (gpu.cb(0)->sysMemCopy() + argsBufferSize())) && "Size and the number of arguments don't match!"); hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast(aqlArgBuf); amd::NDRange local(sizes.local()); const amd::NDRange& global = sizes.global(); // Check if runtime has to find local workgroup size findLocalWorkSize(sizes.dimensions(), sizes.global(), local); hsaDisp->header = kDispatchPacketHeader; hsaDisp->setup = sizes.dimensions(); hsaDisp->workgroup_size_x = local[0]; hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; hsaDisp->grid_size_x = global[0]; hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; hsaDisp->reserved2 = 0; // Initialize kernel ISA and execution buffer requirements hsaDisp->private_segment_size = spillSegSize(); hsaDisp->group_segment_size = ldsAddress - ldsSize(); hsaDisp->kernel_object = gpuAqlCode()->vmAddress(); ConstBuffer* cb = gpu.constBufs_[0]; cb->uploadDataToHw(argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t)); uint64_t argList = cb->vmAddress() + cb->wrtOffset(); hsaDisp->kernarg_address = reinterpret_cast(argList); hsaDisp->reserved2 = 0; hsaDisp->completion_signal.handle = 0; memList.push_back(cb); memList.push_back(gpuAqlCode()); for (gpu::Memory * mem : prog().globalStores()) { memList.push_back(mem); } if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { memList.push_back(gpu.hsaQueueMem()); } if (srdResource || prog().isStaticSampler()) { dev().srds().fillResourceList(memList); } return hsaDisp; } } // namespace gpu