From 81b331f4c59a53ab0db4e1eb2c842acf3799df4f Mon Sep 17 00:00:00 2001 From: foreman Date: Wed, 5 Aug 2015 06:18:33 -0400 Subject: [PATCH] P4 to Git Change 1177220 by emankov@em-hsa-amd on 2015/08/05 06:08:39 ECR #333753 - ORCA RT/Compiler Lib/aoc2: AMD HSA Code Object Import feature (part II) - arbitrary hidden (extra) kernargs support Only HSAIL path is affected. It doesn't affect blit kernels. To use offline by aoc2: aoc2 -hsacodeobject=<file> -numhiddenkernargs=<num> -cl-std=CL2.0 -march=hsail(-64) -mdevice=Bonaire To use online by setting env: AMD_DEBUG_HSA_NUM_HIDDEN_KERNARGS=<num> where num >= 0. If num == 0, then no additional arguments will be added on RT for every kernel. The default value is unchanged and equal to 6 for now. Misc: + get rid of PRE & POST defines in Compiler Lib, as they started to conflict with ugl\gl\gs\hwl\ headers with the same defines. + minor copy/paste eliminations & typo fixes + ocltst complib tests update Testing: pre check-in, manually based on ocl sdk MatrixMultiplication Reviewers: Brian Sumner, German Andryeyev, Nikolay Haustov, Artem Tamazov Affected files ... ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/v0_8/if_acl.cpp#72 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/hsail_be.cpp#49 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/metadata.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclDefs.h#5 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclEnums.h#19 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclStructs.h#17 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/bif_section_labels.hpp#21 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.cpp#10 edit ... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.h#20 edit ... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/aoc2.cpp#74 edit ... 
//depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#181 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#249 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#291 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#113 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#199 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#369 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaprogram.cpp#38 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsakernel.cpp#8 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsakernel.hpp#5 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsaprogram.cpp#19 edit ... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#43 edit ... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/complib/CLAssumptionCheck.cpp#43 edit ... 
//depot/stg/opencl/drivers/opencl/tests/ocltst/module/complib/CLEnumCheck.cpp#44 edit --- .../lib/backends/common/v0_8/if_acl.cpp | 11 +++ rocclr/compiler/lib/include/v0_8/aclDefs.h | 6 ++ rocclr/compiler/lib/include/v0_8/aclEnums.h | 47 +++++----- rocclr/compiler/lib/include/v0_8/aclStructs.h | 1 + .../compiler/lib/utils/bif_section_labels.hpp | 6 +- rocclr/compiler/lib/utils/v0_8/libUtils.cpp | 4 +- rocclr/compiler/lib/utils/v0_8/libUtils.h | 92 +++++++++++++++++-- rocclr/runtime/device/device.cpp | 12 ++- rocclr/runtime/device/device.hpp | 2 +- rocclr/runtime/device/gpu/gpukernel.cpp | 79 ++++++++-------- rocclr/runtime/device/gpu/gpukernel.hpp | 12 ++- rocclr/runtime/device/gpu/gpuprogram.cpp | 25 +++-- rocclr/runtime/device/gpu/gpuvirtual.cpp | 4 +- rocclr/runtime/device/hsa/hsaprogram.cpp | 6 +- 14 files changed, 214 insertions(+), 93 deletions(-) diff --git a/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp b/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp index a488b99f62..f3e8d7196b 100644 --- a/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp +++ b/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp @@ -2641,6 +2641,17 @@ if_aclQueryInfo(aclCompiler *cl, } break; } + case RT_NUM_KERNEL_HIDDEN_ARGS: { + size_t hidden_kernargs_size = sizeof(md->numHiddenKernelArgs); + if (!ptr) { + *size = hidden_kernargs_size; + success = true; + } else if (*size >= hidden_kernargs_size) { + memcpy(ptr, &md->numHiddenKernelArgs, hidden_kernargs_size); + success = true; + } + break; + } } return (success) ? 
ACL_SUCCESS : ACL_ERROR; } diff --git a/rocclr/compiler/lib/include/v0_8/aclDefs.h b/rocclr/compiler/lib/include/v0_8/aclDefs.h index 2d21ea83d1..10da184217 100644 --- a/rocclr/compiler/lib/include/v0_8/aclDefs.h +++ b/rocclr/compiler/lib/include/v0_8/aclDefs.h @@ -28,4 +28,10 @@ #define BIF_API_3_0 #endif +#ifndef MAX_HIDDEN_KERNARGS_NUM +#define MAX_HIDDEN_KERNARGS_NUM 6 +#else +#error "MAX_HIDDEN_KERNARGS_NUM is already defined" +#endif + #endif // _ACL_DEFS_0_8_H_ diff --git a/rocclr/compiler/lib/include/v0_8/aclEnums.h b/rocclr/compiler/lib/include/v0_8/aclEnums.h index 513acc4e86..aa51dbda16 100644 --- a/rocclr/compiler/lib/include/v0_8/aclEnums.h +++ b/rocclr/compiler/lib/include/v0_8/aclEnums.h @@ -188,29 +188,30 @@ typedef enum _bif_sections_enum_0_8 { //! An enumeration that defines what are valid queries for aclQueryInfo. typedef enum _rt_query_types_enum_0_8 { - RT_ABI_VERSION = 0, - RT_DEVICE_NAME = 1, - RT_MEM_SIZES = 2, - RT_GPU_FUNC_CAPS = 3, - RT_GPU_FUNC_ID = 4, - RT_GPU_DEFAULT_ID = 5, - RT_WORK_GROUP_SIZE = 6, - RT_WORK_REGION_SIZE = 7, - RT_ARGUMENT_ARRAY = 8, - RT_GPU_PRINTF_ARRAY = 9, - RT_CPU_BARRIER_NAMES = 10, - RT_DEVICE_ENQUEUE = 11, - RT_KERNEL_INDEX = 12, - RT_KERNEL_NAME = 13, - RT_KERNEL_NAMES = 14, - RT_CONTAINS_LLVMIR = 15, - RT_CONTAINS_OPTIONS = 16, - RT_CONTAINS_BRIG = 17, - RT_CONTAINS_HSAIL = 18, - RT_CONTAINS_ISA = 19, - RT_CONTAINS_LOADER_MAP = 20, - RT_CONTAINS_SPIR = 21, - RT_LAST_TYPE = 22 + RT_ABI_VERSION = 0, + RT_DEVICE_NAME = 1, + RT_MEM_SIZES = 2, + RT_GPU_FUNC_CAPS = 3, + RT_GPU_FUNC_ID = 4, + RT_GPU_DEFAULT_ID = 5, + RT_WORK_GROUP_SIZE = 6, + RT_WORK_REGION_SIZE = 7, + RT_ARGUMENT_ARRAY = 8, + RT_GPU_PRINTF_ARRAY = 9, + RT_CPU_BARRIER_NAMES = 10, + RT_DEVICE_ENQUEUE = 11, + RT_KERNEL_INDEX = 12, + RT_KERNEL_NAME = 13, + RT_KERNEL_NAMES = 14, + RT_CONTAINS_LLVMIR = 15, + RT_CONTAINS_OPTIONS = 16, + RT_CONTAINS_BRIG = 17, + RT_CONTAINS_HSAIL = 18, + RT_CONTAINS_ISA = 19, + RT_CONTAINS_LOADER_MAP = 20, + 
RT_CONTAINS_SPIR = 21, + RT_NUM_KERNEL_HIDDEN_ARGS = 22, + RT_LAST_TYPE = 23 } aclQueryType_0_8; //! An enumeration for the various GPU capabilities diff --git a/rocclr/compiler/lib/include/v0_8/aclStructs.h b/rocclr/compiler/lib/include/v0_8/aclStructs.h index 8586955aec..49bbd8e68d 100644 --- a/rocclr/compiler/lib/include/v0_8/aclStructs.h +++ b/rocclr/compiler/lib/include/v0_8/aclStructs.h @@ -119,6 +119,7 @@ typedef struct _acl_metadata_0_8 { const char *deviceName; // RT_DEVICE_NAME bool enqueue_kernel; // RT_DEVICE_ENQUEUE uint32_t kernel_index; // RT_KERNEL_INDEX + uint32_t numHiddenKernelArgs; // RT_NUM_KERNEL_HIDDEN_ARGS } aclMetadata_0_8; //! An structure that holds information on the capabilities of the bif device. diff --git a/rocclr/compiler/lib/utils/bif_section_labels.hpp b/rocclr/compiler/lib/utils/bif_section_labels.hpp index 1af28f5f30..7106cff7f0 100644 --- a/rocclr/compiler/lib/utils/bif_section_labels.hpp +++ b/rocclr/compiler/lib/utils/bif_section_labels.hpp @@ -6,8 +6,10 @@ #ifdef __cplusplus extern "C" { #endif -#define PRE 0 -#define POST 1 +namespace bif { +const unsigned PRE = 0; +const unsigned POST = 1; +} typedef enum { symOpenclCompilerOptions, diff --git a/rocclr/compiler/lib/utils/v0_8/libUtils.cpp b/rocclr/compiler/lib/utils/v0_8/libUtils.cpp index 0342406dbd..71797a2962 100644 --- a/rocclr/compiler/lib/utils/v0_8/libUtils.cpp +++ b/rocclr/compiler/lib/utils/v0_8/libUtils.cpp @@ -1,8 +1,7 @@ // // Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved. 
// -#include "acl.h" -#include "aclTypes.h" + #include "api/v0_8/aclValidation.h" #include "libUtils.h" #include "bif/bifbase.hpp" @@ -10,7 +9,6 @@ #include "utils/versions.hpp" #include "utils/options.hpp" #include "backends/gpu/scwrapper/devState.h" -#include #include #include "bif/bif.hpp" extern aclBinary* constructBinary(size_t struct_version, diff --git a/rocclr/compiler/lib/utils/v0_8/libUtils.h b/rocclr/compiler/lib/utils/v0_8/libUtils.h index b49de55d9a..878479f302 100644 --- a/rocclr/compiler/lib/utils/v0_8/libUtils.h +++ b/rocclr/compiler/lib/utils/v0_8/libUtils.h @@ -3,12 +3,16 @@ // #ifndef _CL_LIB_UTILS_0_8_H_ #define _CL_LIB_UTILS_0_8_H_ -#include "v0_8/aclTypes.h" +#include "acl.h" #include #include #include #include +#include #include "library.hpp" +#include "utils/bif_section_labels.hpp" +using namespace bif; + // Utility function to set a flag in option structure // of the aclDevCaps. void @@ -150,6 +154,83 @@ aclutAlloc(const aclCompilerOptions *bin); FreeFunc aclutFree(const aclCompilerOptions *bin); +inline std::vector<std::string> splitSpaceSeparatedString(char *str) +{ + std::string s(str); + std::stringstream ss(s); + std::istream_iterator<std::string> beg(ss), end; + std::vector<std::string> vec(beg, end); + return vec; +} + +// Helper function that returns OpenCL mangled kernel name. +inline std::string +aclutOpenclMangledKernelName(const std::string& kernel_name) +{ + const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclKernel); + assert(sym && "symbol not found"); + return std::string("&") + sym->str[PRE] + kernel_name + sym->str[POST]; +} + +// Helper function that returns OpenCL mangled kernel metadata symbol name. 
+inline std::string +aclutOpenclMangledKernelMetadataName(const std::string& kernel_name) +{ + const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclMeta); + assert(sym && "symbol not found"); + return sym->str[PRE] + aclutOpenclMangledKernelName(kernel_name) + sym->str[POST]; +} + +#ifdef WITH_TARGET_HSAIL +// Helper function that updates metadata for all the kernels in binary; +// the updated attribute is the number of hidden kernel arguments. +inline acl_error +aclutUpdateMetadataWithHiddenKernargsNum(aclCompiler* cl, aclBinary* bin, uint32_t num) { + if (num == MAX_HIDDEN_KERNARGS_NUM) { + return ACL_SUCCESS; + } + const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclMeta); + assert(sym && "symbol not found"); + aclSections secID = sym->sections[0]; + size_t kernelNamesSize = 0; + acl_error error_code = aclQueryInfo(cl, bin, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); + if (error_code != ACL_SUCCESS) { + return error_code; + } + char* kernelNames = new char[kernelNamesSize]; + error_code = aclQueryInfo(cl, bin, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); + if (error_code != ACL_SUCCESS) { + delete[] kernelNames; + return error_code; + } + std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames); + delete[] kernelNames; + size_t roSize = 0; + for (auto it = vKernels.begin(); it != vKernels.end(); ++it) { + std::string symbol = aclutOpenclMangledKernelMetadataName(*it); + void* roSec = const_cast<void*>(aclExtractSymbol(cl, bin, &roSize, secID, symbol.c_str(), &error_code)); + if (error_code != ACL_SUCCESS) { + return error_code; + } + if (!roSec || roSize == 0) { + error_code = ACL_ELF_ERROR; + return error_code; + } + aclMetadata *md = reinterpret_cast<aclMetadata*>(roSec); + md->numHiddenKernelArgs = num; + error_code = aclRemoveSymbol(cl, bin, secID, symbol.c_str()); + if (error_code != ACL_SUCCESS) { + return error_code; + } + error_code = aclInsertSymbol(cl, bin, md, roSize, secID, symbol.c_str()); + if (error_code != ACL_SUCCESS) { + return 
error_code; + } + } + return error_code; +} +#endif + inline bool is64BitTarget(const aclTargetInfo& target) { return (target.arch_id == aclX64 || @@ -184,15 +265,6 @@ enum scId { SC_LAST, }; -inline std::vector<std::string> splitSpaceSeparatedString(char *str) -{ - std::string s(str); - std::stringstream ss(s); - std::istream_iterator<std::string> beg(ss), end; - std::vector<std::string> vec(beg, end); - return vec; -} - // Helper function that allocates an aligned memory. inline void* alignedMalloc(size_t size, size_t alignment) diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp index f69726e36c..496c7ad755 100644 --- a/rocclr/runtime/device/device.cpp +++ b/rocclr/runtime/device/device.cpp @@ -604,6 +604,14 @@ Kernel::~Kernel() delete signature_; } +std::string +Kernel::openclMangledName(const std::string& name) +{ + const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); + assert(bifSym && "symbol not found"); + return std::string("&") + bifSym->str[bif::PRE] + name + bifSym->str[bif::POST]; +} + void Memory::saveMapInfo( const amd::Coord3D origin, @@ -1246,7 +1254,7 @@ ClBinary::getBIFSymbol(unsigned int symbolID) const const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF20, nSymbols, symID); assert(symb && "BIF20 symbol with symbolID not found"); if (symb) { - return std::string(symb->str[PRE]) + std::string(symb->str[POST]); + return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); } break; } @@ -1255,7 +1263,7 @@ ClBinary::getBIFSymbol(unsigned int symbolID) const const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF30, nSymbols, symID); assert(symb && "BIF30 symbol with symbolID not found"); if (symb) { - return std::string(symb->str[PRE]) + std::string(symb->str[POST]); + return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]); } break; } diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp index 6bff20f27a..4adce37016 100644 --- a/rocclr/runtime/device/device.hpp +++ 
b/rocclr/runtime/device/device.hpp @@ -940,7 +940,7 @@ public: //! Return the build log const std::string& buildLog() const { return buildLog_; } - static std::string openclMangledName(const std::string& name) { return "&__OpenCL_" + name + "_kernel"; } + static std::string openclMangledName(const std::string& name); protected: std::string name_; //!< kernel name diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index e705ce33b4..2e9c8446ad 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -9,7 +9,6 @@ #include "device/gpu/gpusched.hpp" #include "platform/commandqueue.hpp" #include "utils/options.hpp" -#include "utils/bif_section_labels.hpp" #include "acl.h" #include "SCShadersR678XXCommon.h" @@ -3432,7 +3431,7 @@ HSAILKernel::initArgList(const aclArgData* aclArg) size_t offset = 0; // Reserved arguments for HSAIL launch - aclArg += ExtraArguments; + aclArg += MaxExtraArgumentsNum; for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) { desc.name_ = arguments_[i]->name_.c_str(); desc.type_ = GetOclType(aclArg); @@ -3479,7 +3478,7 @@ HSAILKernel::initHsailArgs(const aclArgData* aclArg) int offset = 0; // Reserved arguments for HSAIL launch - aclArg += ExtraArguments; + aclArg += MaxExtraArgumentsNum; // Iterate through the each kernel argument for (; aclArg->struct_size != 0; aclArg++) { @@ -3569,7 +3568,8 @@ HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf) HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, - std::string compileOptions) + std::string compileOptions, + uint extraArgsNum) : device::Kernel(name) , compileOptions_(compileOptions) , dev_(prog->dev()) @@ -3578,6 +3578,7 @@ HSAILKernel::HSAILKernel(std::string name, , code_(NULL) , codeSize_(0) , hwMetaData_(NULL) + , extraArgumentsNum_(extraArgsNum) { hsa_ = true; } @@ -3598,14 +3599,16 @@ HSAILKernel::~HSAILKernel() bool HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) { - 
acl_error error; - const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); - assert(bifSym && "symbol not found"); - std::string openClKernelName(std::string("&") + bifSym->str[PRE] + name() + bifSym->str[POST]); + if (extraArgumentsNum_ > MaxExtraArgumentsNum) { + LogError("Failed to initialize kernel: extra arguments number is bigger than is supported"); + return false; + } + acl_error error = ACL_SUCCESS; + std::string openClKernelName = openclMangledName(name()); //compile kernel down to ISA if (finalize) { std::string options(compileOptions_.c_str()); - flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != + flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true: false; options.append(" -just-kernel="); options.append(openClKernelName.c_str()); @@ -3618,7 +3621,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); if (error != ACL_SUCCESS) { - LogError("Failed to finalize"); + LogError("Failed to finalize kernel"); return false; } } @@ -3900,35 +3903,37 @@ HSAILKernel::loadArguments( address aqlStruct = gpu.cb(1)->sysMemCopy(); bool srdResource = false; - // The HLC generates 3 additional arguments for the global offsets - //and fourth argument is the printf_buffer pointer - size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 }; - for (uint i = 0; i < sizes.dimensions(); ++i) { - offsetSize[i] = sizes.offset()[i]; + if (extraArgumentsNum_ > 0) { + assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly"); + size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 }; + // The HLC generates up to 3 additional arguments for the global offsets + for (uint i = 0; i < sizes.dimensions(); ++i) { + extraArgs[i] = sizes.offset()[i]; + } + // Check if the kernel may have printf output + if 
((printfInfo().size() > 0) && + // and printf buffer was allocated + (gpu.printfDbgHSA().dbgBuffer() != NULL)) { + // and set the fourth argument as the printf_buffer pointer + extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); + memList.push_back(gpu.printfDbgHSA().dbgBuffer()); + } + if (dynamicParallelism()) { + // Provide the host parent AQL wrap object to the kernel + AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); + memset(wrap, 0, sizeof(AmdAqlWrap)); + wrap->state = AQL_WRAP_BUSY; + ConstBuffer* cb = gpu.constBufs_[1]; + cb->uploadDataToHw(sizeof(AmdAqlWrap)); + *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); + // and set 5th & 6th arguments + extraArgs[4] = vmDefQueue; + extraArgs[5] = *vmParentWrap; + memList.push_back(cb); + } + WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t)); } - if (dynamicParallelism()) { - // Provide the host parent AQL wrap object to the kernel - AmdAqlWrap* wrap = reinterpret_cast(aqlStruct); - memset(wrap, 0, sizeof(AmdAqlWrap)); - wrap->state = AQL_WRAP_BUSY; - ConstBuffer* cb = gpu.constBufs_[1]; - cb->uploadDataToHw(sizeof(AmdAqlWrap)); - *vmParentWrap = cb->vmAddress() + cb->wrtOffset(); - offsetSize[4] = vmDefQueue; - offsetSize[5] = *vmParentWrap; - memList.push_back(cb); - } - - // Check if the kernel may have printf output - if ((printfInfo().size() > 0) && - // and printf buffer was allocated - (gpu.printfDbgHSA().dbgBuffer() != NULL)) { - offsetSize[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); - memList.push_back(gpu.printfDbgHSA().dbgBuffer()); - } - WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t)); - const amd::KernelSignature& signature = kernel.signature(); const amd::KernelParameters& kernelParams = kernel.parameters(); diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp index 870f0313de..2c008c2087 100644 --- a/rocclr/runtime/device/gpu/gpukernel.hpp +++ 
b/rocclr/runtime/device/gpu/gpukernel.hpp @@ -845,12 +845,13 @@ public: uint numElem_; //!< Number of elements }; - // Global offsets located in the first 3 elements - static const uint ExtraArguments = 6; + // Max number of possible extra (hidden) kernel arguments + static const uint MaxExtraArgumentsNum = 6; HSAILKernel(std::string name, HSAILProgram* prog, - std::string compileOptions); + std::string compileOptions, + uint extraArgsNum); virtual ~HSAILKernel(); @@ -928,6 +929,9 @@ public: //! Returns the kernel index in the program uint index() const { return index_; } + //! Returns kernel's extra argument count + uint extraArgumentsNum() const { return extraArgumentsNum_; } + private: //! Disable copy constructor HSAILKernel(const HSAILKernel&); @@ -966,6 +970,8 @@ private: char* hwMetaData_; //!< SI metadata + uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments + union Flags { struct { uint imageEna_: 1; //!< Kernel uses images diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index 84aff7f8cd..6f7996508e 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -1974,7 +1974,7 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) { break; const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions); assert(symbol && "symbol not found"); - std::string symName = std::string(symbol->str[PRE]) + std::string(symbol->str[POST]); + std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]); size_t symSize = 0; const void *opts = aclExtractSymbol(dev().hsaCompiler(), binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode); @@ -2095,21 +2095,21 @@ HSAILProgram::linkImpl(amd::option::Options* options) } hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL); if (status != HSA_STATUS_SUCCESS) { - buildLog_ += "Error while HSA Loader phase: loading HSA 
Code Object \n"; + buildLog_ += "Error while HSA Loader phase: loading HSA Code Object\n"; return false; } } size_t kernelNamesSize = 0; errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n"; + buildLog_ += "Error while Finalization phase: Kernel names size querying from the ELF failed\n"; return false; } if (!isNull() && kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { - buildLog_ += "Error while Finalization phase: kernel's Metadata is corrupted in the ELF\n"; + buildLog_ += "Error while Finalization phase: Kernel names querying from the ELF failed\n"; delete kernelNames; return false; } @@ -2117,11 +2117,22 @@ HSAILProgram::linkImpl(amd::option::Options* options) delete kernelNames; std::vector::iterator it = vKernels.begin(); bool dynamicParallelism = false; + aclMetadata md; + md.numHiddenKernelArgs = 0; + size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs); for (it; it != vKernels.end(); ++it) { - std::string kernelName = *it; - HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions()); + std::string kernelName(*it); + std::string openclKernelName = Kernel::openclMangledName(kernelName); + errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS, + openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs); + if (errorCode != ACL_SUCCESS) { + buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n"; + return false; + } + HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(), + md.numHiddenKernelArgs); kernels()[kernelName] = aKernel; - 
amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", Kernel::openclMangledName(kernelName).c_str(), agent, 0); + amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0); if (!sym) { LogError("Failed to get kernel ISA code"); return false; diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index 3ee22c2da4..399a5fa462 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1872,9 +1872,9 @@ VirtualGPU::submitKernelInternalHSA( gpuDefQueue->virtualQueue_->vmAddress(); address argum = gpuDefQueue->virtualQueue_->data() + offsArg; print << "Kernel: " << child->name() << "\n"; - static const char* Names[HSAILKernel::ExtraArguments] = { + static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = { "Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "}; - for (j = 0; j < HSAILKernel::ExtraArguments; ++j) { + for (j = 0; j < child->extraArgumentsNum(); ++j) { print << "\t" << Names[j] << *(size_t*)argum; print << "\n"; argum += sizeof(size_t); diff --git a/rocclr/runtime/device/hsa/hsaprogram.cpp b/rocclr/runtime/device/hsa/hsaprogram.cpp index bad3dfc71a..b2e93aaaa3 100644 --- a/rocclr/runtime/device/hsa/hsaprogram.cpp +++ b/rocclr/runtime/device/hsa/hsaprogram.cpp @@ -295,14 +295,14 @@ namespace oclhsa { std::string openClKernelName("&__OpenCL_" + kernelName + "_kernel"); const oclBIFSymbolStruct* isaSymbolStruct = findBIF30SymStruct(symISABinary); assert(isaSymbolStruct && "symbol not found"); - std::string kernelIsaSymbol = isaSymbolStruct->str[PRE] + - openClKernelName + isaSymbolStruct->str[POST]; + std::string kernelIsaSymbol = isaSymbolStruct->str[bif::PRE] + + openClKernelName + isaSymbolStruct->str[bif::POST]; const oclBIFSymbolStruct* debugSymbolStruct = findBIF30SymStruct(symDebugInfo); assert(debugSymbolStruct && "symbol not found"); //For debug symbols, the PRE is used for BRIG debug and the 
POST is used for //ISA debug - std::string kernelIsaDebugSymbol = debugSymbolStruct->str[POST] + openClKernelName; + std::string kernelIsaDebugSymbol = debugSymbolStruct->str[bif::POST] + openClKernelName; //Extract the ISA section size_t symbolSize;