From 81b331f4c59a53ab0db4e1eb2c842acf3799df4f Mon Sep 17 00:00:00 2001
From: foreman
Date: Wed, 5 Aug 2015 06:18:33 -0400
Subject: [PATCH] P4 to Git Change 1177220 by emankov@em-hsa-amd on 2015/08/05
06:08:39
ECR #333753 - ORCA RT/Compiler Lib/aoc2: AMD HSA Code Object Import feature (part II) - arbitrary hidden (extra) kernargs support
Only HSAIL path is affected. It doesn't affect blit kernels.
To use offline by aoc2:
aoc2 -hsacodeobject=<file> -numhiddenkernargs=<num> -cl-std=CL2.0 -march=hsail(-64) -mdevice=Bonaire
To use online by setting env:
AMD_DEBUG_HSA_NUM_HIDDEN_KERNARGS=<num>
where <num> >= 0. If <num> == 0, the runtime adds no additional (hidden) arguments to any kernel. The default value is unchanged: it is still 6 for now.
Misc:
+ get rid of PRE & POST defines in Compiler Lib, as they started to conflict with ugl\gl\gs\hwl\ headers with the same defines.
+ minor copy/paste eliminations & typo fixes
+ ocltst complib tests update
Testing: pre check-in, manually based on ocl sdk MatrixMultiplication
Reviewers: Brian Sumner, German Andryeyev, Nikolay Haustov, Artem Tamazov
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/common/v0_8/if_acl.cpp#72 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/hsail_be.cpp#49 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/backends/gpu/metadata.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclDefs.h#5 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclEnums.h#19 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/include/v0_8/aclStructs.h#17 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/bif_section_labels.hpp#21 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.cpp#10 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/utils/v0_8/libUtils.h#20 edit
... //depot/stg/opencl/drivers/opencl/compiler/tools/aoc2/aoc2.cpp#74 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#181 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#249 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#291 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#113 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#199 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#369 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa/hsaprogram.cpp#38 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsakernel.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsakernel.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsaprogram.cpp#19 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hsa_foundation/hsavirtual.cpp#43 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/complib/CLAssumptionCheck.cpp#43 edit
... //depot/stg/opencl/drivers/opencl/tests/ocltst/module/complib/CLEnumCheck.cpp#44 edit
---
.../lib/backends/common/v0_8/if_acl.cpp | 11 +++
rocclr/compiler/lib/include/v0_8/aclDefs.h | 6 ++
rocclr/compiler/lib/include/v0_8/aclEnums.h | 47 +++++-----
rocclr/compiler/lib/include/v0_8/aclStructs.h | 1 +
.../compiler/lib/utils/bif_section_labels.hpp | 6 +-
rocclr/compiler/lib/utils/v0_8/libUtils.cpp | 4 +-
rocclr/compiler/lib/utils/v0_8/libUtils.h | 92 +++++++++++++++++--
rocclr/runtime/device/device.cpp | 12 ++-
rocclr/runtime/device/device.hpp | 2 +-
rocclr/runtime/device/gpu/gpukernel.cpp | 79 ++++++++--------
rocclr/runtime/device/gpu/gpukernel.hpp | 12 ++-
rocclr/runtime/device/gpu/gpuprogram.cpp | 25 +++--
rocclr/runtime/device/gpu/gpuvirtual.cpp | 4 +-
rocclr/runtime/device/hsa/hsaprogram.cpp | 6 +-
14 files changed, 214 insertions(+), 93 deletions(-)
diff --git a/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp b/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
index a488b99f62..f3e8d7196b 100644
--- a/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
+++ b/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp
@@ -2641,6 +2641,17 @@ if_aclQueryInfo(aclCompiler *cl,
}
break;
}
+ case RT_NUM_KERNEL_HIDDEN_ARGS: {
+ size_t hidden_kernargs_size = sizeof(md->numHiddenKernelArgs);
+ if (!ptr) {
+ *size = hidden_kernargs_size;
+ success = true;
+ } else if (*size >= hidden_kernargs_size) {
+ memcpy(ptr, &md->numHiddenKernelArgs, hidden_kernargs_size);
+ success = true;
+ }
+ break;
+ }
}
return (success) ? ACL_SUCCESS : ACL_ERROR;
}
diff --git a/rocclr/compiler/lib/include/v0_8/aclDefs.h b/rocclr/compiler/lib/include/v0_8/aclDefs.h
index 2d21ea83d1..10da184217 100644
--- a/rocclr/compiler/lib/include/v0_8/aclDefs.h
+++ b/rocclr/compiler/lib/include/v0_8/aclDefs.h
@@ -28,4 +28,10 @@
#define BIF_API_3_0
#endif
+#ifndef MAX_HIDDEN_KERNARGS_NUM
+#define MAX_HIDDEN_KERNARGS_NUM 6
+#else
+#error "MAX_HIDDEN_KERNARGS_NUM is already defined"
+#endif
+
#endif // _ACL_DEFS_0_8_H_
diff --git a/rocclr/compiler/lib/include/v0_8/aclEnums.h b/rocclr/compiler/lib/include/v0_8/aclEnums.h
index 513acc4e86..aa51dbda16 100644
--- a/rocclr/compiler/lib/include/v0_8/aclEnums.h
+++ b/rocclr/compiler/lib/include/v0_8/aclEnums.h
@@ -188,29 +188,30 @@ typedef enum _bif_sections_enum_0_8 {
//! An enumeration that defines what are valid queries for aclQueryInfo.
typedef enum _rt_query_types_enum_0_8 {
- RT_ABI_VERSION = 0,
- RT_DEVICE_NAME = 1,
- RT_MEM_SIZES = 2,
- RT_GPU_FUNC_CAPS = 3,
- RT_GPU_FUNC_ID = 4,
- RT_GPU_DEFAULT_ID = 5,
- RT_WORK_GROUP_SIZE = 6,
- RT_WORK_REGION_SIZE = 7,
- RT_ARGUMENT_ARRAY = 8,
- RT_GPU_PRINTF_ARRAY = 9,
- RT_CPU_BARRIER_NAMES = 10,
- RT_DEVICE_ENQUEUE = 11,
- RT_KERNEL_INDEX = 12,
- RT_KERNEL_NAME = 13,
- RT_KERNEL_NAMES = 14,
- RT_CONTAINS_LLVMIR = 15,
- RT_CONTAINS_OPTIONS = 16,
- RT_CONTAINS_BRIG = 17,
- RT_CONTAINS_HSAIL = 18,
- RT_CONTAINS_ISA = 19,
- RT_CONTAINS_LOADER_MAP = 20,
- RT_CONTAINS_SPIR = 21,
- RT_LAST_TYPE = 22
+ RT_ABI_VERSION = 0,
+ RT_DEVICE_NAME = 1,
+ RT_MEM_SIZES = 2,
+ RT_GPU_FUNC_CAPS = 3,
+ RT_GPU_FUNC_ID = 4,
+ RT_GPU_DEFAULT_ID = 5,
+ RT_WORK_GROUP_SIZE = 6,
+ RT_WORK_REGION_SIZE = 7,
+ RT_ARGUMENT_ARRAY = 8,
+ RT_GPU_PRINTF_ARRAY = 9,
+ RT_CPU_BARRIER_NAMES = 10,
+ RT_DEVICE_ENQUEUE = 11,
+ RT_KERNEL_INDEX = 12,
+ RT_KERNEL_NAME = 13,
+ RT_KERNEL_NAMES = 14,
+ RT_CONTAINS_LLVMIR = 15,
+ RT_CONTAINS_OPTIONS = 16,
+ RT_CONTAINS_BRIG = 17,
+ RT_CONTAINS_HSAIL = 18,
+ RT_CONTAINS_ISA = 19,
+ RT_CONTAINS_LOADER_MAP = 20,
+ RT_CONTAINS_SPIR = 21,
+ RT_NUM_KERNEL_HIDDEN_ARGS = 22,
+ RT_LAST_TYPE = 23
} aclQueryType_0_8;
//! An enumeration for the various GPU capabilities
diff --git a/rocclr/compiler/lib/include/v0_8/aclStructs.h b/rocclr/compiler/lib/include/v0_8/aclStructs.h
index 8586955aec..49bbd8e68d 100644
--- a/rocclr/compiler/lib/include/v0_8/aclStructs.h
+++ b/rocclr/compiler/lib/include/v0_8/aclStructs.h
@@ -119,6 +119,7 @@ typedef struct _acl_metadata_0_8 {
const char *deviceName; // RT_DEVICE_NAME
bool enqueue_kernel; // RT_DEVICE_ENQUEUE
uint32_t kernel_index; // RT_KERNEL_INDEX
+ uint32_t numHiddenKernelArgs; // RT_NUM_KERNEL_HIDDEN_ARGS
} aclMetadata_0_8;
//! An structure that holds information on the capabilities of the bif device.
diff --git a/rocclr/compiler/lib/utils/bif_section_labels.hpp b/rocclr/compiler/lib/utils/bif_section_labels.hpp
index 1af28f5f30..7106cff7f0 100644
--- a/rocclr/compiler/lib/utils/bif_section_labels.hpp
+++ b/rocclr/compiler/lib/utils/bif_section_labels.hpp
@@ -6,8 +6,10 @@
#ifdef __cplusplus
extern "C" {
#endif
-#define PRE 0
-#define POST 1
+namespace bif {
+const unsigned PRE = 0;
+const unsigned POST = 1;
+}
typedef enum {
symOpenclCompilerOptions,
diff --git a/rocclr/compiler/lib/utils/v0_8/libUtils.cpp b/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
index 0342406dbd..71797a2962 100644
--- a/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
+++ b/rocclr/compiler/lib/utils/v0_8/libUtils.cpp
@@ -1,8 +1,7 @@
//
// Copyright (c) 2011 Advanced Micro Devices, Inc. All rights reserved.
//
-#include "acl.h"
-#include "aclTypes.h"
+
#include "api/v0_8/aclValidation.h"
#include "libUtils.h"
#include "bif/bifbase.hpp"
@@ -10,7 +9,6 @@
#include "utils/versions.hpp"
#include "utils/options.hpp"
#include "backends/gpu/scwrapper/devState.h"
-#include
#include
#include "bif/bif.hpp"
extern aclBinary* constructBinary(size_t struct_version,
diff --git a/rocclr/compiler/lib/utils/v0_8/libUtils.h b/rocclr/compiler/lib/utils/v0_8/libUtils.h
index b49de55d9a..878479f302 100644
--- a/rocclr/compiler/lib/utils/v0_8/libUtils.h
+++ b/rocclr/compiler/lib/utils/v0_8/libUtils.h
@@ -3,12 +3,16 @@
//
#ifndef _CL_LIB_UTILS_0_8_H_
#define _CL_LIB_UTILS_0_8_H_
-#include "v0_8/aclTypes.h"
+#include "acl.h"
#include
#include
#include
#include
+#include
#include "library.hpp"
+#include "utils/bif_section_labels.hpp"
+using namespace bif;
+
// Utility function to set a flag in option structure
// of the aclDevCaps.
void
@@ -150,6 +154,83 @@ aclutAlloc(const aclCompilerOptions *bin);
FreeFunc
aclutFree(const aclCompilerOptions *bin);
+// Splits a NUL-terminated string on whitespace and returns the tokens in their original order (no tokens for NULL/blank input).
+inline std::vector<std::string> splitSpaceSeparatedString(char *str)
+{
+    // A NULL pointer is read as an empty string instead of being handed to std::string, which would be undefined behavior.
+    std::istringstream ss(str ? str : "");
+    std::istream_iterator<std::string> beg(ss), end;  // operator>> skips whitespace runs, so no empty tokens are produced
+    return std::vector<std::string>(beg, end);
+}
+
+// Builds the mangled OpenCL kernel symbol name: "&", then the PRE string of the
+// symOpenclKernel entry returned by findBIF30SymStruct, then kernel_name, then that entry's POST string.
+inline std::string aclutOpenclMangledKernelName(const std::string& kernel_name)
+{
+    const oclBIFSymbolStruct* kernelSym = findBIF30SymStruct(symOpenclKernel);
+    assert(kernelSym && "symbol not found");
+    return std::string("&").append(kernelSym->str[bif::PRE]).append(kernel_name).append(kernelSym->str[bif::POST]);
+}
+
+// Builds the metadata symbol name of a kernel: the PRE string of the symOpenclMeta entry returned by
+// findBIF30SymStruct, then the mangled kernel name (aclutOpenclMangledKernelName), then that entry's POST string.
+inline std::string aclutOpenclMangledKernelMetadataName(const std::string& kernel_name)
+{
+    const oclBIFSymbolStruct* metaSym = findBIF30SymStruct(symOpenclMeta);
+    assert(metaSym && "symbol not found");
+    return std::string(metaSym->str[bif::PRE]).append(aclutOpenclMangledKernelName(kernel_name)).append(metaSym->str[bif::POST]);
+}
+
+#ifdef WITH_TARGET_HSAIL
+// Sets numHiddenKernelArgs to num in the metadata symbol of every kernel listed by RT_KERNEL_NAMES for bin.
+// Returns ACL_SUCCESS at once when num == MAX_HIDDEN_KERNARGS_NUM; otherwise returns the first failing ACL error code.
+inline acl_error
+aclutUpdateMetadataWithHiddenKernargsNum(aclCompiler* cl, aclBinary* bin, uint32_t num) {
+    if (num == MAX_HIDDEN_KERNARGS_NUM) {
+        return ACL_SUCCESS;
+    }
+    const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclMeta);
+    assert(sym && "symbol not found");
+    aclSections secID = sym->sections[0];
+    size_t kernelNamesSize = 0;
+    acl_error error_code = aclQueryInfo(cl, bin, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize);
+    if (error_code != ACL_SUCCESS) {
+        return error_code;
+    }
+    // Owning buffer: it is released on every return path, so there is no manual delete/delete[] to mismatch.
+    // The extra zeroed byte keeps the buffer NUL-terminated for the split below, even when kernelNamesSize is 0.
+    std::vector<char> kernelNames(kernelNamesSize + 1, '\0');
+    error_code = aclQueryInfo(cl, bin, RT_KERNEL_NAMES, NULL, &kernelNames[0], &kernelNamesSize);
+    if (error_code != ACL_SUCCESS) {
+        return error_code;
+    }
+    const std::vector<std::string> vKernels = splitSpaceSeparatedString(&kernelNames[0]);
+    size_t roSize = 0;
+    for (const std::string& kernel : vKernels) {
+        const std::string symbol = aclutOpenclMangledKernelMetadataName(kernel);
+        void* roSec = const_cast<void*>(aclExtractSymbol(cl, bin, &roSize, secID, symbol.c_str(), &error_code));
+        if (error_code != ACL_SUCCESS) {
+            return error_code;
+        }
+        if (!roSec || roSize == 0) {
+            return ACL_ELF_ERROR;
+        }
+        // NOTE(review): md points at the buffer returned by aclExtractSymbol and is used after aclRemoveSymbol - confirm it stays valid.
+        aclMetadata *md = reinterpret_cast<aclMetadata*>(roSec);
+        md->numHiddenKernelArgs = num;
+        error_code = aclRemoveSymbol(cl, bin, secID, symbol.c_str());
+        if (error_code != ACL_SUCCESS) {
+            return error_code;
+        }
+        error_code = aclInsertSymbol(cl, bin, md, roSize, secID, symbol.c_str());
+        if (error_code != ACL_SUCCESS) {
+            return error_code;
+        }
+    }
+    return error_code;
+}
+#endif
+
inline bool is64BitTarget(const aclTargetInfo& target)
{
return (target.arch_id == aclX64 ||
@@ -184,15 +265,6 @@ enum scId {
SC_LAST,
};
-inline std::vector splitSpaceSeparatedString(char *str)
-{
- std::string s(str);
- std::stringstream ss(s);
- std::istream_iterator beg(ss), end;
- std::vector vec(beg, end);
- return vec;
-}
-
// Helper function that allocates an aligned memory.
inline void*
alignedMalloc(size_t size, size_t alignment)
diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp
index f69726e36c..496c7ad755 100644
--- a/rocclr/runtime/device/device.cpp
+++ b/rocclr/runtime/device/device.cpp
@@ -604,6 +604,14 @@ Kernel::~Kernel()
delete signature_;
}
+// Mangles an OpenCL kernel name: "&" + PRE string of the symOpenclKernel entry from findBIF30SymStruct + name + its POST string.
+std::string Kernel::openclMangledName(const std::string& name)
+{
+    const oclBIFSymbolStruct* kernelSym = findBIF30SymStruct(symOpenclKernel);
+    assert(kernelSym && "symbol not found");
+    return std::string("&").append(kernelSym->str[bif::PRE]).append(name).append(kernelSym->str[bif::POST]);
+}
+
void
Memory::saveMapInfo(
const amd::Coord3D origin,
@@ -1246,7 +1254,7 @@ ClBinary::getBIFSymbol(unsigned int symbolID) const
const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF20, nSymbols, symID);
assert(symb && "BIF20 symbol with symbolID not found");
if (symb) {
- return std::string(symb->str[PRE]) + std::string(symb->str[POST]);
+ return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]);
}
break;
}
@@ -1255,7 +1263,7 @@ ClBinary::getBIFSymbol(unsigned int symbolID) const
const oclBIFSymbolStruct* symb = findBIFSymbolStruct(BIF30, nSymbols, symID);
assert(symb && "BIF30 symbol with symbolID not found");
if (symb) {
- return std::string(symb->str[PRE]) + std::string(symb->str[POST]);
+ return std::string(symb->str[bif::PRE]) + std::string(symb->str[bif::POST]);
}
break;
}
diff --git a/rocclr/runtime/device/device.hpp b/rocclr/runtime/device/device.hpp
index 6bff20f27a..4adce37016 100644
--- a/rocclr/runtime/device/device.hpp
+++ b/rocclr/runtime/device/device.hpp
@@ -940,7 +940,7 @@ public:
//! Return the build log
const std::string& buildLog() const { return buildLog_; }
- static std::string openclMangledName(const std::string& name) { return "&__OpenCL_" + name + "_kernel"; }
+ static std::string openclMangledName(const std::string& name);
protected:
std::string name_; //!< kernel name
diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index e705ce33b4..2e9c8446ad 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -9,7 +9,6 @@
#include "device/gpu/gpusched.hpp"
#include "platform/commandqueue.hpp"
#include "utils/options.hpp"
-#include "utils/bif_section_labels.hpp"
#include "acl.h"
#include "SCShadersR678XXCommon.h"
@@ -3432,7 +3431,7 @@ HSAILKernel::initArgList(const aclArgData* aclArg)
size_t offset = 0;
// Reserved arguments for HSAIL launch
- aclArg += ExtraArguments;
+ aclArg += MaxExtraArgumentsNum;
for (uint i = 0; aclArg->struct_size != 0; i++, aclArg++) {
desc.name_ = arguments_[i]->name_.c_str();
desc.type_ = GetOclType(aclArg);
@@ -3479,7 +3478,7 @@ HSAILKernel::initHsailArgs(const aclArgData* aclArg)
int offset = 0;
// Reserved arguments for HSAIL launch
- aclArg += ExtraArguments;
+ aclArg += MaxExtraArgumentsNum;
// Iterate through the each kernel argument
for (; aclArg->struct_size != 0; aclArg++) {
@@ -3569,7 +3568,8 @@ HSAILKernel::initPrintf(const aclPrintfFmt* aclPrintf)
HSAILKernel::HSAILKernel(std::string name,
HSAILProgram* prog,
- std::string compileOptions)
+ std::string compileOptions,
+ uint extraArgsNum)
: device::Kernel(name)
, compileOptions_(compileOptions)
, dev_(prog->dev())
@@ -3578,6 +3578,7 @@ HSAILKernel::HSAILKernel(std::string name,
, code_(NULL)
, codeSize_(0)
, hwMetaData_(NULL)
+ , extraArgumentsNum_(extraArgsNum)
{
hsa_ = true;
}
@@ -3598,14 +3599,16 @@ HSAILKernel::~HSAILKernel()
bool
HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
{
- acl_error error;
- const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel);
- assert(bifSym && "symbol not found");
- std::string openClKernelName(std::string("&") + bifSym->str[PRE] + name() + bifSym->str[POST]);
+ if (extraArgumentsNum_ > MaxExtraArgumentsNum) {
+ LogError("Failed to initialize kernel: extra arguments number is bigger than is supported");
+ return false;
+ }
+ acl_error error = ACL_SUCCESS;
+ std::string openClKernelName = openclMangledName(name());
//compile kernel down to ISA
if (finalize) {
std::string options(compileOptions_.c_str());
- flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") !=
+ flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") !=
std::string::npos) ? true: false;
options.append(" -just-kernel=");
options.append(openClKernelName.c_str());
@@ -3618,7 +3621,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL);
buildLog_ += aclGetCompilerLog(dev().hsaCompiler());
if (error != ACL_SUCCESS) {
- LogError("Failed to finalize");
+ LogError("Failed to finalize kernel");
return false;
}
}
@@ -3900,35 +3903,37 @@ HSAILKernel::loadArguments(
address aqlStruct = gpu.cb(1)->sysMemCopy();
bool srdResource = false;
- // The HLC generates 3 additional arguments for the global offsets
- //and fourth argument is the printf_buffer pointer
- size_t offsetSize[HSAILKernel::ExtraArguments] = { 0, 0, 0, 0, 0, 0 };
- for (uint i = 0; i < sizes.dimensions(); ++i) {
- offsetSize[i] = sizes.offset()[i];
+ if (extraArgumentsNum_ > 0) {
+ assert(MaxExtraArgumentsNum >= 6 && "MaxExtraArgumentsNum has changed, the below algorithm should be changed accordingly");
+ size_t extraArgs[MaxExtraArgumentsNum] = { 0, 0, 0, 0, 0, 0 };
+ // The HLC generates up to 3 additional arguments for the global offsets
+ for (uint i = 0; i < sizes.dimensions(); ++i) {
+ extraArgs[i] = sizes.offset()[i];
+ }
+ // Check if the kernel may have printf output
+ if ((printfInfo().size() > 0) &&
+ // and printf buffer was allocated
+ (gpu.printfDbgHSA().dbgBuffer() != NULL)) {
+ // and set the fourth argument as the printf_buffer pointer
+ extraArgs[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
+ memList.push_back(gpu.printfDbgHSA().dbgBuffer());
+ }
+ if (dynamicParallelism()) {
+ // Provide the host parent AQL wrap object to the kernel
+ AmdAqlWrap* wrap = reinterpret_cast(aqlStruct);
+ memset(wrap, 0, sizeof(AmdAqlWrap));
+ wrap->state = AQL_WRAP_BUSY;
+ ConstBuffer* cb = gpu.constBufs_[1];
+ cb->uploadDataToHw(sizeof(AmdAqlWrap));
+ *vmParentWrap = cb->vmAddress() + cb->wrtOffset();
+ // and set 5th & 6th arguments
+ extraArgs[4] = vmDefQueue;
+ extraArgs[5] = *vmParentWrap;
+ memList.push_back(cb);
+ }
+ WriteAqlArg(&aqlArgBuf, extraArgs, sizeof(size_t)*extraArgumentsNum_, sizeof(size_t));
}
- if (dynamicParallelism()) {
- // Provide the host parent AQL wrap object to the kernel
- AmdAqlWrap* wrap = reinterpret_cast(aqlStruct);
- memset(wrap, 0, sizeof(AmdAqlWrap));
- wrap->state = AQL_WRAP_BUSY;
- ConstBuffer* cb = gpu.constBufs_[1];
- cb->uploadDataToHw(sizeof(AmdAqlWrap));
- *vmParentWrap = cb->vmAddress() + cb->wrtOffset();
- offsetSize[4] = vmDefQueue;
- offsetSize[5] = *vmParentWrap;
- memList.push_back(cb);
- }
-
- // Check if the kernel may have printf output
- if ((printfInfo().size() > 0) &&
- // and printf buffer was allocated
- (gpu.printfDbgHSA().dbgBuffer() != NULL)) {
- offsetSize[3] = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress());
- memList.push_back(gpu.printfDbgHSA().dbgBuffer());
- }
- WriteAqlArg(&aqlArgBuf, offsetSize, sizeof(offsetSize), sizeof(size_t));
-
const amd::KernelSignature& signature = kernel.signature();
const amd::KernelParameters& kernelParams = kernel.parameters();
diff --git a/rocclr/runtime/device/gpu/gpukernel.hpp b/rocclr/runtime/device/gpu/gpukernel.hpp
index 870f0313de..2c008c2087 100644
--- a/rocclr/runtime/device/gpu/gpukernel.hpp
+++ b/rocclr/runtime/device/gpu/gpukernel.hpp
@@ -845,12 +845,13 @@ public:
uint numElem_; //!< Number of elements
};
- // Global offsets located in the first 3 elements
- static const uint ExtraArguments = 6;
+ // Max number of possible extra (hidden) kernel arguments
+ static const uint MaxExtraArgumentsNum = 6;
HSAILKernel(std::string name,
HSAILProgram* prog,
- std::string compileOptions);
+ std::string compileOptions,
+ uint extraArgsNum);
virtual ~HSAILKernel();
@@ -928,6 +929,9 @@ public:
//! Returns the kernel index in the program
uint index() const { return index_; }
+ //! Returns kernel's extra argument count
+ uint extraArgumentsNum() const { return extraArgumentsNum_; }
+
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
@@ -966,6 +970,8 @@ private:
char* hwMetaData_; //!< SI metadata
+ uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
+
union Flags {
struct {
uint imageEna_: 1; //!< Kernel uses images
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index 84aff7f8cd..6f7996508e 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -1974,7 +1974,7 @@ HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
break;
const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
assert(symbol && "symbol not found");
- std::string symName = std::string(symbol->str[PRE]) + std::string(symbol->str[POST]);
+ std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
size_t symSize = 0;
const void *opts = aclExtractSymbol(dev().hsaCompiler(),
binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
@@ -2095,21 +2095,21 @@ HSAILProgram::linkImpl(amd::option::Options* options)
}
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL);
if (status != HSA_STATUS_SUCCESS) {
- buildLog_ += "Error while HSA Loader phase: loading HSA Code Object \n";
+ buildLog_ += "Error while HSA Loader phase: loading HSA Code Object\n";
return false;
}
}
size_t kernelNamesSize = 0;
errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, NULL, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n";
+ buildLog_ += "Error while Finalization phase: Kernel names size querying from the ELF failed\n";
return false;
}
if (!isNull() && kernelNamesSize > 0) {
char* kernelNames = new char[kernelNamesSize];
errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
- buildLog_ += "Error while Finalization phase: kernel's Metadata is corrupted in the ELF\n";
+ buildLog_ += "Error while Finalization phase: Kernel names querying from the ELF failed\n";
delete kernelNames;
return false;
}
@@ -2117,11 +2117,22 @@ HSAILProgram::linkImpl(amd::option::Options* options)
delete kernelNames;
std::vector::iterator it = vKernels.begin();
bool dynamicParallelism = false;
+ aclMetadata md;
+ md.numHiddenKernelArgs = 0;
+ size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
for (it; it != vKernels.end(); ++it) {
- std::string kernelName = *it;
- HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions());
+ std::string kernelName(*it);
+ std::string openclKernelName = Kernel::openclMangledName(kernelName);
+ errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
+ openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
+ if (errorCode != ACL_SUCCESS) {
+ buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n";
+ return false;
+ }
+ HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(),
+ md.numHiddenKernelArgs);
kernels()[kernelName] = aKernel;
- amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", Kernel::openclMangledName(kernelName).c_str(), agent, 0);
+ amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0);
if (!sym) {
LogError("Failed to get kernel ISA code");
return false;
diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp
index 3ee22c2da4..399a5fa462 100644
--- a/rocclr/runtime/device/gpu/gpuvirtual.cpp
+++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp
@@ -1872,9 +1872,9 @@ VirtualGPU::submitKernelInternalHSA(
gpuDefQueue->virtualQueue_->vmAddress();
address argum = gpuDefQueue->virtualQueue_->data() + offsArg;
print << "Kernel: " << child->name() << "\n";
- static const char* Names[HSAILKernel::ExtraArguments] = {
+ static const char* Names[HSAILKernel::MaxExtraArgumentsNum] = {
"Offset0: ", "Offset1: ","Offset2: ","PrintfBuf: ", "VqueuePtr: ", "AqlWrap: "};
- for (j = 0; j < HSAILKernel::ExtraArguments; ++j) {
+ for (j = 0; j < child->extraArgumentsNum(); ++j) {
print << "\t" << Names[j] << *(size_t*)argum;
print << "\n";
argum += sizeof(size_t);
diff --git a/rocclr/runtime/device/hsa/hsaprogram.cpp b/rocclr/runtime/device/hsa/hsaprogram.cpp
index bad3dfc71a..b2e93aaaa3 100644
--- a/rocclr/runtime/device/hsa/hsaprogram.cpp
+++ b/rocclr/runtime/device/hsa/hsaprogram.cpp
@@ -295,14 +295,14 @@ namespace oclhsa {
std::string openClKernelName("&__OpenCL_" + kernelName + "_kernel");
const oclBIFSymbolStruct* isaSymbolStruct = findBIF30SymStruct(symISABinary);
assert(isaSymbolStruct && "symbol not found");
- std::string kernelIsaSymbol = isaSymbolStruct->str[PRE] +
- openClKernelName + isaSymbolStruct->str[POST];
+ std::string kernelIsaSymbol = isaSymbolStruct->str[bif::PRE] +
+ openClKernelName + isaSymbolStruct->str[bif::POST];
const oclBIFSymbolStruct* debugSymbolStruct = findBIF30SymStruct(symDebugInfo);
assert(debugSymbolStruct && "symbol not found");
//For debug symbols, the PRE is used for BRIG debug and the POST is used for
//ISA debug
- std::string kernelIsaDebugSymbol = debugSymbolStruct->str[POST] + openClKernelName;
+ std::string kernelIsaDebugSymbol = debugSymbolStruct->str[bif::POST] + openClKernelName;
//Extract the ISA section
size_t symbolSize;