From 376b1f2bfa58e7e816eb8ecf6beca891b5fa17c5 Mon Sep 17 00:00:00 2001
From: foreman
Date: Thu, 8 Sep 2016 20:29:42 -0400
Subject: [PATCH] P4 to Git Change 1312581 by lmoriche@lmoriche_opencl_dev on
2016/09/08 19:11:07
SWDEV-102033 - [OCL-LC-ROCm] Change runtime metadata KeyArgKind
- Required runtime changes to use the new hidden argument metadata
Affected files ...
... //depot/stg/opencl/drivers/opencl/make/llvm.git/llvmlibs#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/amdgpu_metadata.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/amdgpu_metadata.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#15 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#28 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#14 edit
[ROCm/clr commit: 8fadd2c11592361dabfe9d88fa7a4d237d3b7872]
---
.../runtime/device/rocm/amdgpu_metadata.cpp | 73 +++++-----
.../runtime/device/rocm/amdgpu_metadata.hpp | 13 +-
.../rocclr/runtime/device/rocm/rockernel.cpp | 107 ++++++++-------
.../rocclr/runtime/device/rocm/rockernel.hpp | 1 +
.../rocclr/runtime/device/rocm/rocprogram.cpp | 25 +++-
.../rocclr/runtime/device/rocm/rocvirtual.cpp | 129 +++++++++---------
6 files changed, 189 insertions(+), 159 deletions(-)
diff --git a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp
index 095d85338b..1faa04da6b 100644
--- a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.cpp
@@ -102,7 +102,7 @@ namespace code {
}
template<>
- bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::TypeKind& v) {
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::Kind& v) {
return ReadConvert(in, v);
}
@@ -124,17 +124,26 @@ namespace code {
namespace KernelArg {
using namespace AMDGPU::RuntimeMD::KernelArg;
Metadata::Metadata()
- : size(0), align(0), pointeeAlign(0),
+ : size(0), align(0), pointeeAlign(0), accQual(None),
isConst(false), isRestrict(false), isVolatile(false), isPipe(false)
{}
- static const char* TypeKindToString(TypeKind typeKind) {
- switch (typeKind) {
- case Value: return "Value";
- case Pointer: return "Pointer";
+ static const char* KindToString(Kind kind) {
+ switch (kind) {
+ case ByValue: return "ByValue";
+ case GlobalBuffer: return "GlobalBuffer";
+ case DynamicSharedPointer: return "DynamicSharedPointer";
case Image: return "Image";
case Sampler: return "Sampler";
+ case Pipe: return "Pipe";
case Queue: return "Queue";
+ case HiddenGlobalOffsetX: return "HiddenGlobalOffsetX";
+ case HiddenGlobalOffsetY: return "HiddenGlobalOffsetY";
+ case HiddenGlobalOffsetZ: return "HiddenGlobalOffsetZ";
+ case HiddenPrintfBuffer: return "HiddenPrintfBuffer";
+ case HiddenDefaultQueue: return "HiddenDefaultQueue";
+ case HiddenCompletionAction: return "HiddenCompletionAction";
+ case HiddenNone: return "HiddenNone";
default: return "";
}
}
@@ -175,7 +184,7 @@ namespace code {
case KeyArgAlign: return Read(in, align);
case KeyArgTypeName: return Read(in, typeName);
case KeyArgName: return Read(in, name);
- case KeyArgTypeKind: return Read(in, typeKind);
+ case KeyArgKind: return Read(in, kind);
case KeyArgValueType: return Read(in, valueType);
case KeyArgPointeeAlign: return Read(in, pointeeAlign);
case KeyArgAddrQual: return Read(in, addrQual);
@@ -191,23 +200,29 @@ namespace code {
void Metadata::Print(std::ostream& out) {
out
- << "Type: " << TypeKindToString(typeKind);
- if (typeKind == Value) {
+ << "Kind: " << KindToString(kind);
+ if (kind == ByValue) {
out << " ValueType:" << ValueTypeToString(valueType);
}
if (isConst) { out << " Const"; }
if (isRestrict) { out << " Restrict"; }
if (isVolatile) { out << " Volatile"; }
if (isPipe) { out << " Pipe"; }
-
+ if (kind == Image || kind == Pipe) {
+ out << " Access: " << AccessQualToString(accQual);
+ }
out
- << " Access: " << AccessQualToString(accQual)
<< " Address: " << (unsigned) addrQual
<< " Size: " << size
- << " Align: " << align
- << " Type Name: " << typeName;
+ << " Align: " << align;
+ if (kind == DynamicSharedPointer) {
+ out << " Pointee Align: " << pointeeAlign;
+ }
+ if (!typeName.empty()) {
+ out << " Type Name: \"" << typeName << "\"";
+ }
if (!name.empty()) {
- out << " Name: " << name;
+ out << " Name: \"" << name << "\"";
}
}
@@ -221,11 +236,11 @@ namespace code {
hasWorkgroupSizeHint(false),
hasVectorTypeHint(false),
hasKernelIndex(false),
- hasSGPRs(false), hasVGPRs(false),
hasMinWavesPerSIMD(false), hasMaxWavesPerSIMD(false),
hasFlatWorkgroupSizeLimits(false),
hasMaxWorkgroupSize(false),
- isNoPartialWorkgroups(false)
+ isNoPartialWorkgroups(false),
+ hasPrintfInfo(false)
{}
void Metadata::SetCommon(uint8_t mdVersion, uint8_t mdRevision,
@@ -260,7 +275,7 @@ namespace code {
case KeyArgAlign:
case KeyArgTypeName:
case KeyArgName:
- case KeyArgTypeKind:
+ case KeyArgKind:
case KeyArgValueType:
case KeyArgPointeeAlign:
case KeyArgAddrQual:
@@ -284,12 +299,6 @@ namespace code {
case KeyKernelIndex:
hasKernelIndex = true;
return Read(in, kernelIndex);
- case KeySGPRs:
- hasSGPRs = true;
- return Read(in, numSgprs);
- case KeyVGPRs:
- hasVGPRs = true;
- return Read(in, numVgprs);
case KeyMinWavesPerSIMD:
hasMinWavesPerSIMD = true;
return Read(in, minWavesPerSimd);
@@ -306,6 +315,10 @@ namespace code {
return Read3(in, maxWorkgroupSize);
case KeyNoPartialWorkGroups:
isNoPartialWorkgroups = true;
+ return true;
+ case KeyPrintfInfo:
+ hasPrintfInfo = true;
+ return Read(in, printfInfo);
default:
return false;
}
@@ -345,12 +358,6 @@ namespace code {
if (hasKernelIndex) {
out << " Kernel iIndex: " << kernelIndex << std::endl;
}
- if (hasSGPRs) {
- out << " SGPRs: " << numSgprs << std::endl;
- }
- if (hasVGPRs) {
- out << " VGPRs: " << numVgprs << std::endl;
- }
if (hasMinWavesPerSIMD) {
out << " Min waves per SIMD: " << minWavesPerSimd << std::endl;
}
@@ -364,6 +371,9 @@ namespace code {
if (isNoPartialWorkgroups) {
out << " No partial workgroups" << std::endl;
}
+ if (hasPrintfInfo) {
+ out << " Printf info: " << printfInfo << std::endl;
+ }
out << " Arguments" << std::endl;
for (uint32_t i = 0; i < args.size(); ++i) {
out << " " << i << ": ";
@@ -424,7 +434,7 @@ namespace code {
case KeyArgAlign:
case KeyArgTypeName:
case KeyArgName:
- case KeyArgTypeKind:
+ case KeyArgKind:
case KeyArgValueType:
case KeyArgPointeeAlign:
case KeyArgAddrQual:
@@ -437,13 +447,12 @@ namespace code {
case KeyWorkGroupSizeHint:
case KeyVecTypeHint:
case KeyKernelIndex:
- case KeySGPRs:
- case KeyVGPRs:
case KeyMinWavesPerSIMD:
case KeyMaxWavesPerSIMD:
case KeyFlatWorkGroupSizeLimits:
case KeyMaxWorkGroupSize:
case KeyNoPartialWorkGroups:
+ case KeyPrintfInfo:
if (!kernel) { return false; }
if (!kernel->ReadValue(in, key)) { return false; }
break;
diff --git a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp
index 7162706f71..e4add8c5e5 100644
--- a/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/amdgpu_metadata.hpp
@@ -64,7 +64,7 @@ namespace code {
uint32_t pointeeAlign;
std::string typeName;
std::string name;
- AMDGPU::RuntimeMD::KernelArg::TypeKind typeKind;
+ AMDGPU::RuntimeMD::KernelArg::Kind kind;
AMDGPU::RuntimeMD::KernelArg::ValueType valueType;
uint8_t addrQual;
AMDGPU::RuntimeMD::KernelArg::AccessQualifer accQual;
@@ -77,7 +77,7 @@ namespace code {
uint32_t PointeeAlign() const { return pointeeAlign; }
const std::string& TypeName() const { return typeName; }
const std::string& Name() const { return name; }
- AMDGPU::RuntimeMD::KernelArg::TypeKind TypeKind() const { return typeKind; }
+ AMDGPU::RuntimeMD::KernelArg::Kind Kind() const { return kind; }
AMDGPU::RuntimeMD::KernelArg::ValueType ValueType() const { return valueType; }
uint8_t AddrQual() const { return addrQual; }
AMDGPU::RuntimeMD::KernelArg::AccessQualifer AccQual() const { return accQual; }
@@ -104,16 +104,17 @@ namespace code {
unsigned hasWorkgroupSizeHint : 1;
unsigned hasVectorTypeHint : 1;
unsigned hasKernelIndex : 1;
- unsigned hasSGPRs : 1, hasVGPRs : 1;
unsigned hasMinWavesPerSIMD : 1, hasMaxWavesPerSIMD : 1;
unsigned hasFlatWorkgroupSizeLimits : 1;
unsigned hasMaxWorkgroupSize : 1;
unsigned isNoPartialWorkgroups : 1;
+ unsigned hasPrintfInfo : 1;
std::string name;
uint32_t requiredWorkgroupSize[3];
uint32_t workgroupSizeHint[3];
std::string vectorTypeHint;
+ std::string printfInfo;
uint32_t kernelIndex;
uint32_t numSgprs, numVgprs;
@@ -129,12 +130,11 @@ namespace code {
bool HasWorkgroupSizeHint() const { return hasWorkgroupSizeHint; }
bool HasVecTypeHint() const { return hasVectorTypeHint; }
bool HasKernelIndex() const { return hasKernelIndex; }
- bool HasSGPRs() const { return hasSGPRs; }
- bool HasVGPRs() const { return hasVGPRs; }
bool HasMinWavesPerSIMD() const { return hasMinWavesPerSIMD; }
bool HasMaxWavesPerSIMD() const { return hasMaxWavesPerSIMD; }
bool HasFlatWorkgroupSizeLimits() const { return hasFlatWorkgroupSizeLimits; }
bool HasMaxWorkgroupSize() const { return hasMaxWorkgroupSize; }
+ bool HasPrintfInfo() const { return hasPrintfInfo; }
size_t KernelArgCount() const { return args.size(); }
const KernelArg::Metadata& GetKernelArgMetadata(size_t index) const;
@@ -143,9 +143,8 @@ namespace code {
const uint32_t* RequiredWorkgroupSize() const { return hasRequiredWorkgroupSize ? requiredWorkgroupSize : nullptr; }
const uint32_t* WorkgroupSizeHint() const { return hasWorkgroupSizeHint ? workgroupSizeHint : nullptr; }
const std::string& VecTypeHint() const { return vectorTypeHint; }
+ const std::string& PrintfInfo() const { return printfInfo; }
uint32_t KernelIndex() const { return hasKernelIndex ? kernelIndex : UINT32_MAX; }
- uint32_t SGPRS() const { return hasSGPRs ? numSgprs : UINT32_MAX; }
- uint32_t VGPRS() const { return hasVGPRs ? numVgprs : UINT32_MAX; }
uint32_t MinWavesPerSIMD() const { return hasMinWavesPerSIMD ? minWavesPerSimd : UINT32_MAX; }
uint32_t MaxWavesPerSIMD() const { return hasMaxWavesPerSIMD ? maxWavesPerSimd : UINT32_MAX; }
uint32_t MinFlatWorkgroupSize() const { return hasFlatWorkgroupSizeLimits ? minFlatWorkgroupSize : UINT32_MAX; }
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
index b97714e978..d2573a1085 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.cpp
@@ -19,15 +19,30 @@ namespace roc {
static inline ROC_ARG_TYPE
GetKernelArgType(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
- switch (lcArg.TypeKind()) {
- case AMDGPU::RuntimeMD::KernelArg::Pointer:
+ switch (lcArg.Kind()) {
+ case AMDGPU::RuntimeMD::KernelArg::GlobalBuffer:
+ case AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer:
return ROC_ARGTYPE_POINTER;
- case AMDGPU::RuntimeMD::KernelArg::Value:
+ case AMDGPU::RuntimeMD::KernelArg::ByValue:
return ROC_ARGTYPE_VALUE;
case AMDGPU::RuntimeMD::KernelArg::Image:
return ROC_ARGTYPE_IMAGE;
case AMDGPU::RuntimeMD::KernelArg::Sampler:
return ROC_ARGTYPE_SAMPLER;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetX:
+ return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetY:
+ return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenGlobalOffsetZ:
+ return ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenPrintfBuffer:
+ return ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenDefaultQueue:
+ return ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenCompletionAction:
+ return ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION;
+ case AMDGPU::RuntimeMD::KernelArg::HiddenNone:
+ return ROC_ARGTYPE_HIDDEN_NONE;
default:
return ROC_ARGTYPE_ERROR;
}
@@ -63,7 +78,8 @@ GetKernelArgType(const aclArgData* argInfo)
case ARG_TYPE_POINTER:
return ROC_ARGTYPE_POINTER;
case ARG_TYPE_VALUE:
- return ROC_ARGTYPE_VALUE;
+ return (argInfo->arg.value.data == DATATYPE_struct)
+ ? ROC_ARGTYPE_REFERENCE : ROC_ARGTYPE_VALUE;
case ARG_TYPE_IMAGE:
return ROC_ARGTYPE_IMAGE;
case ARG_TYPE_SAMPLER:
@@ -121,8 +137,7 @@ GetKernelArgAlignment(const aclArgData* argInfo)
static inline size_t
GetKernelArgPointeeAlignment(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
- if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer
- && lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Local) {
+ if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) {
uint32_t align = lcArg.PointeeAlign();
if (align == 0) {
LogWarning("Missing DynamicSharedPointer alignment");
@@ -147,8 +162,8 @@ GetKernelArgPointeeAlignment(const aclArgData* argInfo)
static inline ROC_ACCESS_TYPE
GetKernelArgAccessType(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
- if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer
- || lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) {
+ if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer
+ || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Image) {
switch (lcArg.AccQual()) {
case AMDGPU::RuntimeMD::KernelArg::ReadOnly:
return ROC_ACCESS_TYPE_RO;
@@ -191,21 +206,21 @@ GetKernelArgAccessType(const aclArgData* argInfo)
static inline ROC_ADDRESS_QUALIFIER
GetKernelAddrQual(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
- if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
- switch (lcArg.AddrQual()) {
- case AMDGPU::RuntimeMD::KernelArg::Global:
- return ROC_ADDRESS_GLOBAL;
- case AMDGPU::RuntimeMD::KernelArg::Constant:
- return ROC_ADDRESS_CONSTANT;
- case AMDGPU::RuntimeMD::KernelArg::Local:
- return ROC_ADDRESS_LOCAL;
- default:
- LogError("Unsupported address type");
- return ROC_ADDRESS_ERROR;
- }
+ if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) {
+ return ROC_ADDRESS_LOCAL;
}
- else if ((lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) ||
- (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Sampler)) {
+ else if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer) {
+ if (lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Global) {
+ return ROC_ADDRESS_GLOBAL;
+ }
+ else if (lcArg.AddrQual() == AMDGPU::RuntimeMD::KernelArg::Constant) {
+ return ROC_ADDRESS_CONSTANT;
+ }
+ LogError("Unsupported address type");
+ return ROC_ADDRESS_ERROR;
+ }
+ else if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Image
+ || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::Sampler) {
return ROC_ADDRESS_GLOBAL;
}
return ROC_ADDRESS_ERROR;
@@ -246,9 +261,7 @@ GetKernelDataType(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
aclArgDataType dataType;
- if ((lcArg.TypeKind() != AMDGPU::RuntimeMD::KernelArg::Pointer) ||
- (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Value))
- {
+ if (lcArg.Kind() != AMDGPU::RuntimeMD::KernelArg::ByValue) {
return ROC_DATATYPE_ERROR;
}
@@ -382,7 +395,8 @@ GetOclType(const Kernel::Argument* arg)
if (arg->type_ == ROC_ARGTYPE_POINTER || arg->type_ == ROC_ARGTYPE_IMAGE) {
return T_POINTER;
}
- else if (arg->type_ == ROC_ARGTYPE_VALUE) {
+ else if (arg->type_ == ROC_ARGTYPE_VALUE
+ || arg->type_ == ROC_ARGTYPE_REFERENCE) {
switch (arg->dataType_) {
case ROC_DATATYPE_S8:
case ROC_DATATYPE_U8:
@@ -483,7 +497,8 @@ static inline cl_kernel_arg_type_qualifier
GetOclTypeQual(const amd::hsa::code::KernelArg::Metadata& lcArg)
{
cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
- if (lcArg.TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
+ if (lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::GlobalBuffer
+ || lcArg.Kind() == AMDGPU::RuntimeMD::KernelArg::DynamicSharedPointer) {
if (lcArg.IsVolatile()) {
rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
}
@@ -607,7 +622,6 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD)
// Initialize HSAIL kernel argument
Kernel::Argument* arg = new Kernel::Argument;
- arg->index_ = /* lcArg.IsHidden() ? uint(-1) : */ params.size();
arg->name_ = lcArg.Name();
arg->typeName_ = lcArg.TypeName();
arg->size_ = lcArg.Size();
@@ -618,11 +632,20 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD)
arg->access_ = GetKernelArgAccessType(lcArg);
arg->pointeeAlignment_ = GetKernelArgPointeeAlignment(lcArg);
+ bool isHidden = arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION
+ || arg->type_ == ROC_ARGTYPE_HIDDEN_NONE;
+
+ arg->index_ = isHidden ? uint(-1) : params.size();
hsailArgList_.push_back(arg);
- /*if (lcArg.IsHidden()) {
+ if (isHidden) {
continue;
- }*/
+ }
// Initialize Device kernel parameters
amd::KernelParameterDescriptor desc;
@@ -650,35 +673,13 @@ Kernel::initArguments_LC(const amd::hsa::code::Kernel::Metadata& kernelMD)
// Local memory for CPU
size = sizeof(cl_mem);
}
- offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16)));
+ offset = (size_t) amd::alignUp(offset, std::min(size, size_t(16)));
desc.offset_ = offset;
offset += amd::alignUp(size, sizeof(uint32_t));
params.push_back(desc);
}
- // Push the hidden arguments. These will be generated by LC at some point
- static ROC_ARG_TYPE hiddenArgs[] = {
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X,
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y,
- ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z,
- };
- for (auto type : hiddenArgs) {
- Kernel::Argument* arg = new Kernel::Argument;
- arg->index_ = uint(-1);
- arg->name_ = "";
- arg->typeName_ = "size_t";
- arg->size_ = sizeof(size_t);
- arg->type_ = type;
- arg->addrQual_ = ROC_ADDRESS_ERROR;
- arg->dataType_ = ROC_DATATYPE_U64;
- arg->alignment_ = arg->size_;
- arg->access_ = ROC_ACCESS_TYPE_NONE;
- arg->pointeeAlignment_ = 0;
-
- hsailArgList_.push_back(arg);
- }
-
createSignature(params);
}
#endif // defined(WITH_LIGHTNING_COMPILER)
diff --git a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
index 747187fcc6..996a3ca9e7 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rockernel.hpp
@@ -20,6 +20,7 @@ enum ROC_ARG_TYPE
ROC_ARGTYPE_ERROR = 0,
ROC_ARGTYPE_POINTER,
ROC_ARGTYPE_VALUE,
+ ROC_ARGTYPE_REFERENCE,
ROC_ARGTYPE_IMAGE,
ROC_ARGTYPE_SAMPLER,
ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X,
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
index 2ac2b9ee00..d5a88d9513 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocprogram.cpp
@@ -36,6 +36,8 @@
#endif // !defined(WITH_LIGHTNING_COMPILER)
#include "utils/bif_section_labels.hpp"
+#include "amd_hsa_kernel_code.h"
+
#include
#include
#include
@@ -1027,11 +1029,10 @@ HSAILProgram::linkImpl_LC(amd::option::Options *options)
return false;
}
- for ( auto &kernelName : kernelNameList )
- {
+ for (auto &kernelName : kernelNameList) {
hsa_executable_symbol_t kernelSymbol;
- hsa_executable_get_symbol ( hsaExecutable_, "", kernelName.c_str(),
- hsaDevice, 0, &kernelSymbol );
+ hsa_executable_get_symbol(
+ hsaExecutable_, "", kernelName.c_str(), hsaDevice, 0, &kernelSymbol);
uint64_t kernelCodeHandle;
status = hsa_executable_symbol_get_info(
@@ -1093,6 +1094,22 @@ HSAILProgram::linkImpl_LC(amd::option::Options *options)
return false;
}
+#if 0
+ for (auto s = elf.nextSymbol(NULL); s != NULL; s = elf.nextSymbol(s)) {
+ amd::OclElf::SymbolInfo si;
+ if (!elf.getSymbolInfo(s, &si)
+ || strcmp(si.sec_name, ".text") != 0
+ || strcmp(si.sym_name, kernelName.c_str()) != 0) {
+ continue;
+ }
+ const amd_kernel_code_t* akc = (amd_kernel_code_t*)
+ ((address) out_exec->Buf().data() + (si.address - si.sec_addr));
+
+ // FIXME_lmoriche: this is where we could get the SGPRs and VGPRs
+ break;
+ }
+#endif
+
Kernel *aKernel = new roc::Kernel(
kernelName,
this,
diff --git a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
index b5473a5589..f657e24207 100644
--- a/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/projects/clr/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -1497,29 +1497,34 @@ VirtualGPU::submitKernelInternal(
// matching parameter in the OCL signature (not a valid arg->index_)
if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_X) {
size_t offset_x = sizes.dimensions() >= 1 ? sizes.offset()[0] : 0;
- argPtr = addArg(argPtr, &offset_x, sizeof(void*));
+ assert(arg->size_ == sizeof(offset_x) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_x, arg->size_, arg->alignment_);
continue;
}
else if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Y) {
size_t offset_y = sizes.dimensions() >= 2 ? sizes.offset()[1] : 0;
- argPtr = addArg(argPtr, &offset_y, sizeof(void*));
+ assert(arg->size_ == sizeof(offset_y) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_y, arg->size_, arg->alignment_);
continue;
}
else if (arg->type_ == ROC_ARGTYPE_HIDDEN_GLOBAL_OFFSET_Z) {
size_t offset_z = sizes.dimensions() == 3 ? sizes.offset()[2] : 0;
- argPtr = addArg(argPtr, &offset_z, sizeof(void*));
+ assert(arg->size_ == sizeof(offset_z) && "check the sizes");
+ argPtr = addArg(argPtr, &offset_z, arg->size_, arg->alignment_);
continue;
}
else if (arg->type_ == ROC_ARGTYPE_HIDDEN_PRINTF_BUFFER) {
address bufferPtr = printfDbg()->dbgBuffer();
- argPtr = addArg(argPtr, &bufferPtr, sizeof(void*));
+ assert(arg->size_ == sizeof(bufferPtr) && "check the sizes");
+ argPtr = addArg(argPtr, &bufferPtr, arg->size_, arg->alignment_);
continue;
}
else if (arg->type_ == ROC_ARGTYPE_HIDDEN_DEFAULT_QUEUE
|| arg->type_ == ROC_ARGTYPE_HIDDEN_COMPLETION_ACTION
|| arg->type_ == ROC_ARGTYPE_HIDDEN_NONE) {
void* zero = 0;
- argPtr = addArg(argPtr, &zero, sizeof(void*));
+ assert(arg->size_ <= sizeof(zero) && "check the sizes");
+ argPtr = addArg(argPtr, &zero, arg->size_, arg->alignment_);
continue;
}
@@ -1558,76 +1563,74 @@ VirtualGPU::submitKernelInternal(
mem->signalWrite(&dev());
}
}
- else if (arg->type_ == ROC_ARGTYPE_VALUE) {
- if (arg->dataType_ == ROC_DATATYPE_STRUCT) {
- void *mem = allocKernArg(arg->size_, arg->alignment_);
- if (mem == NULL) {
- LogError("Out of memory");
- return false;
- }
- memcpy(mem, srcArgPtr, arg->size_);
- argPtr = addArg(argPtr, &mem, sizeof(void*));
- continue;
+ else if (arg->type_ == ROC_ARGTYPE_REFERENCE) {
+ void *mem = allocKernArg(arg->size_, arg->alignment_);
+ if (mem == NULL) {
+ LogError("Out of memory");
+ return false;
}
+ memcpy(mem, srcArgPtr, arg->size_);
+ argPtr = addArg(argPtr, &mem, sizeof(void*));
+ }
+ else if (arg->type_ == ROC_ARGTYPE_VALUE) {
argPtr = addArg(argPtr, srcArgPtr, arg->size_, arg->alignment_);
- srcArgPtr += arg->size_;
}
else if (arg->type_ == ROC_ARGTYPE_IMAGE) {
- amd::Memory* mem = *reinterpret_cast(srcArgPtr);
- Image* image = static_cast(mem->getDeviceMemory(dev()));
- if (image == NULL) {
- LogError("Kernel image argument is not an image object");
- return false;
- }
+ amd::Memory* mem = *reinterpret_cast(srcArgPtr);
+ Image* image = static_cast(mem->getDeviceMemory(dev()));
+ if (image == NULL) {
+ LogError("Kernel image argument is not an image object");
+ return false;
+ }
- if (dev().settings().enableImageHandle_) {
- const uint64_t image_srd = image->getHsaImageObject().handle;
- assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
- argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
- }
- else {
- // Image arguments are of size 48 bytes and are aligned to 16 bytes
- argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
- HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
- }
+ if (dev().settings().enableImageHandle_) {
+ const uint64_t image_srd = image->getHsaImageObject().handle;
+ assert(amd::isMultipleOf(image_srd, sizeof(image_srd)));
+ argPtr = addArg(argPtr, &image_srd, sizeof(image_srd));
+ }
+ else {
+ // Image arguments are of size 48 bytes and are aligned to 16 bytes
+ argPtr = addArg(argPtr, (void *)image->getHsaImageObject().handle,
+ HSA_IMAGE_OBJECT_SIZE, HSA_IMAGE_OBJECT_ALIGNMENT);
+ }
- //! @todo Compiler has to return read/write attributes
- const cl_mem_flags flags = mem->getMemFlags();
- if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
- mem->signalWrite(&dev());
- }
+ //! @todo Compiler has to return read/write attributes
+ const cl_mem_flags flags = mem->getMemFlags();
+ if (!flags || (flags & (CL_MEM_READ_WRITE | CL_MEM_WRITE_ONLY))) {
+ mem->signalWrite(&dev());
+ }
}
else if (arg->type_ == ROC_ARGTYPE_SAMPLER) {
- amd::Sampler* sampler = *reinterpret_cast(srcArgPtr);
- if (sampler == NULL) {
- LogError("Kernel sampler argument is not an sampler object");
- return false;
- }
+ amd::Sampler* sampler = *reinterpret_cast(srcArgPtr);
+ if (sampler == NULL) {
+ LogError("Kernel sampler argument is not an sampler object");
+ return false;
+ }
- hsa_ext_sampler_descriptor_t samplerDescriptor;
- fillSampleDescriptor(samplerDescriptor, *sampler);
+ hsa_ext_sampler_descriptor_t samplerDescriptor;
+ fillSampleDescriptor(samplerDescriptor, *sampler);
- hsa_ext_sampler_t hsa_sampler;
- hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
- &samplerDescriptor, &hsa_sampler);
- if (status != HSA_STATUS_SUCCESS) {
- LogError("Error creating device sampler object!");
- return false;
- }
+ hsa_ext_sampler_t hsa_sampler;
+ hsa_status_t status = hsa_ext_sampler_create(dev().getBackendDevice(),
+ &samplerDescriptor, &hsa_sampler);
+ if (status != HSA_STATUS_SUCCESS) {
+ LogError("Error creating device sampler object!");
+ return false;
+ }
- if (dev().settings().enableImageHandle_) {
- uint64_t sampler_srd = hsa_sampler.handle;
- argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
- samplerList_.push_back(hsa_sampler);
- // TODO: destroy sampler.
- }
- else {
- argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
+ if (dev().settings().enableImageHandle_) {
+ uint64_t sampler_srd = hsa_sampler.handle;
+ argPtr = addArg(argPtr, &sampler_srd, sizeof(sampler_srd));
+ samplerList_.push_back(hsa_sampler);
+ // TODO: destroy sampler.
+ }
+ else {
+ argPtr = amd::alignUp(argPtr, HSA_SAMPLER_OBJECT_ALIGNMENT);
- memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
- argPtr += HSA_SAMPLER_OBJECT_SIZE;
- hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
- }
+ memcpy(argPtr, (void*)hsa_sampler.handle, HSA_SAMPLER_OBJECT_SIZE);
+ argPtr += HSA_SAMPLER_OBJECT_SIZE;
+ hsa_ext_sampler_destroy(dev().getBackendDevice(), hsa_sampler);
+ }
}
}