From e682abb47ef3ff5f1e52aa3bb27b7e507bc4566c Mon Sep 17 00:00:00 2001
From: foreman
Date: Mon, 15 Aug 2016 18:51:49 -0400
Subject: [PATCH] P4 to Git Change 1303140 by lmoriche@lmoriche_opencl_dev on
2016/08/15 17:04:37
SWDEV-94610 - Code provided by Wilkin - Implement the roc Program Manager to call the Lightning Compiler instead of the compiler library.
- Embed and use the pre-compiled header generated by the built-in library build
- If LLVM_BIN is not set, try to find Clang from the libamdocl path
Testing: http://ocltc.amd.com:8111/viewModification.html?modId=75068&personal=true&buildTypeId=&tab=vcsModificationBuilds&show_all_builds=true
Affected files ...
... //depot/stg/opencl/drivers/opencl/compiler/lib/loaders/elf/elf.cpp#35 edit
... //depot/stg/opencl/drivers/opencl/compiler/lib/loaders/elf/elf.hpp#24 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/blitcl.cpp#9 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#200 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/build/Makefile.oclrocm#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/roccompiler.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocdevice.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.cpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rockernel.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmetadata.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocmetadata.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.cpp#7 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocprogram.hpp#5 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/rocm/rocvirtual.cpp#8 edit
... //depot/stg/opencl/drivers/opencl/runtime/platform/program.cpp#81 edit
---
rocclr/compiler/lib/loaders/elf/elf.cpp | 8 +-
rocclr/compiler/lib/loaders/elf/elf.hpp | 21 +-
rocclr/runtime/device/blitcl.cpp | 147 +++---
rocclr/runtime/device/device.cpp | 11 +
rocclr/runtime/device/rocm/roccompiler.cpp | 140 +++++-
rocclr/runtime/device/rocm/rocdevice.cpp | 22 +-
rocclr/runtime/device/rocm/rockernel.cpp | 392 +++++++++++++++-
rocclr/runtime/device/rocm/rockernel.hpp | 11 +
rocclr/runtime/device/rocm/rocmetadata.cpp | 514 +++++++++++++++++++++
rocclr/runtime/device/rocm/rocmetadata.hpp | 193 ++++++++
rocclr/runtime/device/rocm/rocprogram.cpp | 490 ++++++++++++++++++--
rocclr/runtime/device/rocm/rocprogram.hpp | 91 +++-
rocclr/runtime/device/rocm/rocvirtual.cpp | 7 +-
rocclr/runtime/platform/program.cpp | 12 +-
14 files changed, 1924 insertions(+), 135 deletions(-)
create mode 100644 rocclr/runtime/device/rocm/rocmetadata.cpp
create mode 100644 rocclr/runtime/device/rocm/rocmetadata.hpp
diff --git a/rocclr/compiler/lib/loaders/elf/elf.cpp b/rocclr/compiler/lib/loaders/elf/elf.cpp
index 4bbd739672..3d64698ab9 100644
--- a/rocclr/compiler/lib/loaders/elf/elf.cpp
+++ b/rocclr/compiler/lib/loaders/elf/elf.cpp
@@ -97,16 +97,18 @@ namespace {
"Dwarf debug frame" },
{ OclElf::JITBINARY, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR,
"x86 JIT Binary" },
- { OclElf::CODEGEN, ".cg", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
+ { OclElf::CODEGEN, ".cg", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
"Target dependent IL" },
{ OclElf::TEXT, ".text", ELF_T_BYTE, 1, SHT_PROGBITS, SHF_ALLOC | SHF_EXECINSTR,
"Device specific ISA" },
- { OclElf::INTERNAL, ".internal", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
+ { OclElf::INTERNAL, ".internal", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
"Internal usage" },
- { OclElf::SPIR, ".spir", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
+ { OclElf::SPIR, ".spir", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
"Vendor/Device-independent LLVM IR" },
{ OclElf::SPIRV, ".spirv", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
"SPIR-V Binary" },
+ { OclElf::RUNTIME_METADATA,".AMDGPU.runtime_metadata", ELF_T_BYTE, 1, SHT_PROGBITS, 0,
+ "AMDGPU runtime metadata" },
};
// index 0 is reserved and must be there (NULL section)
diff --git a/rocclr/compiler/lib/loaders/elf/elf.hpp b/rocclr/compiler/lib/loaders/elf/elf.hpp
index 49c33fe583..162116e739 100644
--- a/rocclr/compiler/lib/loaders/elf/elf.hpp
+++ b/rocclr/compiler/lib/loaders/elf/elf.hpp
@@ -53,12 +53,12 @@ bool isCALTarget(const char* p, signed char ec);
// Symbol handle
typedef struct symbol_handle *Sym_Handle;
-class OclElf
+class OclElf
{
public:
enum {
CAL_BASE = 1001, // A number that is not dependent on libelf.h
- CPU_BASE = 2001,
+ CPU_BASE = 2001,
CPU_FEATURES_FIRST = 0, // Never generated, but keep it for simplicity.
CPU_FEATURES_LAST = 0xF // This should be consistent with cpudevice.hpp
} oclElfBase;
@@ -66,23 +66,25 @@ public:
typedef enum {
// NOTE!!! Never remove an entry or change the order.
- // All CAL targets are within [CAL_FIRST, CAL_LAST].
+ // All CAL targets are within [CAL_FIRST, CAL_LAST].
CAL_FIRST = CAL_TARGET_600 + CAL_BASE,
CAL_LAST = CAL_TARGET_LAST + CAL_BASE,
-
+
// All CPU targets are within [CPU_FIRST, CPU_LAST]
CPU_FIRST = CPU_FEATURES_FIRST + CPU_BASE,
- CPU_LAST = CPU_FEATURES_LAST + CPU_BASE,
+ CPU_LAST = CPU_FEATURES_LAST + CPU_BASE,
+
OCL_TARGETS_LAST,
} oclElfTargets;
-
+
typedef enum {
CAL_PLATFORM = 0,
CPU_PLATFORM = 1,
COMPLIB_PLATFORM = 2,
- LAST_PLATFORM = 3
- } oclElfPlatform;
-
+ LC_PLATFORM = 3,
+ LAST_PLATFORM = 4
+ } oclElfPlatform;
+
typedef enum {
LLVMIR = 0,
SOURCE,
@@ -114,6 +116,7 @@ public:
INTERNAL,
SPIR,
SPIRV,
+ RUNTIME_METADATA,
OCL_ELF_SECTIONS_LAST
} oclElfSections;
diff --git a/rocclr/runtime/device/blitcl.cpp b/rocclr/runtime/device/blitcl.cpp
index d222880ad4..78000884b7 100644
--- a/rocclr/runtime/device/blitcl.cpp
+++ b/rocclr/runtime/device/blitcl.cpp
@@ -6,24 +6,8 @@ namespace device {
#define BLIT_KERNELS(...) #__VA_ARGS__
-const char* BlitSourceCode = BLIT_KERNELS(
-
-extern void __amd_copyBufferToImage(
- __global uint*, __write_only image2d_array_t, ulong4,
- int4, int4, uint4, ulong4);
-
-extern void __amd_copyImageToBuffer(
- __read_only image2d_array_t, __global uint*, __global ushort*,
- __global uchar*, int4, ulong4, int4, uint4, ulong4);
-
-extern void __amd_copyImage(
- __read_only image2d_array_t, __write_only image2d_array_t,
- int4, int4, int4);
-
-extern void __amd_copyImage1DA(
- __read_only image2d_array_t, __write_only image2d_array_t,
- int4, int4, int4);
-
+const char* BlitSourceCode =
+BLIT_KERNELS(
extern void __amd_copyBufferRect(
__global uchar*, __global uchar*,
ulong4, ulong4, ulong4);
@@ -44,58 +28,6 @@ extern void __amd_fillBuffer(
__global uchar*, __global uint*, __constant uchar*,
uint, ulong, ulong);
-extern void __amd_fillImage(
- __write_only image2d_array_t,
- float4, int4, uint4, int4, int4, uint);
-
-
-__kernel void copyBufferToImage(
- __global uint* src,
- __write_only image2d_array_t dst,
- ulong4 srcOrigin,
- int4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
-{
- __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch);
-}
-
-__kernel void copyImageToBuffer(
- __read_only image2d_array_t src,
- __global uint* dstUInt,
- __global ushort* dstUShort,
- __global uchar* dstUChar,
- int4 srcOrigin,
- ulong4 dstOrigin,
- int4 size,
- uint4 format,
- ulong4 pitch)
-{
- __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar,
- srcOrigin, dstOrigin, size, format, pitch);
-}
-
-__kernel void copyImage(
- __read_only image2d_array_t src,
- __write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
-{
- __amd_copyImage(src, dst, srcOrigin, dstOrigin, size);
-}
-
-__kernel void copyImage1DA(
- __read_only image2d_array_t src,
- __write_only image2d_array_t dst,
- int4 srcOrigin,
- int4 dstOrigin,
- int4 size)
-{
- __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size);
-}
-
__kernel void copyBufferRect(
__global uchar* src,
__global uchar* dst,
@@ -148,6 +80,76 @@ __kernel void fillBuffer(
{
__amd_fillBuffer(bufUChar, bufUInt, pattern, patternSize, offset, size);
}
+)
+#if !defined(WITH_LIGHTNING_COMPILER)
+BLIT_KERNELS(
+extern void __amd_copyBufferToImage(
+ __global uint*, __write_only image2d_array_t, ulong4,
+ int4, int4, uint4, ulong4);
+
+extern void __amd_copyImageToBuffer(
+ __read_only image2d_array_t, __global uint*, __global ushort*,
+ __global uchar*, int4, ulong4, int4, uint4, ulong4);
+
+extern void __amd_copyImage(
+ __read_only image2d_array_t, __write_only image2d_array_t,
+ int4, int4, int4);
+
+extern void __amd_copyImage1DA(
+ __read_only image2d_array_t, __write_only image2d_array_t,
+ int4, int4, int4);
+
+extern void __amd_fillImage(
+ __write_only image2d_array_t,
+ float4, int4, uint4, int4, int4, uint);
+
+
+__kernel void copyBufferToImage(
+ __global uint* src,
+ __write_only image2d_array_t dst,
+ ulong4 srcOrigin,
+ int4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
+{
+ __amd_copyBufferToImage(src, dst, srcOrigin, dstOrigin, size, format, pitch);
+}
+
+__kernel void copyImageToBuffer(
+ __read_only image2d_array_t src,
+ __global uint* dstUInt,
+ __global ushort* dstUShort,
+ __global uchar* dstUChar,
+ int4 srcOrigin,
+ ulong4 dstOrigin,
+ int4 size,
+ uint4 format,
+ ulong4 pitch)
+{
+ __amd_copyImageToBuffer(src, dstUInt, dstUShort, dstUChar,
+ srcOrigin, dstOrigin, size, format, pitch);
+}
+
+__kernel void copyImage(
+ __read_only image2d_array_t src,
+ __write_only image2d_array_t dst,
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
+{
+ __amd_copyImage(src, dst, srcOrigin, dstOrigin, size);
+}
+
+__kernel void copyImage1DA(
+ __read_only image2d_array_t src,
+ __write_only image2d_array_t dst,
+ int4 srcOrigin,
+ int4 dstOrigin,
+ int4 size)
+{
+ __amd_copyImage1DA(src, dst, srcOrigin, dstOrigin, size);
+}
__kernel void fillImage(
__write_only image2d_array_t image,
@@ -161,7 +163,8 @@ __kernel void fillImage(
__amd_fillImage(image, patternFLOAT4, patternINT4, patternUINT4,
origin, size, type);
}
-
-);
+)
+#endif // !defined(WITH_LIGHTNING_COMPILER)
+;
} // namespace device
diff --git a/rocclr/runtime/device/device.cpp b/rocclr/runtime/device/device.cpp
index 60c1de21bc..e97b18c82b 100644
--- a/rocclr/runtime/device/device.cpp
+++ b/rocclr/runtime/device/device.cpp
@@ -140,14 +140,21 @@ Device::BlitProgram::create(amd::Device* device,
}
// Build all kernels
+#if defined(WITH_LIGHTNING_COMPILER)
+ std::string opt = "";
+#else // !defined(WITH_LIGHTNING_COMPILER)
std::string opt = "-Wf,--force_disable_spir -fno-lib-no-inline "\
"-fno-sc-keep-calls -cl-internal-kernel ";
+#endif // !defined(WITH_LIGHTNING_COMPILER)
+
if (extraOptions != NULL) {
opt += extraOptions;
}
+#if !defined(WITH_LIGHTNING_COMPILER)
if (!GPU_DUMP_BLIT_KERNELS) {
opt += " -fno-enable-dump";
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
if (CL_SUCCESS != program_->build(devices, opt.c_str(),
NULL, NULL, GPU_DUMP_BLIT_KERNELS)) {
return false;
@@ -1248,7 +1255,11 @@ Program::setBinary(char* binaryIn, size_t size)
return false;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+ if (!clBinary()->setElfIn(ELFCLASS64)) {
+#else // !defined(WITH_LIGHTNING_COMPILER)
if (!clBinary()->setElfIn(ELFCLASS32)) {
+#endif // !defined(WITH_LIGHTNING_COMPILER)
LogError("Setting input OCL binary failed");
return false;
}
diff --git a/rocclr/runtime/device/rocm/roccompiler.cpp b/rocclr/runtime/device/rocm/roccompiler.cpp
index dc84629292..327ffb5bff 100644
--- a/rocclr/runtime/device/rocm/roccompiler.cpp
+++ b/rocclr/runtime/device/rocm/roccompiler.cpp
@@ -11,7 +11,9 @@
#include "os/os.hpp"
#include "rocdevice.hpp"
#include "rocprogram.hpp"
-#if !defined(WITH_LIGHTNING_COMPILER)
+#if defined(WITH_LIGHTNING_COMPILER)
+#include "opencl-c.amdgcn.inc"
+#else // !defined(WITH_LIGHTNING_COMPILER)
#include "roccompilerlib.hpp"
#endif // !defined(WITH_LIGHTNING_COMPILER)
#include "utils/options.hpp"
@@ -30,6 +32,137 @@ static void logFunction(const char* msg, size_t size)
static int programsCount = 0;
+#if defined(WITH_LIGHTNING_COMPILER)
+bool
+HSAILProgram::compileImpl_LC(const std::string& sourceCode,
+ const std::vector& headers,
+ const char** headerIncludeNames,
+ amd::option::Options* options)
+{
+ std::vector complibOptions;
+ if (!this->compileOptions_.empty()) {
+ complibOptions.push_back(this->compileOptions_);
+ }
+
+ std::vector inputs;
+ amd::opencl_driver::Data* src = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_CL,sourceCode.c_str(),
+ sourceCode.length());
+ if (src == NULL) {
+ buildLog_ += "Error while creating data from source code";
+ return false;
+ }
+ inputs.push_back(src);
+
+ //Find the temp folder for the OS
+ std::string tempFolder = amd::Os::getEnvironment("TEMP");
+ if (tempFolder.empty()) {
+ tempFolder = amd::Os::getEnvironment("TMP");
+ if (tempFolder.empty()) {
+ tempFolder = WINDOWS_SWITCH(".","/tmp");;
+ }
+ }
+ //Iterate through each source code and dump it into tmp
+ std::fstream f;
+ std::vector headerFileNames(headers.size());
+ std::vector newDirs;
+ for (size_t i = 0; i < headers.size(); ++i) {
+ std::string headerPath = tempFolder;
+ std::string headerIncludeName(headerIncludeNames[i]);
+ // replace / in path with current os's file separator
+ if ( amd::Os::fileSeparator() != '/') {
+ for (std::string::iterator it = headerIncludeName.begin(),
+ end = headerIncludeName.end();
+ it != end;
+ ++it) {
+ if (*it == '/') *it = amd::Os::fileSeparator();
+ }
+ }
+ size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
+ if (pos != std::string::npos) {
+ headerPath += amd::Os::fileSeparator();
+ headerPath += headerIncludeName.substr(0, pos);
+ headerIncludeName = headerIncludeName.substr(pos+1);
+ }
+ if (!amd::Os::pathExists(headerPath)) {
+ bool ret = amd::Os::createPath(headerPath);
+ assert(ret && "failed creating path!");
+ newDirs.push_back(headerPath);
+ }
+ std::string headerFullName
+ = headerPath + amd::Os::fileSeparator() + headerIncludeName;
+ headerFileNames[i] = headerFullName;
+ f.open(headerFullName.c_str(), std::fstream::out);
+ //Should we allow asserts
+ assert(!f.fail() && "failed creating header file!");
+ f.write(headers[i]->c_str(), headers[i]->length());
+ f.close();
+
+ amd::opencl_driver::Data* inc = device().compiler()->NewFileReference(
+ amd::opencl_driver::DT_CL_HEADER,
+ headerFileNames[i]);
+ inputs.push_back(inc);
+ }
+
+
+ //Set the options for the compiler
+ //Set the include path for the temp folder that contains the includes
+ if(!headers.empty()) {
+ complibOptions.push_back("-I"+tempFolder);
+ }
+
+ //Add only for CL2.0 and later
+ if (options->oVariables->CLStd[2] >= '2') {
+ std::stringstream opts;
+ opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
+ << device().info().maxGlobalVariableSize_;
+
+ complibOptions.push_back(opts.str());
+ }
+
+ // Compile source to IR
+ appendHsailOptions(complibOptions);
+
+ amd::opencl_driver::File* pch = device().compiler()->NewTempFile(
+ amd::opencl_driver::DT_CL_HEADER);
+ if (pch == NULL || !pch->WriteData((const char*) opencl_c_amdgcn,
+ opencl_c_amdgcn_size)) {
+ buildLog_ += "Error while opening the opencl-c header ";
+ return false;
+ }
+
+ // FIXME_lmoriche: Force OpenCL-C 2.0, since the built-ins are built that way.
+ complibOptions.push_back("-Xclang");complibOptions.push_back("-cl-std=CL2.0");
+ complibOptions.push_back("-Xclang");complibOptions.push_back("-include-pch");
+ complibOptions.push_back("-Xclang");complibOptions.push_back(pch->Name());
+
+ amd::opencl_driver::Buffer* output = device().compiler()->NewBuffer(amd::opencl_driver::DT_LLVM_BC);
+ if (output == NULL) {
+ buildLog_ += "Error while creating buffer for the LLVM bitcode";
+ return false;
+ }
+
+ if (!device().compiler()->CompileToLLVMBitcode(inputs, output, complibOptions)) {
+ buildLog_ += "Error while compiling \
+ opencl source: Compiling CL to IR";
+#if 0
+ std::cerr << "\n**** Compiler Output After CompileToLLVMBitcode ****\n";
+ std::cerr << device().compiler()->Output().c_str();
+ std::cerr << "********************************************************\n\n";
+#endif
+ return false;
+ }
+
+ // save the source code
+ //Create Binary
+ codeObjBinary_ = new CodeObjBinary();
+
+ openCLSource_ = sourceCode;
+ codeObjBinary_->saveIR(std::string(output->Buf().begin(), output->Buf().end()));
+ return true;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
bool
HSAILProgram::compileImpl(const std::string& sourceCode,
const std::vector& headers,
@@ -37,14 +170,13 @@ HSAILProgram::compileImpl(const std::string& sourceCode,
amd::option::Options* options)
{
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
- return false;
+ return compileImpl_LC(sourceCode, headers, headerIncludeNames, options);
#else // !defined(WITH_LIGHTNING_COMPILER)
acl_error errorCode;
aclTargetInfo target;
//Defaulting to bonaire
- //Todo (sramalin) : Query the device for asic type-
+ //Todo (sramalin) : Query the device for asic type-
//Defaulting to Bonair for now.
target = g_complibApi._aclGetTargetInfo(LP64_SWITCH("hsail","hsail64"), "Bonaire",
&errorCode);
diff --git a/rocclr/runtime/device/rocm/rocdevice.cpp b/rocclr/runtime/device/rocm/rocdevice.cpp
index e850c5ca3e..2bde6e794d 100644
--- a/rocclr/runtime/device/rocm/rocdevice.cpp
+++ b/rocclr/runtime/device/rocm/rocdevice.cpp
@@ -37,6 +37,11 @@
#include
#endif // WITHOUT_HSA_BACKEND
+#if defined(ATI_OS_LINUX)
+#include
+#include
+#endif // defined(ATI_OS_LINUX)
+
#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
#ifndef WITHOUT_HSA_BACKEND
@@ -232,7 +237,20 @@ Device::~Device()
bool NullDevice::initCompiler(bool isOffline) {
#if defined(WITH_LIGHTNING_COMPILER)
if (!compilerHandle_) {
- const std::string llvmbin(amd::Os::getEnvironment("LLVM_BIN"));
+ std::string llvmbin = amd::Os::getEnvironment("LLVM_BIN");
+#if defined(ATI_OS_LINUX)
+ // FIXME_Wilkin: When no LLVM_BIN defined, use the default path
+ if (llvmbin.empty()) {
+ Dl_info info;
+ if (dladdr((const void*)&roc::NullDevice::initCompiler, &info)) {
+ llvmbin = dirname(strdup(info.dli_fname));
+ size_t pos = llvmbin.rfind("lib");
+ if (pos != std::string::npos) {
+ llvmbin.replace(pos, 3, "bin");
+ }
+ }
+ }
+#endif // defined(ATI_OS_LINUX)
compilerHandle_ = amd::opencl_driver::CompilerFactory()
.CreateAMDGPUCompiler(llvmbin);
if (!compilerHandle_) {
@@ -436,7 +454,6 @@ Device::create()
return false;
}
-#if !defined(WITH_LIGHTNING_COMPILER) // FIXME_Wilkin
blitProgram_ = new BlitProgram(context_);
// Create blit programs
if (blitProgram_ == NULL || !blitProgram_->create(this)) {
@@ -445,7 +462,6 @@ Device::create()
LogError("Couldn't create blit kernels!");
return false;
}
-#endif // !defined(WITH_LIGHTNING_COMPILER)
mapCacheOps_ = new amd::Monitor("Map Cache Lock", true);
if (NULL == mapCacheOps_) {
diff --git a/rocclr/runtime/device/rocm/rockernel.cpp b/rocclr/runtime/device/rocm/rockernel.cpp
index f6a89d0ddc..5cd2eb0529 100644
--- a/rocclr/runtime/device/rocm/rockernel.cpp
+++ b/rocclr/runtime/device/rocm/rockernel.cpp
@@ -5,6 +5,9 @@
#include "rockernel.hpp"
#include "SCHSAInterface.h"
#include "amd_hsa_kernel_code.h"
+#if defined(WITH_LIGHTNING_COMPILER)
+#include "rocmetadata.hpp"
+#endif // defined(WITH_LIGHTNING_COMPILER)
#include
@@ -12,6 +15,25 @@
namespace roc {
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static HSAIL_ARG_TYPE
+GetHSAILArgType(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ switch (lcArg->TypeKind()) {
+ case AMDGPU::RuntimeMD::KernelArg::Pointer:
+ return HSAIL_ARGTYPE_POINTER;
+ case AMDGPU::RuntimeMD::KernelArg::Value:
+ return HSAIL_ARGTYPE_VALUE;
+ case AMDGPU::RuntimeMD::KernelArg::Image:
+ return HSAIL_ARGTYPE_IMAGE;
+ case AMDGPU::RuntimeMD::KernelArg::Sampler:
+ return HSAIL_ARGTYPE_SAMPLER;
+ default:
+ return HSAIL_ARGTYPE_ERROR;
+ }
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static HSAIL_ARG_TYPE
GetHSAILArgType(const aclArgData* argInfo)
{
@@ -30,6 +52,17 @@ GetHSAILArgType(const aclArgData* argInfo)
}
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static size_t
+GetHSAILArgAlignment(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer)
+ return lcArg->Align();
+
+ return 1;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static size_t
GetHSAILArgAlignment(const aclArgData* argInfo)
{
@@ -41,6 +74,25 @@ GetHSAILArgAlignment(const aclArgData* argInfo)
}
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static HSAIL_ACCESS_TYPE
+GetHSAILArgAccessType(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
+ switch (lcArg->AccQual()) {
+ case AMDGPU::RuntimeMD::KernelArg::ReadOnly:
+ return HSAIL_ACCESS_TYPE_RO;
+ case AMDGPU::RuntimeMD::KernelArg::WriteOnly:
+ return HSAIL_ACCESS_TYPE_WO;
+ case AMDGPU::RuntimeMD::KernelArg::ReadWrite:
+ default:
+ return HSAIL_ACCESS_TYPE_RW;
+ }
+ }
+ return HSAIL_ACCESS_TYPE_NONE;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static HSAIL_ACCESS_TYPE
GetHSAILArgAccessType(const aclArgData* argInfo)
{
@@ -58,6 +110,30 @@ GetHSAILArgAccessType(const aclArgData* argInfo)
return HSAIL_ACCESS_TYPE_NONE;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static HSAIL_ADDRESS_QUALIFIER
+GetHSAILAddrQual(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
+ switch (lcArg->AddrQual()) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ return HSAIL_ADDRESS_GLOBAL;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return HSAIL_ADDRESS_LOCAL;
+ default:
+ LogError("Unsupported address type");
+ return HSAIL_ADDRESS_ERROR;
+ }
+ }
+ else if ((lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) ||
+ (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Sampler)) {
+ return HSAIL_ADDRESS_GLOBAL;
+ }
+ return HSAIL_ADDRESS_ERROR;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static HSAIL_ADDRESS_QUALIFIER
GetHSAILAddrQual(const aclArgData* argInfo)
{
@@ -84,6 +160,50 @@ GetHSAILAddrQual(const aclArgData* argInfo)
return HSAIL_ADDRESS_ERROR;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+/* f16 returns f32 - workaround due to comp lib */
+inline static HSAIL_DATA_TYPE
+GetHSAILDataType(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ aclArgDataType dataType;
+
+ if ((lcArg->TypeKind() != AMDGPU::RuntimeMD::KernelArg::Pointer) ||
+ (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Value))
+ {
+ return HSAIL_DATATYPE_ERROR;
+ }
+
+ switch (lcArg->ValueType()) {
+ case AMDGPU::RuntimeMD::KernelArg::I8:
+ return HSAIL_DATATYPE_S8;
+ case AMDGPU::RuntimeMD::KernelArg::I16:
+ return HSAIL_DATATYPE_S16;
+ case AMDGPU::RuntimeMD::KernelArg::I32:
+ return HSAIL_DATATYPE_S32;
+ case AMDGPU::RuntimeMD::KernelArg::I64:
+ return HSAIL_DATATYPE_S64;
+ case AMDGPU::RuntimeMD::KernelArg::U8:
+ return HSAIL_DATATYPE_U8;
+ case AMDGPU::RuntimeMD::KernelArg::U16:
+ return HSAIL_DATATYPE_U16;
+ case AMDGPU::RuntimeMD::KernelArg::U32:
+ return HSAIL_DATATYPE_U32;
+ case AMDGPU::RuntimeMD::KernelArg::U64:
+ return HSAIL_DATATYPE_U64;
+ case AMDGPU::RuntimeMD::KernelArg::F16:
+ return HSAIL_DATATYPE_F32;
+ case AMDGPU::RuntimeMD::KernelArg::F32:
+ return HSAIL_DATATYPE_F32;
+ case AMDGPU::RuntimeMD::KernelArg::F64:
+ return HSAIL_DATATYPE_F64;
+ case AMDGPU::RuntimeMD::KernelArg::Struct:
+ return HSAIL_DATATYPE_STRUCT;
+ default:
+ return HSAIL_DATATYPE_ERROR;
+ }
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
/* f16 returns f32 - workaround due to comp lib */
inline static HSAIL_DATA_TYPE
GetHSAILDataType(const aclArgData* argInfo)
@@ -176,6 +296,79 @@ GetHSAILArgSize(const aclArgData *argInfo)
}
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static clk_value_type_t
+GetOclType(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ static const clk_value_type_t ClkValueMapType[6][6] = {
+ { T_CHAR, T_CHAR2, T_CHAR3, T_CHAR4, T_CHAR8, T_CHAR16 },
+ { T_SHORT, T_SHORT2, T_SHORT3, T_SHORT4, T_SHORT8, T_SHORT16 },
+ { T_INT, T_INT2, T_INT3, T_INT4, T_INT8, T_INT16 },
+ { T_LONG, T_LONG2, T_LONG3, T_LONG4, T_LONG8, T_LONG16 },
+ { T_FLOAT, T_FLOAT2, T_FLOAT3, T_FLOAT4, T_FLOAT8, T_FLOAT16 },
+ { T_DOUBLE, T_DOUBLE2, T_DOUBLE3, T_DOUBLE4, T_DOUBLE8, T_DOUBLE16 },
+ };
+
+ uint sizeType;
+ uint numElements;
+ if ((lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) ||
+ (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image)) {
+ return T_POINTER;
+ }
+ else if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Value) {
+ switch (lcArg->ValueType()) {
+ case AMDGPU::RuntimeMD::KernelArg::I8:
+ case AMDGPU::RuntimeMD::KernelArg::U8:
+ sizeType = 0;
+ numElements = lcArg->Size();
+ break;
+ case AMDGPU::RuntimeMD::KernelArg::I16:
+ case AMDGPU::RuntimeMD::KernelArg::U16:
+ case AMDGPU::RuntimeMD::KernelArg::F16:
+ sizeType = 1;
+ numElements = lcArg->Size() / 2;
+ break;
+ case AMDGPU::RuntimeMD::KernelArg::I32:
+ case AMDGPU::RuntimeMD::KernelArg::U32:
+ sizeType = 2;
+ numElements = lcArg->Size() / 4;
+ break;
+ case AMDGPU::RuntimeMD::KernelArg::I64:
+ case AMDGPU::RuntimeMD::KernelArg::U64:
+ sizeType = 3;
+ numElements = lcArg->Size() / 8;
+ break;
+ case AMDGPU::RuntimeMD::KernelArg::F32:
+ sizeType = 4;
+ numElements = lcArg->Size() / 4;
+ break;
+ case AMDGPU::RuntimeMD::KernelArg::F64:
+ sizeType = 5;
+ numElements = lcArg->Size() / 8;
+ break;
+ default:
+ return T_VOID;
+ }
+
+ switch (numElements) {
+ case 1: return ClkValueMapType[sizeType][0];
+ case 2: return ClkValueMapType[sizeType][1];
+ case 3: return ClkValueMapType[sizeType][2];
+ case 4: return ClkValueMapType[sizeType][3];
+ case 8: return ClkValueMapType[sizeType][4];
+ case 16: return ClkValueMapType[sizeType][5];
+ default: return T_VOID;
+ }
+ }
+ else if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Sampler) {
+ return T_SAMPLER;
+ }
+ else {
+ return T_VOID;
+ }
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static clk_value_type_t
GetOclType(const aclArgData* argInfo)
{
@@ -238,6 +431,30 @@ GetOclType(const aclArgData* argInfo)
}
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static cl_kernel_arg_address_qualifier
+GetOclAddrQual(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
+ switch (lcArg->AddrQual()) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ return CL_KERNEL_ARG_ADDRESS_CONSTANT;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return CL_KERNEL_ARG_ADDRESS_LOCAL;
+ default:
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+ }
+ }
+ else if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) {
+ return CL_KERNEL_ARG_ADDRESS_GLOBAL;
+ }
+ //default for all other cases
+ return CL_KERNEL_ARG_ADDRESS_PRIVATE;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static cl_kernel_arg_address_qualifier
GetOclAddrQual(const aclArgData* argInfo)
{
@@ -264,6 +481,26 @@ GetOclAddrQual(const aclArgData* argInfo)
return CL_KERNEL_ARG_ADDRESS_PRIVATE;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static cl_kernel_arg_access_qualifier
+GetOclAccessQual(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Image) {
+ switch (lcArg->AccQual()) {
+ case AMDGPU::RuntimeMD::KernelArg::ReadOnly:
+ return CL_KERNEL_ARG_ACCESS_READ_ONLY;
+ case AMDGPU::RuntimeMD::KernelArg::WriteOnly:
+ return CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+ case AMDGPU::RuntimeMD::KernelArg::ReadWrite:
+ return CL_KERNEL_ARG_ACCESS_READ_WRITE;
+ default:
+ return CL_KERNEL_ARG_ACCESS_NONE;
+ }
+ }
+ return CL_KERNEL_ARG_ACCESS_NONE;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static cl_kernel_arg_access_qualifier
GetOclAccessQual(const aclArgData* argInfo)
{
@@ -282,6 +519,26 @@ GetOclAccessQual(const aclArgData* argInfo)
return CL_KERNEL_ARG_ACCESS_NONE;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+inline static cl_kernel_arg_type_qualifier
+GetOclTypeQual(const RuntimeMD::KernelArg::Metadata* lcArg)
+{
+ cl_kernel_arg_type_qualifier rv = CL_KERNEL_ARG_TYPE_NONE;
+ if (lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Pointer) {
+ if (lcArg->IsVolatile()) {
+ rv |= CL_KERNEL_ARG_TYPE_VOLATILE;
+ }
+ if (lcArg->IsRestrict()) {
+ rv |= CL_KERNEL_ARG_TYPE_RESTRICT;
+ }
+ if (lcArg->IsConst()) {
+ rv |= CL_KERNEL_ARG_TYPE_CONST;
+ }
+ }
+ return rv;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
inline static cl_kernel_arg_type_qualifier
GetOclTypeQual(const aclArgData* argInfo)
{
@@ -446,6 +703,66 @@ Kernel::initArgList(const aclArgData* aclArg)
createSignature(params);
}
+#if defined(WITH_LIGHTNING_COMPILER)
+void
+Kernel::initArgsParams( const RuntimeMD::KernelArg::Metadata* lcArg, size_t* kOffset,
+ device::Kernel::parameters_t& params, size_t* pOffset )
+{
+ HsailKernelArg* arg = new HsailKernelArg;
+
+ // Initialize HSAIL kernel argument
+ arg->name_ = lcArg->Name();
+ arg->typeName_ = lcArg->TypeName();
+ arg->size_ = lcArg->Size(); // LC doesn't distinguish vector or single element
+ arg->offset_ = *kOffset;
+ arg->type_ = GetHSAILArgType(lcArg);
+ arg->addrQual_ = GetHSAILAddrQual(lcArg);
+ arg->dataType_ = GetHSAILDataType(lcArg);
+ // If vector of args we add additional arguments to flatten it out
+ arg->numElem_ = ((lcArg->TypeKind() == AMDGPU::RuntimeMD::KernelArg::Value) &&
+ (lcArg->ValueType() != AMDGPU::RuntimeMD::KernelArg::Struct)) ?
+ (lcArg->Size() / arg->size_) : 1;
+ arg->alignment_ = GetHSAILArgAlignment(lcArg);
+ arg->access_ = GetHSAILArgAccessType(lcArg);
+
+ hsailArgList_.push_back(arg);
+
+ *kOffset += lcArg->Size();
+
+ // Initialize Device kernel parameters
+ amd::KernelParameterDescriptor desc;
+
+ desc.name_ = lcArg->Name().c_str();
+ desc.type_ = GetOclType(lcArg);
+ desc.addressQualifier_ = GetOclAddrQual(lcArg);
+ desc.accessQualifier_ = GetOclAccessQual(lcArg);
+ desc.typeQualifier_ = GetOclTypeQual(lcArg);
+ desc.typeName_ = lcArg->TypeName().c_str();
+
+ // Make a check if it is local or global
+ if (desc.addressQualifier_ == CL_KERNEL_ARG_ADDRESS_LOCAL) {
+ desc.size_ = 0;
+ }
+ else {
+ desc.size_ = lcArg->Size();
+ }
+
+ // Make offset alignment to match CPU metadata, since
+ // in multidevice config abstraction layer has a single signature
+ // and CPU sends the paramaters as they are allocated in memory
+ size_t size = desc.size_;
+ if (size == 0) {
+ // Local memory for CPU
+ size = sizeof(cl_mem);
+ }
+ *pOffset = (size_t) amd::alignUp(*pOffset, std::min(size, size_t(16)));
+ desc.offset_ = *pOffset;
+ *pOffset += amd::alignUp(size, sizeof(uint32_t));
+
+ params.push_back(desc);
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
void
Kernel::initHsailArgs(const aclArgData* aclArg)
{
@@ -492,9 +809,80 @@ Kernel::Kernel(std::string name, HSAILProgram* prog,
kernargSegmentAlignment_(kernargSegmentAlignment),
extraArgumentsNum_(extraArgsNum) {}
+#if defined(WITH_LIGHTNING_COMPILER)
+bool Kernel::init_LC(){
+ hsa_agent_t hsaDevice = program_->hsaDevice();
+
+ // Pull out metadata from the ELF
+ const CodeObjBinary* codeObj = program_->codeObjBinary();
+ const RuntimeMD::Program::Metadata* runtimeMD = codeObj->GetProgramMetadata();
+
+ if (!runtimeMD) {
+ return false;
+ }
+
+ size_t idx = runtimeMD->KernelIndexByName(name());
+ const RuntimeMD::Kernel::Metadata* kernelMD = runtimeMD->GetKernelMetadata(idx);
+
+ size_t sizeOfArgList = kernelMD->KernelArgCount();
+
+ size_t kOffset = 0;
+ size_t pOffset = 0;
+ device::Kernel::parameters_t params;
+ for (uint32_t i=0; i < sizeOfArgList; i++) {
+ const RuntimeMD::KernelArg::Metadata* kernelArg = kernelMD->GetKernelArgMetadata(i);
+ initArgsParams(kernelArg, &kOffset, params, &pOffset);
+ }
+ createSignature(params);
+
+ //Set the workgroup information for the kernel
+ memset(&workGroupInfo_, 0, sizeof(workGroupInfo_));
+ workGroupInfo_.availableLDSSize_ = program_->dev().info().localMemSizePerCU_;
+ assert(workGroupInfo_.availableLDSSize_ > 0);
+ workGroupInfo_.availableSGPRs_ = 0;
+ workGroupInfo_.availableVGPRs_ = 0;
+
+ const uint32_t* workGroupSizeHint = kernelMD->WorkgroupSizeHint();
+ size_t sizeOfWorkGroupSize = (workGroupSizeHint) ? *workGroupSizeHint : 0;
+
+ uint32_t wavefront_size = 0;
+ if (HSA_STATUS_SUCCESS !=
+ hsa_agent_get_info(
+ program_->hsaDevice(), HSA_AGENT_INFO_WAVEFRONT_SIZE,
+ &wavefront_size)) {
+ return false;
+ }
+ assert(wavefront_size > 0);
+
+ // Setting it the same as used LDS.
+ workGroupInfo_.localMemSize_ = workgroupGroupSegmentByteSize_;
+ workGroupInfo_.privateMemSize_ = workitemPrivateSegmentByteSize_;
+ workGroupInfo_.usedLDSSize_ = workgroupGroupSegmentByteSize_;
+ workGroupInfo_.preferredSizeMultiple_ = wavefront_size;
+ workGroupInfo_.usedSGPRs_ = 0;
+ workGroupInfo_.usedStackSize_ = 0;
+ workGroupInfo_.usedVGPRs_ = 0;
+ workGroupInfo_.wavefrontPerSIMD_ =
+ program_->dev().info().maxWorkItemSizes_[0] / wavefront_size;
+ workGroupInfo_.wavefrontSize_ = wavefront_size;
+ if (workGroupInfo_.compileSize_[0] != 0) {
+ workGroupInfo_.size_ =
+ workGroupInfo_.compileSize_[0] *
+ workGroupInfo_.compileSize_[1] *
+ workGroupInfo_.compileSize_[2];
+ }
+ else {
+ workGroupInfo_.size_ = program_->dev().info().maxWorkGroupSize_;
+ }
+
+ //TODO: WC - handle printf
+ return true;
+}
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
bool Kernel::init(){
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
+ return init_LC();
#else // !defined(WITH_LIGHTNING_COMPILER)
acl_error errorCode;
//compile kernel down to ISA
@@ -604,8 +992,8 @@ bool Kernel::init(){
// Set the Printf List
initPrintf(reinterpret_cast(aclPrintfList.get()));
}
-#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
+#endif // !defined(WITH_LIGHTNING_COMPILER)
}
void Kernel::initPrintf(const aclPrintfFmt* aclPrintf) {
diff --git a/rocclr/runtime/device/rocm/rockernel.hpp b/rocclr/runtime/device/rocm/rockernel.hpp
index 6a68e67aa5..73ba7cc1aa 100644
--- a/rocclr/runtime/device/rocm/rockernel.hpp
+++ b/rocclr/runtime/device/rocm/rockernel.hpp
@@ -148,6 +148,11 @@ public:
//! Initializes the metadata required for this kernel
bool init();
+#if defined(WITH_LIGHTNING_COMPILER)
+ //! Initializes the metadata required for this kernel
+ bool init_LC();
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
const HSAILProgram* program() {
return static_cast(program_);
}
@@ -172,6 +177,12 @@ private:
//! Initializes Hsail Argument metadata and info ;
void initHsailArgs(const aclArgData* aclArg);
+#if defined(WITH_LIGHTNING_COMPILER)
+ //! Initializes Hsail Argument metadata and info for LC
+ void initArgsParams( const RuntimeMD::KernelArg::Metadata* lcArg, size_t* kOffset,
+ device::Kernel::parameters_t& params, size_t* pOffset );
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
//! Initializes HSAIL Printf metadata and info
void initPrintf(const aclPrintfFmt* aclPrintf);
diff --git a/rocclr/runtime/device/rocm/rocmetadata.cpp b/rocclr/runtime/device/rocm/rocmetadata.cpp
new file mode 100644
index 0000000000..34a0c48687
--- /dev/null
+++ b/rocclr/runtime/device/rocm/rocmetadata.cpp
@@ -0,0 +1,514 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include
+#include
+
+#include "rocmetadata.hpp"
+
+namespace roc {
+namespace RuntimeMD {
+
+ template
+ bool Read(std::istream& in, T& v);
+
+ template<>
+ bool Read(std::istream& in, uint32_t& v) {
+ in.read((char *)&v, sizeof(v));
+ return (in.tellg() != (std::streampos) -1 ) && !in.eof() && !in.fail() && !in.bad();
+ }
+
+ template<>
+ bool Read(std::istream& in, uint16_t& v) {
+ in.read((char *)&v, sizeof(v));
+ return !in.eof() && !in.fail() && !in.bad();
+ }
+
+ template<>
+ bool Read(std::istream& in, uint8_t& v) {
+ in.read((char *)&v, sizeof(v));
+ return !in.eof() && !in.fail() && !in.bad();
+ }
+
+ template<>
+ bool Read(std::istream& in, std::string& v) {
+ uint32_t len;
+ if (!Read(in, len)) { return false; }
+ v.resize(len);
+ if (!in.read(&v[0], len)) { return false; }
+ return true;
+ }
+
+ template
+ bool Read3(std::istream& in, T* v) {
+ for (size_t i = 0; i < 3; ++i) {
+ if (!Read(in, v[i])) { return false; }
+ }
+ return true;
+ }
+
+ template
+ bool ReadConvert(std::istream& in, T& v) {
+ T1 v1;
+ if (!Read(in, v1)) { return false; }
+ v = static_cast(v1);
+ return true;
+ }
+
+ template<>
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::Key& v) {
+ return ReadConvert(in, v);
+ }
+
+ template<>
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::TypeKind& v) {
+ return ReadConvert(in, v);
+ }
+
+ template<>
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::ValueType& v) {
+ return ReadConvert(in, v);
+ }
+
+ template<>
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::KernelArg::AccessQualifer& v) {
+ return ReadConvert(in, v);
+ }
+
+ template<>
+ bool Read(std::istream& in, AMDGPU::RuntimeMD::Language& v) {
+ return ReadConvert(in, v);
+ }
+
+ namespace KernelArg {
+ using namespace AMDGPU::RuntimeMD::KernelArg;
+ Metadata::Metadata()
+ : size(0), align(0),
+ isConst(false), isRestrict(false), isVolatile(false), isPipe(false)
+ {}
+
+ static const char* TypeKindToString(TypeKind typeKind) {
+ switch (typeKind) {
+ case Value: return "Value";
+ case Pointer: return "Pointer";
+ case Image: return "Image";
+ case Sampler: return "Sampler";
+ case Queue: return "Queue";
+ default: return "";
+ }
+ }
+
+ static const char* ValueTypeToString(ValueType valueType) {
+ switch (valueType) {
+ case Struct: return "Struct";
+ case I8: return "I8";
+ case U8: return "U8";
+ case I16: return "I16";
+ case U16: return "U16";
+ case F16: return "F16";
+ case I32: return "I32";
+ case U32: return "U32";
+ case F32: return "F32";
+ case I64: return "I64";
+ case U64: return "U64";
+ case F64: return "F64";
+ default: return "";
+ }
+ }
+
+ static const char* AccessQualToString(AccessQualifer accessQual) {
+ switch (accessQual) {
+ case None: return "None";
+ case ReadOnly: return "ReadOnly";
+ case WriteOnly: return "WriteOnly";
+ case ReadWrite: return "ReadWrite";
+ default: return "";
+ }
+ }
+
+ bool Metadata::ReadValue(std::istream& in, AMDGPU::RuntimeMD::Key key) {
+ using namespace AMDGPU::RuntimeMD;
+
+ switch (key) {
+ case KeyArgSize: return Read(in, size);
+ case KeyArgAlign: return Read(in, align);
+ case KeyArgTypeName: return Read(in, typeName);
+ case KeyArgName: return Read(in, name);
+ case KeyArgTypeKind: return Read(in, typeKind);
+ case KeyArgValueType: return Read(in, valueType);
+ case KeyArgAddrQual: return Read(in, addrQual);
+ case KeyArgAccQual: return Read(in, accQual);
+ case KeyArgIsConst: isConst = true; return true;
+ case KeyArgIsRestrict: isRestrict = true; return true;
+ case KeyArgIsVolatile: isVolatile = true; return true;
+ case KeyArgIsPipe: isPipe = true; return true;
+ default:
+ return false;
+ }
+ }
+
+ void Metadata::Print(std::ostream& out) {
+ out
+ << "Type: " << TypeKindToString(typeKind);
+ if (typeKind == Value) {
+ out << " ValueType:" << ValueTypeToString(valueType);
+ }
+ if (isConst) { out << " Const"; }
+ if (isRestrict) { out << " Restrict"; }
+ if (isVolatile) { out << " Volatile"; }
+ if (isPipe) { out << " Pipe"; }
+
+ out
+ << " Access: " << AccessQualToString(accQual)
+ << " Address: " << addrQual
+ << " Size: " << size
+ << " Align: " << align
+ << " Type Name: " << typeName;
+ if (!name.empty()) {
+ out << " Name: " << name;
+ }
+ }
+
+ void Metadata::PrintOut() const {
+ std::cout
+ << "Type: " << TypeKindToString(typeKind) << std::endl;
+ if (typeKind == Value) {
+ std::cout << " ValueType:" << ValueTypeToString(valueType) << std::endl;
+ }
+ if (isConst) { std::cout << " Const" << std::endl; }
+ if (isRestrict) { std::cout << " Restrict" << std::endl; }
+ if (isVolatile) { std::cout << " Volatile" << std::endl; }
+ if (isPipe) { std::cout << " Pipe" << std::endl; }
+
+ std::cout
+ << " Access: " << AccessQualToString(accQual)
+ << " Address: " << addrQual
+ << " Size: " << size
+ << " Align: " << align
+ << " Type Name: " << typeName << std::endl;
+ if (!name.empty()) {
+ std::cout << " Name: " << name << std::endl;
+ }
+ }
+
+ }
+
+ namespace Kernel {
+ Metadata::Metadata()
+ : mdVersion(UINT8_MAX), mdRevision(UINT8_MAX),
+ language((AMDGPU::RuntimeMD::Language) UINT8_MAX), languageVersion(UINT16_MAX),
+ hasRequiredWorkgroupSize(false),
+ hasWorkgroupSizeHint(false),
+ hasVectorTypeHint(false),
+ hasKernelIndex(false),
+ hasSGPRs(false), hasVGPRs(false),
+ hasMinWavesPerSIMD(false), hasMaxWavesPerSIMD(false),
+ hasFlatWorkgroupSizeLimits(false),
+ hasMaxWorkgroupSize(false),
+ isNoPartialWorkgroups(false)
+ {}
+
+ void Metadata::SetCommon(uint8_t mdVersion, uint8_t mdRevision,
+ AMDGPU::RuntimeMD::Language language, uint16_t languageVersion) {
+ this->mdVersion = mdVersion;
+ this->mdRevision = mdRevision;
+ this->language = language;
+ this->languageVersion = languageVersion;
+ }
+
+ const KernelArg::Metadata* Metadata::GetKernelArgMetadata(size_t index) const {
+ assert((index < args.size()) && "kernel argument index too big");
+ return &(args[index]);
+ }
+
+ bool Metadata::ReadValue(std::istream& in, AMDGPU::RuntimeMD::Key key) {
+ using namespace AMDGPU::RuntimeMD;
+
+ KernelArg::Metadata* arg = args.empty() ? nullptr : &args.back();
+
+ switch (key) {
+ case KeyKernelName:
+ return Read(in, name);
+ case KeyArgBegin:
+ args.resize(args.size() + 1);
+ break;
+ case KeyArgEnd:
+ // Verified in Program::Metadata::Read.
+ break;
+ case KeyArgSize:
+ case KeyArgAlign:
+ case KeyArgTypeName:
+ case KeyArgName:
+ case KeyArgTypeKind:
+ case KeyArgValueType:
+ case KeyArgAddrQual:
+ case KeyArgAccQual:
+ case KeyArgIsConst:
+ case KeyArgIsRestrict:
+ case KeyArgIsVolatile:
+ case KeyArgIsPipe:
+ if (!arg) { return false; }
+ if (!arg->ReadValue(in, key)) { return false; }
+ break;
+ case KeyReqdWorkGroupSize:
+ hasRequiredWorkgroupSize = true;
+ return Read3(in, requiredWorkgroupSize);
+ case KeyWorkGroupSizeHint:
+ hasWorkgroupSizeHint = true;
+ return Read3(in, workgroupSizeHint);
+ case KeyVecTypeHint:
+ hasVectorTypeHint = true;
+ return Read(in, vectorTypeHint);
+ case KeyKernelIndex:
+ hasKernelIndex = true;
+ return Read(in, kernelIndex);
+ case KeySGPRs:
+ hasSGPRs = true;
+ return Read(in, numSgprs);
+ case KeyVGPRs:
+ hasVGPRs = true;
+ return Read(in, numVgprs);
+ case KeyMinWavesPerSIMD:
+ hasMinWavesPerSIMD = true;
+ return Read(in, minWavesPerSimd);
+ case KeyMaxWavesPerSIMD:
+ hasMaxWavesPerSIMD = true;
+ return Read(in, maxWavesPerSimd);
+ case KeyFlatWorkGroupSizeLimits:
+ hasFlatWorkgroupSizeLimits = true;
+ return
+ Read(in, minFlatWorkgroupSize) &&
+ Read(in, maxFlatWorkgroupSize);
+ case KeyMaxWorkGroupSize:
+ hasMaxWorkgroupSize = true;
+ return Read3(in, maxWorkgroupSize);
+ case KeyNoPartialWorkGroups:
+ isNoPartialWorkgroups = true;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+ static const char* LanguageToString(AMDGPU::RuntimeMD::Language language) {
+ using namespace AMDGPU::RuntimeMD;
+ switch (language) {
+ case OpenCL_C: return "OpenCL C";
+ case HCC: return "HCC";
+ case OpenMP: return "OpenMP";
+ case OpenCL_CPP: return "OpenCL C++";
+ default: return "";
+ }
+ }
+
+ void Metadata::Print(std::ostream& out) {
+ using namespace metadata_output;
+
+ out << " Kernel";
+ if (HasName()) {
+ out << " " << name;
+ }
+ out <<
+ " (" << LanguageToString(language) << ' ' << (int) languageVersion <<
+ "), metadata " << (int) mdVersion << '.' << (int) mdRevision << std::endl;
+ if (hasRequiredWorkgroupSize) {
+ out << " Required workgroup size: " << dim3(requiredWorkgroupSize) << std::endl;
+ }
+ if (hasWorkgroupSizeHint) {
+ out << " Workgroup size hint: " << dim3(workgroupSizeHint) << std::endl;
+ }
+ if (hasVectorTypeHint) {
+ out << " Vector type hint: " << vectorTypeHint << std::endl;
+ }
+ if (hasKernelIndex) {
+ out << " Kernel iIndex: " << kernelIndex << std::endl;
+ }
+ if (hasSGPRs) {
+ out << " SGPRs: " << numSgprs << std::endl;
+ }
+ if (hasVGPRs) {
+ out << " VGPRs: " << numVgprs << std::endl;
+ }
+ if (hasMinWavesPerSIMD) {
+ out << " Min waves per SIMD: " << minWavesPerSimd << std::endl;
+ }
+ if (hasMaxWavesPerSIMD) {
+ out << " Max waves per SIMD: " << maxWavesPerSimd << std::endl;
+ }
+ if (hasFlatWorkgroupSizeLimits) {
+ out << " Min flat workgroup size: " << minFlatWorkgroupSize << std::endl;
+ out << " Max flat workgroup size: " << maxFlatWorkgroupSize << std::endl;
+ }
+ if (isNoPartialWorkgroups) {
+ out << " No partial workgroups" << std::endl;
+ }
+ out << " Arguments" << std::endl;
+ for (uint32_t i = 0; i < args.size(); ++i) {
+ out << " " << i << ": ";
+ args[i].Print(out);
+ out << std::endl;
+ }
+ }
+ }
+
+ namespace Program {
+ bool Metadata::ReadFrom(std::istream& in) {
+ using namespace AMDGPU::RuntimeMD;
+ Kernel::Metadata* kernel = nullptr;
+ bool arg = false;
+ uint8_t mdVersion = UINT8_MAX, mdRevision = UINT8_MAX;
+ Language language = (Language) UINT8_MAX; uint16_t languageVersion = UINT16_MAX;
+ while (in.tellg() != (std::streampos) -1 && !in.eof()) {
+ Key key;
+ if (!Read(in, key)) {
+ if (in.eof()) { break; }
+ return false;
+ }
+ switch (key) {
+ case KeyNull: break; // Ignore
+ case KeyMDVersion:
+ if (!Read(in, mdRevision) ||
+ !Read(in, mdVersion)) {
+ return false;
+ }
+ break;
+ case KeyLanguage:
+ if (!Read(in, language)) { return false; }
+ break;
+ case KeyLanguageVersion:
+ if (!Read(in, languageVersion)) { return false; }
+ break;
+ case KeyKernelBegin:
+ if (kernel) { return false; }
+ kernels.resize(kernels.size() + 1);
+ kernel = &kernels.back();
+ kernel->SetCommon(mdVersion, mdRevision, language, languageVersion);
+ break;
+ case KeyKernelEnd:
+ if (!kernel) { return false; }
+ kernel = nullptr;
+ break;
+ case KeyArgBegin:
+ if (!kernel || arg) { return false; }
+ arg = true;
+ if (!kernel->ReadValue(in, key)) { return false; }
+ break;
+ case KeyArgEnd:
+ if (!kernel || !arg) { return false; }
+ arg = false;
+ break;
+ case KeyKernelName:
+ case KeyArgSize:
+ case KeyArgAlign:
+ case KeyArgTypeName:
+ case KeyArgName:
+ case KeyArgTypeKind:
+ case KeyArgValueType:
+ case KeyArgAddrQual:
+ case KeyArgAccQual:
+ case KeyArgIsConst:
+ case KeyArgIsRestrict:
+ case KeyArgIsVolatile:
+ case KeyArgIsPipe:
+ case KeyReqdWorkGroupSize:
+ case KeyWorkGroupSizeHint:
+ case KeyVecTypeHint:
+ case KeyKernelIndex:
+ case KeySGPRs:
+ case KeyVGPRs:
+ case KeyMinWavesPerSIMD:
+ case KeyMaxWavesPerSIMD:
+ case KeyFlatWorkGroupSizeLimits:
+ case KeyMaxWorkGroupSize:
+ case KeyNoPartialWorkGroups:
+ if (!kernel) { return false; }
+ if (!kernel->ReadValue(in, key)) { return false; }
+ break;
+ default:
+ //out << "Unsupported metadata key: " << key << std::endl;
+ return false;
+ }
+ }
+ return true;
+ }
+
+ const Kernel::Metadata* Metadata::GetKernelMetadata(size_t index) const {
+ assert(kernels.size() && "kernel metadata not found");
+ assert((index < kernels.size()) && "kernel index too big");
+
+ return &(kernels[index]);
+ }
+
+ size_t Metadata::KernelIndexByName(const std::string& name) const {
+ assert(kernels.size() && "kernel metadata not found");
+
+ size_t idx = 0;
+ for (auto kernel : kernels) {
+ if (kernel.Name().compare(name) == 0) { return idx; }
+ idx++;
+ }
+ return kernels.max_size();
+ }
+
+ bool Metadata::ReadFrom(const void* buffer, size_t size) {
+ std::istringstream is(std::string(static_cast(buffer), size));
+ if (!ReadFrom(is)) { return false; }
+ return true;
+ }
+
+ void Metadata::Print(std::ostream& out) {
+ out << "roc runtime metadata (" << kernels.size() << " kernels):" << std::endl;
+ for (Kernel::Metadata& kernel : kernels) {
+ kernel.Print(out);
+ }
+ }
+ }
+
+ namespace metadata_output {
+ std::ostream& operator<<(std::ostream& out, const dim3& d) {
+ out << "(" << d.data[0] << ", " << d.data[1] << ", " << d.data[2] << ")";
+ return out;
+ }
+ }
+
+}
+}
diff --git a/rocclr/runtime/device/rocm/rocmetadata.hpp b/rocclr/runtime/device/rocm/rocmetadata.hpp
new file mode 100644
index 0000000000..d34ad61a87
--- /dev/null
+++ b/rocclr/runtime/device/rocm/rocmetadata.hpp
@@ -0,0 +1,193 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// The University of Illinois/NCSA
+// Open Source License (NCSA)
+//
+// Copyright (c) 2014-2016, Advanced Micro Devices, Inc. All rights reserved.
+//
+// Developed by:
+//
+// AMD Research and AMD HSA Software Development
+//
+// Advanced Micro Devices, Inc.
+//
+// www.amd.com
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal with the Software without restriction, including without limitation
+// the rights to use, copy, modify, merge, publish, distribute, sublicense,
+// and/or sell copies of the Software, and to permit persons to whom the
+// Software is furnished to do so, subject to the following conditions:
+//
+// - Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimers.
+// - Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimers in
+// the documentation and/or other materials provided with the distribution.
+// - Neither the names of Advanced Micro Devices, Inc,
+// nor the names of its contributors may be used to endorse or promote
+// products derived from this Software without specific prior written
+// permission.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS WITH THE SOFTWARE.
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef ROC_METADATA_HPP_
+#define ROC_METADATA_HPP_
+
+#include
+#include
+#include
+#include
+#include
+
+#undef None
+#include "AMDGPU.h"
+#include "AMDGPURuntimeMetadata.h"
+
+namespace roc {
+namespace RuntimeMD {
+
+ namespace KernelArg {
+ class Metadata {
+ private:
+ uint32_t size;
+ uint32_t align;
+ std::string typeName;
+ std::string name;
+ AMDGPU::RuntimeMD::KernelArg::TypeKind typeKind;
+ AMDGPU::RuntimeMD::KernelArg::ValueType valueType;
+ uint8_t addrQual;
+ AMDGPU::RuntimeMD::KernelArg::AccessQualifer accQual;
+ bool isConst, isRestrict, isVolatile, isPipe;
+
+ public:
+ Metadata();
+ uint32_t Size() const { return size; }
+ uint32_t Align() const { return align; }
+ std::string TypeName() const { return typeName; }
+ std::string Name() const { return name; }
+ AMDGPU::RuntimeMD::KernelArg::TypeKind TypeKind() const { return typeKind; }
+ AMDGPU::RuntimeMD::KernelArg::ValueType ValueType() const { return valueType; }
+ uint8_t AddrQual() const { return addrQual; }
+ AMDGPU::RuntimeMD::KernelArg::AccessQualifer AccQual() const { return accQual; }
+ bool IsConst() const { return isConst; }
+ bool IsRestrict() const { return isRestrict; }
+ bool IsVolatile() const { return isVolatile; }
+ bool IsPipe() const { return isPipe; }
+
+ bool ReadValue(std::istream& in, AMDGPU::RuntimeMD::Key key);
+ void Print(std::ostream& out);
+ void PrintOut() const;
+ };
+ }
+
+ namespace Kernel {
+ class Metadata {
+ private:
+ uint8_t mdVersion, mdRevision;
+ AMDGPU::RuntimeMD::Language language;
+ uint16_t languageVersion;
+ std::vector args;
+
+ unsigned hasName : 1;
+ unsigned hasRequiredWorkgroupSize : 1;
+ unsigned hasWorkgroupSizeHint : 1;
+ unsigned hasVectorTypeHint : 1;
+ unsigned hasKernelIndex : 1;
+ unsigned hasSGPRs : 1, hasVGPRs : 1;
+ unsigned hasMinWavesPerSIMD : 1, hasMaxWavesPerSIMD : 1;
+ unsigned hasFlatWorkgroupSizeLimits : 1;
+ unsigned hasMaxWorkgroupSize : 1;
+ unsigned isNoPartialWorkgroups : 1;
+
+ std::string name;
+ uint32_t requiredWorkgroupSize[3];
+ uint32_t workgroupSizeHint[3];
+ std::string vectorTypeHint;
+
+ uint32_t kernelIndex;
+ uint32_t numSgprs, numVgprs;
+ uint32_t minWavesPerSimd, maxWavesPerSimd;
+ uint32_t minFlatWorkgroupSize, maxFlatWorkgroupSize;
+ uint32_t maxWorkgroupSize[3];
+
+ public:
+ Metadata();
+
+ bool HasName() const { return hasName; }
+ bool HasRequiredWorkgroupSize() const { return hasRequiredWorkgroupSize; }
+ bool HasWorkgroupSizeHint() const { return hasWorkgroupSizeHint; }
+ bool HasVecTypeHint() const { return hasVectorTypeHint; }
+ bool HasKernelIndex() const { return hasKernelIndex; }
+ bool HasSGPRs() const { return hasSGPRs; }
+ bool HasVGPRs() const { return hasVGPRs; }
+ bool HasMinWavesPerSIMD() const { return hasMinWavesPerSIMD; }
+ bool HasMaxWavesPerSIMD() const { return hasMaxWavesPerSIMD; }
+ bool HasFlatWorkgroupSizeLimits() const { return hasFlatWorkgroupSizeLimits; }
+ bool HasMaxWorkgroupSize() const { return hasMaxWorkgroupSize; }
+
+ size_t KernelArgCount() const { return args.size(); }
+ const KernelArg::Metadata* GetKernelArgMetadata(size_t index) const;
+
+ const std::string& Name() const { return name; }
+ const uint32_t* RequiredWorkgroupSize() const { return hasRequiredWorkgroupSize ? requiredWorkgroupSize : nullptr; }
+ const uint32_t* WorkgroupSizeHint() const { return hasWorkgroupSizeHint ? workgroupSizeHint : nullptr; }
+ std::string VecTypeHint() const { return vectorTypeHint; }
+ uint32_t KernelIndex() const { return hasKernelIndex ? kernelIndex : UINT32_MAX; }
+ uint32_t SGPRS() const { return hasSGPRs ? numSgprs : UINT32_MAX; }
+ uint32_t VGPRS() const { return hasVGPRs ? numVgprs : UINT32_MAX; }
+ uint32_t MinWavesPerSIMD() const { return hasMinWavesPerSIMD ? minWavesPerSimd : UINT32_MAX; }
+ uint32_t MaxWavesPerSIMD() const { return hasMaxWavesPerSIMD ? maxWavesPerSimd : UINT32_MAX; }
+ uint32_t MinFlatWorkgroupSize() const { return hasFlatWorkgroupSizeLimits ? minFlatWorkgroupSize : UINT32_MAX; }
+ uint32_t MaxFlatWorkgroupSize() const { return hasFlatWorkgroupSizeLimits ? maxFlatWorkgroupSize : UINT32_MAX; }
+ const uint32_t* MaxWorkgroupSize() const { return hasMaxWorkgroupSize ? maxWorkgroupSize : 0; }
+ bool IsNoPartialWorkgroups() const { return isNoPartialWorkgroups; }
+
+ void SetCommon(uint8_t mdVersion, uint8_t mdRevision, AMDGPU::RuntimeMD::Language language, uint16_t languageVersion);
+ bool ReadValue(std::istream& in, AMDGPU::RuntimeMD::Key key);
+ void Print(std::ostream& out);
+ };
+ }
+
+ namespace Program {
+ class Metadata {
+ private:
+ uint16_t version;
+ std::vector kernels;
+
+ public:
+ size_t KernelCount() const { return kernels.size(); }
+ const Kernel::Metadata* GetKernelMetadata(size_t index) const;
+ size_t KernelIndexByName(const std::string& name) const;
+
+ bool ReadFrom(std::istream& in);
+ bool ReadFrom(const void* buffer, size_t size);
+ void Print(std::ostream& out);
+ };
+ }
+
+ namespace metadata_output {
+
+ struct dim3 {
+ uint32_t* data;
+
+ dim3(uint32_t* data_)
+ : data(data_) {}
+ };
+
+ std::ostream& operator<<(std::ostream& out, const dim3& d);
+ }
+
+}
+}
+
+#endif // ROC_METADATA_HPP_
diff --git a/rocclr/runtime/device/rocm/rocprogram.cpp b/rocclr/runtime/device/rocm/rocprogram.cpp
index de90dfe0c0..d10b5f0674 100644
--- a/rocclr/runtime/device/rocm/rocprogram.cpp
+++ b/rocclr/runtime/device/rocm/rocprogram.cpp
@@ -12,6 +12,26 @@
#include "rockernel.hpp"
#if defined(WITH_LIGHTNING_COMPILER)
#include "driver/AmdCompiler.h"
+#include "opencl-c.amdgcn.inc"
+#include "builtins-irif.amdgcn.inc"
+#include "builtins-ockl.amdgcn.inc"
+#include "builtins-ocml.amdgcn.inc"
+#include "builtins-opencl.amdgcn.inc"
+#include "correctly_rounded_sqrt_off.amdgcn.inc"
+#include "correctly_rounded_sqrt_on.amdgcn.inc"
+#include "daz_opt_off.amdgcn.inc"
+#include "daz_opt_on.amdgcn.inc"
+#include "finite_only_off.amdgcn.inc"
+#include "finite_only_on.amdgcn.inc"
+#include "isa_version_701.amdgcn.inc"
+#include "isa_version_800.amdgcn.inc"
+#include "isa_version_801.amdgcn.inc"
+#include "isa_version_802.amdgcn.inc"
+#include "isa_version_803.amdgcn.inc"
+#include "isa_version_804.amdgcn.inc"
+#include "isa_version_810.amdgcn.inc"
+#include "unsafe_math_off.amdgcn.inc"
+#include "unsafe_math_on.amdgcn.inc"
#else // !defined(WITH_LIGHTNING_COMPILER)
#include "roccompilerlib.hpp"
#endif // !defined(WITH_LIGHTNING_COMPILER)
@@ -30,6 +50,34 @@
namespace roc {
#ifndef WITHOUT_HSA_BACKEND
+
+#if defined(WITH_LIGHTNING_COMPILER)
+ static hsa_status_t GetKernelNamesCallback(
+ hsa_executable_t exec,
+ hsa_executable_symbol_t symbol,
+ void *data ) {
+ std::vector* symNameList = (reinterpret_cast *>(data));
+
+ hsa_symbol_kind_t sym_type;
+ hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &sym_type);
+
+ if (sym_type == HSA_SYMBOL_KIND_KERNEL) {
+ uint32_t len;
+ hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &len);
+
+ char* symName = (char*) malloc(len);
+ hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, symName);
+
+ std::string kernelName(symName,len);
+ symNameList->push_back(kernelName);
+
+ free(symName);
+ }
+
+ return HSA_STATUS_SUCCESS;
+ }
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
/* Temporary log function for the compiler library */
static void logFunction(const char *msg, size_t size) {
std::cout << "Compiler Library log :" << msg << std::endl;
@@ -40,13 +88,15 @@ namespace roc {
// Free the elf binary
if (binaryElf_ != NULL) {
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
+ if (lcBinaryElf_) {
+ free(lcBinaryElf_);
+ }
#else // !defined(WITH_LIGHTNING_COMPILER)
error = g_complibApi._aclBinaryFini(binaryElf_);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (error != ACL_SUCCESS) {
LogWarning( "Error while destroying the acl binary \n" );
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
}
// Destroy the executable.
if (hsaExecutable_.handle != 0) {
@@ -84,6 +134,14 @@ namespace roc {
hsaProgramHandle_.handle = 0;
hsaProgramCodeObject_.handle = 0;
hsaExecutable_.handle = 0;
+
+#if defined(WITH_LIGHTNING_COMPILER)
+ lcProgramCodeObject_.handle = 0;
+ lcExecutable_.handle = 0;
+ codeObjBinary_ = NULL;
+ lcBinaryElf_ = NULL;
+ lcBinaryElfSize_ = 0;
+#endif // defined(WITH_LIGHTNING_COMPILER)
}
bool HSAILProgram::initClBinary(char *binaryIn, size_t size) { // Save the
@@ -140,7 +198,11 @@ namespace roc {
outFileName = options->getDumpFileName(".bin");
}
+#if defined(WITH_LIGHTNING_COMPILER)
+ bool useELF64 = true;
+#else // !defined(WITH_LIGHTNING_COMPILER)
bool useELF64 = getCompilerOptions()->oVariables->EnableGpuElf64;
+#endif // !defined(WITH_LIGHTNING_COMPILER)
if (!clBinary()->setElfOut(useELF64 ? ELFCLASS64 : ELFCLASS32,
(outFileName.size() >
0) ? outFileName.c_str() : NULL)) {
@@ -203,7 +265,7 @@ namespace roc {
// Checking llvmir in .llvmir section
bool containsLlvmirText = true;
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
+ // TODO:FIXME_Wilkin - Query
bool containsOpts = false;
bool containsHsailText = false;
bool containsBrig = false;
@@ -318,16 +380,14 @@ namespace roc {
void *mem = const_cast(binary.first);
acl_error errorCode;
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
- errorCode = ACL_ERROR;
+ // TODO: FIXME_Wilkin
#else // !defined(WITH_LIGHTNING_COMPILER)
binaryElf_ = g_complibApi._aclReadFromMem(mem, binary.second, &errorCode);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error while BRIG Codegen phase: aclReadFromMem failure \n" ;
- LogWarning("aclReadFromMem failed");
return continueCompileFrom;
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
// Calculate the next stage to compile from, based on sections in binaryElf_;
// No any validity checks here
std::vector completeStages;
@@ -359,11 +419,11 @@ namespace roc {
#else // !defined(WITH_LIGHTNING_COMPILER)
const void *opts = g_complibApi._aclExtractSymbol(device().compiler(),
binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (errorCode != ACL_SUCCESS) {
recompile = true;
break;
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
std::string sBinOptions = std::string((char*)opts, symSize);
std::string sCurOptions = compileOptions_ + linkOptions_;
amd::option::Options curOptions, binOptions;
@@ -404,7 +464,8 @@ namespace roc {
void *rawBinary = NULL;
size_t size = 0;
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
+ rawBinary = codeObjBinary_->Binary();
+ size = codeObjBinary_->BinarySize();
#else // !defined(WITH_LIGHTNING_COMPILER)
if (g_complibApi._aclWriteToMem(binaryElf_, &rawBinary, &size)
!= ACL_SUCCESS) {
@@ -415,8 +476,10 @@ namespace roc {
clBinary()->saveBIFBinary((char*)rawBinary, size);
//Set the type of binary
setType(type);
+#if !defined(WITH_LIGHTNING_COMPILER)
//Free memory containing rawBinary
binaryElf_->binOpts.dealloc(rawBinary);
+#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
}
@@ -517,15 +580,13 @@ namespace roc {
}
bool HSAILProgram::initBrigModule() {
+#if defined(WITH_LIGHTNING_COMPILER)
+ brigModule_ = NULL;
+#else // !defined(WITH_LIGHTNING_COMPILER)
const char *symbol_name = "__BRIG__";
- BrigModuleHeader* brig;
+ BrigModuleHeader* brig;
acl_error error_code;
size_t size;
-#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
- const void* symbol_data = NULL;
- error_code = ACL_ERROR;
-#else // !defined(WITH_LIGHTNING_COMPILER)
const void* symbol_data = g_complibApi._aclExtractSymbol(
device().compiler(),
binaryElf_,
@@ -533,7 +594,6 @@ namespace roc {
aclBRIG,
symbol_name,
&error_code);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (error_code != ACL_SUCCESS) {
std::string error = "Could not find Brig in BIF: ";
error += symbol_name;
@@ -544,6 +604,7 @@ namespace roc {
brig = (BrigModuleHeader*)malloc(size);
memcpy(brig, symbol_data, size);
brigModule_ = brig;
+#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
}
void HSAILProgram::destroyBrigModule() {
@@ -552,6 +613,9 @@ namespace roc {
}
}
bool HSAILProgram::initBrigContainer() {
+#if defined(WITH_LIGHTNING_COMPILER)
+ hsaBrigContainer_ = NULL;
+#else // !defined(WITH_LIGHTNING_COMPILER)
assert(brigModule_ != NULL);
//Create a BRIG container
@@ -559,6 +623,7 @@ namespace roc {
if (!hsaBrigContainer_) {
return false;
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
}
@@ -566,7 +631,7 @@ namespace roc {
delete (hsaBrigContainer_);
}
-
+
void HSAILProgram::hsaError(const char *msg, hsa_status_t status) {
std::string fmsg;
fmsg += msg;
@@ -582,12 +647,284 @@ namespace roc {
buildLog_ += fmsg;
}
+#if defined(WITH_LIGHTNING_COMPILER)
+ bool HSAILProgram::linkImpl_LC(amd::option::Options *options) {
+ // call LinkLLVMBitcode
+ std::vector inputs;
+
+ amd::opencl_driver::Data* opencl_bc = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_LLVM_BC,
+ (const char*) builtins_opencl_amdgcn,
+ builtins_opencl_amdgcn_size);
+ if (opencl_bc == NULL) {
+ buildLog_ += "Error while open opencl library bitcode ";
+ return false;
+ }
+
+ amd::opencl_driver::Data* ocml_bc = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_LLVM_BC,
+ (const char*) builtins_ocml_amdgcn,
+ builtins_ocml_amdgcn_size);
+ if (ocml_bc == NULL) {
+ buildLog_ += "Error while open ocml library bitcode ";
+ return false;
+ }
+ amd::opencl_driver::Data* ockl_bc = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_LLVM_BC,
+ (const char*) builtins_ockl_amdgcn,
+ builtins_ockl_amdgcn_size);
+ if (ockl_bc == NULL) {
+ buildLog_ += "Error while open ockl library bitcode ";
+ return false;
+ }
+
+ amd::opencl_driver::Data* irif_bc = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_LLVM_BC,
+ (const char*) builtins_irif_amdgcn,
+ builtins_irif_amdgcn_size);
+ if (irif_bc == NULL) {
+ buildLog_ += "Error while open irif (llvm) library bitcode ";
+ return false;
+ }
+
+ const std::string llvmIR = codeObjBinary_->getLlvmIR();
+ amd::opencl_driver::Data* llvm_src = device().compiler()->NewBufferReference(
+ amd::opencl_driver::DT_LLVM_BC,
+ llvmIR.c_str(),
+ llvmIR.length());
+ if (llvm_src == NULL) {
+ buildLog_ += "Error while creating data from LLVM bitcode";
+ return false;
+ }
+
+ inputs.push_back(llvm_src);
+ inputs.push_back(opencl_bc);
+ inputs.push_back(ocml_bc);
+ inputs.push_back(ockl_bc);
+ inputs.push_back(irif_bc);
+
+ std::vector linkOptions;
+ amd::opencl_driver::Data* linked_bc = device().compiler()->NewBuffer(
+ amd::opencl_driver::DT_LLVM_BC);
+ if (!device().compiler()->LinkLLVMBitcode(inputs, linked_bc, linkOptions)) {
+ buildLog_ += "Error while linking source & LLVM library: linking source & IR library";
+#if 0
+ std::cerr << "\n**** Compiler Output After LinkLLVMBitcode ****\n";
+ std::cerr << device().compiler()->Output().c_str();
+ std::cerr << "***********************************************\n\n";
+#endif
+ return false;
+ }
+
+
+ // convert option string into vector here as clang treats option
+ // with leading space as file name
+ std::vector complibOptions;
+ if (!options->origOptionStr.empty())
+ {
+ std::istringstream buf(options->origOptionStr);
+ std::istream_iterator beg(buf), end;
+ std::vector origOptions(beg, end);
+
+ complibOptions.insert( complibOptions.end(),
+ origOptions.begin(),
+ origOptions.end());
+ }
+
+ appendHsailOptions(complibOptions);
+ complibOptions.push_back("-mcpu=fiji");
+
+ inputs.clear();
+ inputs.push_back(linked_bc);
+
+ amd::opencl_driver::Buffer* out_exec = device().compiler()->NewBuffer(
+ amd::opencl_driver::DT_EXECUTABLE);
+ if (out_exec == NULL) {
+ buildLog_ += "Error while creating output file for the executable";
+ return false;
+ }
+
+ if (!device().compiler()->CompileAndLinkExecutable(inputs,
+ (amd::opencl_driver::Data*) out_exec,
+ complibOptions))
+ {
+ buildLog_ += "Error while creating executable: Compiling LLVM IRs to exe";
+#if 0
+ std::cerr << "\n**** Compiler Output After CompileAndLinkExecutable ****\n";
+ std::cerr << device().compiler()->Output().c_str();
+ std::cerr << "********************************************************\n\n";
+#endif
+ return false;
+ }
+
+
+ // allocate memory and store the ELF code object
+ lcBinaryElfSize_ = out_exec->Size();
+// lcBinaryElf_ = (void *)malloc(lcBinaryElfSize_);
+// memcpy(lcBinaryElf_, (void *) out_exec->Buf().data(), lcBinaryElfSize_);
+
+ hsa_status_t status;
+ status = hsa_code_object_deserialize( out_exec->Buf().data(),
+ out_exec->Size(),
+ NULL, &lcProgramCodeObject_ );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to deserialize code object from a buffer.");
+ return false;
+ }
+
+ status = hsa_executable_create( HSA_PROFILE_FULL,
+ HSA_EXECUTABLE_STATE_UNFROZEN,
+ NULL, &lcExecutable_ );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to create executable", status);
+ return false;
+ }
+
+ // Load the code object.
+ hsa_agent_t hsaDevice = dev().getBackendDevice();
+ status = hsa_executable_load_code_object( lcExecutable_, hsaDevice,
+ lcProgramCodeObject_, NULL );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to load code object", status);
+ return false;
+ }
+
+ // Freeze the executable.
+ status = hsa_executable_freeze( lcExecutable_, NULL );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to freeze executable");
+ return false;
+ }
+
+ //TODO: WC - use the proper target code based on the agent
+ std::string target = "AMD:AMDGPU:8:0:3";
+ codeObjBinary_->init( target, out_exec->Buf().data(), out_exec->Size());
+ saveBinaryAndSetType(TYPE_EXECUTABLE);
+
+ buildLog_ += device().compiler()->Output();
+
+ // Get the list of kernels
+ std::vector kernelNameList;
+ status = hsa_executable_iterate_symbols( lcExecutable_, GetKernelNamesCallback,
+ (void *) &kernelNameList );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get kernel names");
+ return false;
+ }
+
+ for ( auto &kernelName : kernelNameList )
+ {
+ hsa_executable_symbol_t kernelSymbol;
+ hsa_executable_get_symbol ( lcExecutable_, "", kernelName.c_str(),
+ hsaDevice, 0, &kernelSymbol );
+
+ uint64_t kernelCodeHandle;
+ status = hsa_executable_symbol_get_info(
+ kernelSymbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT,
+ &kernelCodeHandle);
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get the kernel code", status);
+ return false;
+ }
+
+ uint32_t workgroupGroupSegmentByteSize;
+ status = hsa_executable_symbol_get_info(
+ kernelSymbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
+ &workgroupGroupSegmentByteSize);
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get group segment size info", status);
+ return false;
+ }
+
+ uint32_t workitemPrivateSegmentByteSize;
+ status = hsa_executable_symbol_get_info(
+ kernelSymbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
+ &workitemPrivateSegmentByteSize);
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get private segment size info", status);
+ return false;
+ }
+
+ uint32_t kernargSegmentByteSize;
+ status = hsa_executable_symbol_get_info(
+ kernelSymbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
+ &kernargSegmentByteSize);
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get kernarg segment size info", status);
+ return false;
+ }
+
+ uint32_t kernargSegmentAlignment;
+ status = hsa_executable_symbol_get_info(
+ kernelSymbol,
+ HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
+ &kernargSegmentAlignment);
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to get kernarg segment alignment info", status);
+ return false;
+ }
+
+ // for OpenCL default hidden kernel arguments assuming there is no printf
+ size_t numHiddenKernelArgs = 0; // FIXME_lmoriche:3;
+
+ // Fix the kernel name issue that causes string comparison does not work
+ // due to an extra character at the end
+ // TODO: find out the root cause
+ kernelName.resize(kernelName.length()-1);
+
+ Kernel *aKernel = new roc::Kernel(
+ kernelName,
+ this,
+ kernelCodeHandle,
+ workgroupGroupSegmentByteSize,
+ workitemPrivateSegmentByteSize,
+ // TODO: remove the workaround
+ // add 24 bytes for global offsets as workaround for LC reporting
+ // excluded the hidden arguments
+ kernargSegmentByteSize /* FIXME_lmoriche:+24*/,
+ kernargSegmentAlignment,
+ numHiddenKernelArgs
+ );
+ if (!aKernel->init()) {
+ return false;
+ }
+ aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
+ kernels()[kernelName] = aKernel;
+ }
+
+#if 0
+ // Cleaning up
+ status = hsa_code_object_destroy( lcProgramCodeObject_ );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to destory the code object", status);
+ return false;
+ }
+
+ status = hsa_executable_destroy( lcExecutable_ );
+ if (status != HSA_STATUS_SUCCESS) {
+ hsaError("Failed to destory the executable", status);
+ return false;
+ }
+#endif
+ return true;
+ }
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
bool HSAILProgram::linkImpl(amd::option::Options *options) {
acl_error errorCode;
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
bool finalize = true;
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
- if (!binaryElf_) {
+#if defined(WITH_LIGHTNING_COMPILER)
+ if (!codeObjBinary_)
+#else // !defined(WITH_LIGHTNING_COMPILER)
+ if (!binaryElf_)
+#endif // !defined(WITH_LIGHTNING_COMPILER)
+ {
continueCompileFrom = getNextCompilationStageFromBinary(options);
}
switch (continueCompileFrom) {
@@ -603,19 +940,20 @@ namespace roc {
// Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
// 1. if the program is created with binary and contains only hsail text
case ACL_TYPE_HSAIL_TEXT: {
- std::string curOptions = options->origOptionStr + hsailOptions();
#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
- errorCode = ACL_ERROR;
+ if (!linkImpl_LC(options)) {
+ return false;
+ }
#else // !defined(WITH_LIGHTNING_COMPILER)
+ std::string curOptions = options->origOptionStr + hsailOptions();
errorCode = g_complibApi._aclCompile(device().compiler(), binaryElf_,
curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, logFunction);
buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler());
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error while BRIG Codegen phase: compilation error \n" ;
return false;
}
+#endif // !defined(WITH_LIGHTNING_COMPILER)
break;
}
case ACL_TYPE_CG:
@@ -633,6 +971,7 @@ namespace roc {
return true;
}
+#if !defined(WITH_LIGHTNING_COMPILER)
hsa_agent_t hsaDevice = dev().getBackendDevice();
if (!initBrigModule()) {
hsaError("Failed to create Brig Module");
@@ -767,14 +1106,10 @@ namespace roc {
kernelName = kernelName.substr(0,kernelName.size() - strlen("_kernel"));
aclMetadata md;
md.numHiddenKernelArgs = 0;
+
size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
-#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
- errorCode = ACL_ERROR;
-#else // !defined(WITH_LIGHTNING_COMPILER)
errorCode = g_complibApi._aclQueryInfo(device().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
-#endif // !defined(WITH_LIGHTNING_COMPILER)
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error while Finalization phase: Kernel extra arguments count querying from the ELF failed\n";
return false;
@@ -838,9 +1173,6 @@ namespace roc {
}
}
saveBinaryAndSetType(TYPE_EXECUTABLE);
-#if defined(WITH_LIGHTNING_COMPILER)
- assert(!"FIXME_Wilkin");
-#else // !defined(WITH_LIGHTNING_COMPILER)
buildLog_ += g_complibApi._aclGetCompilerLog(device().compiler());
#endif // !defined(WITH_LIGHTNING_COMPILER)
return true;
@@ -876,18 +1208,20 @@ namespace roc {
hsailOptions.append(" -DFP_FAST_FMA=1");
//TODO: this is a quick fix to restore original f32 denorm flushing
//Make this target/option dependent
+#if !defined(WITH_LIGHTNING_COMPILER) // TODO: WC
hsailOptions.append(" -cl-denorms-are-zero");
+#endif // !defined(WITH_LIGHTNING_COMPILER)
//TODO(sramalin) : Query the device for opencl version
// and only set if -cl-std wasn't specified in
// original build options (app)
//hsailOptions.append(" -cl-std=CL1.2");
//check if the host is 64 bit or 32 bit
LP64_ONLY(hsailOptions.append(" -m64"));
- //Now append each extension supported by the device
+ //Now append each extension supported by the device
// one by one
std::string token;
std::istringstream iss("");
- iss.str(device().info().extensions_);
+ iss.str(device().info().extensions_);
while (getline(iss, token, ' ')) {
if (!token.empty()) {
hsailOptions.append(" -D");
@@ -898,6 +1232,94 @@ namespace roc {
return hsailOptions;
}
-#endif // WITHOUT_HSA_BACKEND
-} // namespace hsa
+#if defined(WITH_LIGHTNING_COMPILER)
+ void HSAILProgram::appendHsailOptions(std::vector& options)
+ {
+ //TODO: this is a quick fix to restore original f32 denorm flushing
+ //Make this target/option dependent
+ options.push_back("-Xclang");
+ options.push_back("-cl-denorms-are-zero");
+#if 0
+ // option to debug metadata section
+
+ options.push_back("-Xclang");
+ options.push_back("-backend-option");
+
+ options.push_back("-Xclang");
+ options.push_back("-print-after-all");
+#endif
+ //options.push_back("-mcpu=fiji");
+ //options.push_back("-include"); options.push_back("opencl-c.h");
+
+ //Set options for the standard device specific options
+ //This is just for legacy compiler code
+ // All our devices support these options now
+ options.push_back("-DFP_FAST_FMAF=1");
+ options.push_back("-DFP_FAST_FMA=1");
+
+ //TODO(sramalin) : Query the device for opencl version
+ // and only set if -cl-std wasn't specified in
+ // original build options (app)
+ //options.push_back(" -cl-std=CL1.2");
+
+ //check if the host is 64 bit or 32 bit
+ LP64_ONLY(options.push_back("-m64"));
+
+ //Now append each extension supported by the device
+ // one by one
+ std::string token;
+ std::istringstream iss("");
+ iss.str(device().info().extensions_);
+ while (getline(iss, token, ' ')) {
+ if (!token.empty()) {
+ options.push_back("-D"+token+"=1");
+ }
+ }
+ return;
+ }
+
+ void CodeObjBinary::init(std::string& target, void* binary, size_t binarySize)
+ {
+ target_ = target;
+ binary_ = binary;
+ binarySize_ = binarySize;
+
+ oclElf_ = new amd::OclElf(ELFCLASS64, (char *)binary_, binarySize_, NULL, ELF_C_READ);
+
+ // load the runtime metadata
+ runtimeMD_ = new roc::RuntimeMD::Program::Metadata();
+ }
+
+ void CodeObjBinary::fini()
+ {
+ if (oclElf_) {
+ delete oclElf_;
+ }
+
+ if (runtimeMD_) {
+ delete runtimeMD_;
+ }
+
+ target_ = "";
+ binary_ = NULL;
+ binarySize_ = 0;
+ }
+
+ const RuntimeMD::Program::Metadata* CodeObjBinary::GetProgramMetadata() const
+ {
+ char* metaData;
+ size_t metaSize;
+ if (!oclElf_->getSection(amd::OclElf::RUNTIME_METADATA, &metaData, &metaSize)) {
+ LogWarning( "Error while access runtime metadata section from the binary \n" );
+ }
+
+ if (!runtimeMD_->ReadFrom((void *) metaData, metaSize)) {
+ LogWarning( "Error while parsing runtime metadata \n" );
+ }
+
+ return runtimeMD_;
+ }
+#endif // defined(WITH_LIGHTNING_COMPILER)
+#endif // WITHOUT_HSA_BACKEND
+} // namespace roc
diff --git a/rocclr/runtime/device/rocm/rocprogram.hpp b/rocclr/runtime/device/rocm/rocprogram.hpp
index d3b4f97069..1b017363dd 100644
--- a/rocclr/runtime/device/rocm/rocprogram.hpp
+++ b/rocclr/runtime/device/rocm/rocprogram.hpp
@@ -17,10 +17,65 @@
#include "rocdevice.hpp"
#include "HSAILItems.h"
+#if defined(WITH_LIGHTNING_COMPILER)
+#include "rocmetadata.hpp"
+#include "driver/AmdCompiler.h"
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
using namespace HSAIL_ASM;
//! \namespace roc HSA Device Implementation
namespace roc {
+#if defined(WITH_LIGHTNING_COMPILER)
+ class CodeObjBinary {
+ public:
+ CodeObjBinary()
+ : target_(""), kernelArgAlign_(0), capFlags_(0), encryptCode_(0),
+ binary_(NULL), binarySize_(0), llvmIR_(""), oclElf_(NULL), runtimeMD_(NULL) {}
+
+ void init(std::string& target, void* binary, size_t binarySize);
+ void fini();
+
+ std::string Target() const { return target_; }
+ uint32_t KernelArgAlign() const { return kernelArgAlign_; }
+ void* Binary() const { return binary_; }
+ size_t BinarySize() const { return binarySize_; }
+
+ void saveIR(std::string llvmIR) { llvmIR_ = llvmIR; }
+ std::string getLlvmIR() const { return llvmIR_; }
+
+ amd::OclElf* oclElf() const { return oclElf_; }
+ RuntimeMD::Program::Metadata* runtimeMD() const { return runtimeMD_; }
+
+ const RuntimeMD::Program::Metadata* GetProgramMetadata() const;
+
+ private:
+ enum CapFlag {
+ capSaveSource = 0,
+ capSaveLLVMIR = 1,
+ capSaveCG = 2,
+ capSaveEXE = 3,
+ capSaveHSAIL = 4,
+ capSaveISASM = 5,
+ capEncryted = 6
+ };
+
+ std::string target_; // target device
+ uint32_t kernelArgAlign_;
+ uint32_t capFlags_;
+ uint32_t encryptCode_;
+
+ void * binary_; //!< code object binary (ISA)
+ size_t binarySize_; //!< size of the code object binary
+
+ std::string llvmIR_; //!< LLVM IR binary code
+
+ amd::OclElf* oclElf_; //!< ELF object to access runtime metadata
+
+ roc::RuntimeMD::Program::Metadata* runtimeMD_; //!< runtime metadata
+ };
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
//! \class empty program
class HSAILProgram : public device::Program
{
@@ -38,6 +93,12 @@ namespace roc {
const aclBinary* binaryElf() const {
return static_cast(binaryElf_); }
+#if defined(WITH_LIGHTNING_COMPILER)
+ //! Returns the code object binary associated with the progrm
+ const CodeObjBinary* codeObjBinary() const { //! Binary for the code object
+ return static_cast(codeObjBinary_); }
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
const std::string& HsailText() {
return hsailProgram_;
}
@@ -65,9 +126,17 @@ namespace roc {
virtual bool compileImpl(
const std::string& sourceCode, //!< the program's source code
const std::vector& headers,
- const char** headerIncludeNames,
+ const char** headerIncludeNames,
amd::option::Options* options //!< compile options's object
);
+#if defined(WITH_LIGHTNING_COMPILER)
+ virtual bool compileImpl_LC(
+ const std::string& sourceCode, //!< the program's source code
+ const std::vector& headers,
+ const char** headerIncludeNames,
+ amd::option::Options* options //!< compile options's object
+ );
+#endif // defined(WITH_LIGHTNING_COMPILER)
/*! \brief Compiles LLVM binary to HSAIL code (compiler backend: link+opt+codegen)
*
@@ -79,9 +148,12 @@ namespace roc {
virtual bool linkImpl(amd::option::Options* options);
+#if defined(WITH_LIGHTNING_COMPILER)
+ virtual bool linkImpl_LC(amd::option::Options* options);
+#endif // defined(WITH_LIGHTNING_COMPILER)
//! Link the device programs.
- virtual bool linkImpl (const std::vector& inputPrograms,
+ virtual bool linkImpl (const std::vector& inputPrograms,
amd::option::Options* options,
bool createLibrary);
@@ -137,6 +209,11 @@ namespace roc {
//compiler library
std::string hsailOptions();
+#if defined(WITH_LIGHTNING_COMPILER)
+ //! append all the HSAIL options to the compiler library
+ void appendHsailOptions(std::vector& options);
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
std::string openCLSource_; //!< Original OpenCL source
std::string hsailProgram_; //!< HSAIL program after compilation.
std::string llvmBinary_; //!< LLVM IR binary code
@@ -150,6 +227,16 @@ namespace roc {
hsa_ext_program_t hsaProgramHandle_; //!< Handle to HSA runtime program
hsa_code_object_t hsaProgramCodeObject_; //!< Handle to HSA code object
hsa_executable_t hsaExecutable_; //!< Handle to HSA executable
+
+#if defined(WITH_LIGHTNING_COMPILER)
+ hsa_code_object_t lcProgramCodeObject_; //!< Handle to LC code object
+ hsa_executable_t lcExecutable_; //!< Handle to LC executable
+
+ CodeObjBinary* codeObjBinary_; //! Binary for the code object
+
+ void* lcBinaryElf_; //!< memory store the ELF code object
+ size_t lcBinaryElfSize_; //!< size of the ELF code object
+#endif // defined(WITH_LIGHTNING_COMPILER)
};
/*@}*/} // namespace roc
diff --git a/rocclr/runtime/device/rocm/rocvirtual.cpp b/rocclr/runtime/device/rocm/rocvirtual.cpp
index 9a23d44f3c..5fe96f1ba6 100644
--- a/rocclr/runtime/device/rocm/rocvirtual.cpp
+++ b/rocclr/runtime/device/rocm/rocvirtual.cpp
@@ -522,14 +522,17 @@ VirtualGPU::create(bool profilingEna)
return false;
}
-#if !defined(WITH_LIGHTNING_COMPILER) // FIXME_Wilkin
device::BlitManager::Setup blitSetup;
+ #if defined(WITH_LIGHTNING_COMPILER)
+ // TODO: Wilkin - remove the setting of value_ after image kernels are available
+ blitSetup.value_ = 0x3724; // disable the image related BLIT kernels for now
+#endif // defined(WITH_LIGHTNING_COMPILER)
+
blitMgr_ = new KernelBlitManager(*this, blitSetup);
if ((NULL == blitMgr_) || !blitMgr_->create(roc_device_)) {
LogError("Could not create BlitManager!");
return false;
}
-#endif // !defined(WITH_LIGHTNING_COMPILER)
// Create signal for the barrier packet.
hsa_signal_t signal = { 0 };
diff --git a/rocclr/runtime/platform/program.cpp b/rocclr/runtime/platform/program.cpp
index 33e958d2e1..1336fa1ab5 100644
--- a/rocclr/runtime/platform/program.cpp
+++ b/rocclr/runtime/platform/program.cpp
@@ -49,7 +49,10 @@ Program::addDeviceProgram(Device& device, const void* image, size_t length,
amd::option::Options* options)
{
#if defined(WITH_LIGHTNING_COMPILER)
- if (image != NULL) { assert(!"FIMXE_Wilkins: check the code below"); }
+ // LC binary must be in ELF format
+ if (image != NULL && !amd::isElfMagic((const char *) image)) {
+ return CL_INVALID_BINARY;
+ }
#else // !defined(WITH_LIGHTNING_COMPILER)
if (image != NULL &&
!aclValidateBinaryImage(image, length,
@@ -76,7 +79,10 @@ Program::addDeviceProgram(Device& device, const void* image, size_t length,
emptyOptions = true;
}
#if defined(WITH_LIGHTNING_COMPILER)
- if (image != NULL && length != 0) { assert(!"FIMXE_Wilkins: check the code below"); }
+ if (image != NULL && length != 0 && amd::isElfMagic((const char *) image)) {
+ assert(!"FIMXE_Wilkins: check the code below");
+ return CL_INVALID_BINARY;
+ }
#else // !defined(WITH_LIGHTNING_COMPILER)
if (image != NULL && length != 0 && aclValidateBinaryImage(image, length, BINARY_TYPE_ELF)) {
acl_error errorCode;
@@ -99,9 +105,7 @@ Program::addDeviceProgram(Device& device, const void* image, size_t length,
return CL_INVALID_COMPILER_OPTIONS;
}
}
-#if !defined(WITH_LIGHTNING_COMPILER)
options->oVariables->Legacy = isAMDILTarget(*aclutGetTargetInfo(binary));
-#endif // !defined(WITH_LIGHTNING_COMPILER)
}
#endif // !defined(WITH_LIGHTNING_COMPILER)
options->oVariables->BinaryIsSpirv = isSPIRV_;