From 5e7eb30dd70cd302b4e826c43c815f09083b7925 Mon Sep 17 00:00:00 2001
From: foreman <dl.constructicon@amd.com>
Date: Thu, 24 Mar 2016 12:15:44 -0400
Subject: [PATCH] P4 to Git Change 1250949 by gandryey@gera-w8 on 2016/03/24
 12:06:49

	SWDEV-90618 - cl_kernel_info_amd always returns 0 when working via HSAIL path
	- Allow null kernel creation for offline compilation

Affected files ...

... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#312 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.cpp#224 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuprogram.hpp#66 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#36 edit
---
 rocclr/runtime/device/gpu/gpukernel.cpp  |  5 +--
 rocclr/runtime/device/gpu/gpuprogram.cpp | 30 ++++++++++++++---
 rocclr/runtime/device/gpu/gpuprogram.hpp |  4 +--
 rocclr/runtime/device/gpu/gpuscsi.cpp    | 42 +++++++++++++++---------
 4 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp
index 58d859fd84..74dc4c39e7 100644
--- a/rocclr/runtime/device/gpu/gpukernel.cpp
+++ b/rocclr/runtime/device/gpu/gpukernel.cpp
@@ -3422,10 +3422,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize)
         }
     }
 
-    // Allocate HW resources for the real program only
-    if (!prog().isNull()) {
-        aqlCreateHWInfo(sym);
-    }
+    aqlCreateHWInfo(sym);
 
     // Pull out metadata from the ELF
     size_t sizeOfArgList;
diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp
index b545375c9b..c9a18defb1 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.cpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.cpp
@@ -2151,7 +2151,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
     // ACL_TYPE_CG stage is not performed for offline compilation
     hsa_agent_t agent;
     agent.handle = 1;
-    if (!isNull() && hsaLoad) {
+    if (hsaLoad) {
         executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL);
         if (executable_ == NULL) {
             buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
@@ -2176,7 +2176,7 @@ HSAILProgram::linkImpl(amd::option::Options* options)
         buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
         return false;
     }
-    if (!isNull() && kernelNamesSize > 0) {
+    if (kernelNamesSize > 0) {
         char* kernelNames = new char[kernelNamesSize];
         errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize);
         if (errorCode != ACL_SUCCESS) {
@@ -2447,8 +2447,10 @@ void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
     case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
     case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
     case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
-        gpu::Memory *gpuMem = reinterpret_cast<gpu::Memory*>(seg);
-        return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
+        if (!program_->isNull()) {
+            gpu::Memory *gpuMem = reinterpret_cast<gpu::Memory*>(seg);
+            return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
+        }
     }
     case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
     default:
@@ -2487,7 +2489,7 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate(
         case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break;
         case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:          state |= amd::Sampler::StateAddressRepeat; break;
         case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break;
-		case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break;
+        case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break;
         default:
             assert(false);
             return HSA_STATUS_ERROR_INVALID_ARGUMENT;
@@ -2540,6 +2542,10 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
     assert(size);
     assert(align);
     assert(sizeof(void*) == 8 || sizeof(void*) == 4);
+    if (program_->isNull()) {
+        return new char [size];
+    }
+
     gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align));
     if (!mem || !mem->create(gpu::Resource::Local)) {
         delete mem;
@@ -2562,10 +2568,24 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src,
     if (0 == size) {
         return true;
     }
+    if (program_->isNull()) {
+        memcpy(reinterpret_cast<address>(dst) + offset, src, size);
+        return true;
+    }
     assert(program_->dev().xferQueue());
     gpu::Memory* mem = reinterpret_cast<gpu::Memory*>(dst);
     return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
     return true;
 }
 
+void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size)  // Frees ptr according to how the program allocated it; size is not used here.
+{
+    if (program_->isNull()) {
+        delete [] reinterpret_cast<char*>(ptr);  // Null program: GpuMemAlloc() handed out a host `new char[size]` buffer, so array-delete it.
+    }
+    else {
+        delete reinterpret_cast<gpu::Memory*>(ptr);  // Otherwise ptr is treated as a gpu::Memory object and destroyed as such.
+    }
+}
+
 } // namespace gpu
diff --git a/rocclr/runtime/device/gpu/gpuprogram.hpp b/rocclr/runtime/device/gpu/gpuprogram.hpp
index 70fcd07ad0..532727e2ca 100644
--- a/rocclr/runtime/device/gpu/gpuprogram.hpp
+++ b/rocclr/runtime/device/gpu/gpuprogram.hpp
@@ -480,9 +480,7 @@ private:
 
     bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
 
-    void GpuMemFree(void *ptr, size_t size = 0) {
-        delete reinterpret_cast<gpu::Memory*>(ptr);
-    }
+    void GpuMemFree(void *ptr, size_t size = 0);  // Defined out of line in gpuprogram.cpp; the release path depends on program_->isNull().
 
     ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
 
diff --git a/rocclr/runtime/device/gpu/gpuscsi.cpp b/rocclr/runtime/device/gpu/gpuscsi.cpp
index 9f38b9fdc7..1c7859f5cd 100644
--- a/rocclr/runtime/device/gpu/gpuscsi.cpp
+++ b/rocclr/runtime/device/gpu/gpuscsi.cpp
@@ -153,26 +153,27 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
     if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast<void*>(&akc_align))) {
         return false;
     }
-    code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));
-    // Initialize kernel ISA code
-    if (code_ && code_->create(Resource::Shader)) {
-        address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
-        // Copy only amd_kernel_code_t
-        memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_);
-        code_->unmap(NULL);
-    }
-    else {
-        LogError("Failed to allocate ISA code!");
-        return false;
+
+    // Allocate HW resources for the real program only
+    if (!prog().isNull()) {
+        code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align));
+        // Initialize kernel ISA code
+        if (code_ && code_->create(Resource::Shader)) {
+            address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
+            // Copy only amd_kernel_code_t
+            memcpy(cpuCodePtr,  reinterpret_cast<address>(akc), codeSize_);
+            code_->unmap(NULL);
+        }
+        else {
+            LogError("Failed to allocate ISA code!");
+            return false;
+        }
     }
 
     assert((akc->workitem_private_segment_byte_size & 3) == 0 &&
         "Scratch must be DWORD aligned");
     workGroupInfo_.scratchRegs_ =
         amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint);
-    workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
-    workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
-    workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
     workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size;
     workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_;
     workGroupInfo_.localMemSize_ =
@@ -180,8 +181,19 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym)
     workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count;
     workGroupInfo_.usedStackSize_ = 0;
     workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count;
-    workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
 
+    if (!prog().isNull()) {
+        workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable();
+        workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable();
+        workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize;
+        workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize;
+    }
+    else {
+        workGroupInfo_.availableSGPRs_ = 104;
+        workGroupInfo_.availableVGPRs_ = 256;
+        workGroupInfo_.preferredSizeMultiple_ =
+        workGroupInfo_.wavefrontPerSIMD_ = 64;
+    }
     return true;
 }
 } // namespace gpu