diff --git a/rocclr/runtime/device/gpu/gpukernel.cpp b/rocclr/runtime/device/gpu/gpukernel.cpp index 58d859fd84..74dc4c39e7 100644 --- a/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3422,10 +3422,7 @@ HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) } } - // Allocate HW resources for the real program only - if (!prog().isNull()) { - aqlCreateHWInfo(sym); - } + aqlCreateHWInfo(sym); // Pull out metadata from the ELF size_t sizeOfArgList; diff --git a/rocclr/runtime/device/gpu/gpuprogram.cpp b/rocclr/runtime/device/gpu/gpuprogram.cpp index b545375c9b..c9a18defb1 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -2151,7 +2151,7 @@ HSAILProgram::linkImpl(amd::option::Options* options) // ACL_TYPE_CG stage is not performed for offline compilation hsa_agent_t agent; agent.handle = 1; - if (!isNull() && hsaLoad) { + if (hsaLoad) { executable_ = loader_->CreateExecutable(HSA_PROFILE_FULL, NULL); if (executable_ == NULL) { buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n"; @@ -2176,7 +2176,7 @@ HSAILProgram::linkImpl(amd::option::Options* options) buildLog_ += "Error: Querying of kernel names size from the binary failed.\n"; return false; } - if (!isNull() && kernelNamesSize > 0) { + if (kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { @@ -2447,8 +2447,10 @@ void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: case AMDGPU_HSA_SEGMENT_READONLY_AGENT: { - gpu::Memory *gpuMem = reinterpret_cast(seg); - return reinterpret_cast(gpuMem->vmAddress() + offset); + if (!program_->isNull()) { + gpu::Memory *gpuMem = reinterpret_cast(seg); + return reinterpret_cast(gpuMem->vmAddress() + offset); + } } case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset; default: @@ -2487,7 +2489,7 @@ hsa_status_t ORCAHSALoaderContext::SamplerCreate( case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break; default: assert(false); return HSA_STATUS_ERROR_INVALID_ARGUMENT; @@ -2540,6 +2542,10 @@ void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { assert(size); assert(align); assert(sizeof(void*) == 8 || sizeof(void*) == 4); + if (program_->isNull()) { + return new char [size]; + } + gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align)); if (!mem || !mem->create(gpu::Resource::Local)) { delete mem; @@ -2562,10 +2568,24 @@ bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, if (0 == size) { return true; } + if (program_->isNull()) { + memcpy(reinterpret_cast
(dst) + offset, src, size); + return true; + } assert(program_->dev().xferQueue()); gpu::Memory* mem = reinterpret_cast(dst); return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); return true; } +void ORCAHSALoaderContext::GpuMemFree(void *ptr, size_t size) +{ + if (program_->isNull()) { + delete [] reinterpret_cast(ptr); + } + else { + delete reinterpret_cast(ptr); + } +} + } // namespace gpu diff --git a/rocclr/runtime/device/gpu/gpuprogram.hpp b/rocclr/runtime/device/gpu/gpuprogram.hpp index 70fcd07ad0..532727e2ca 100644 --- a/rocclr/runtime/device/gpu/gpuprogram.hpp +++ b/rocclr/runtime/device/gpu/gpuprogram.hpp @@ -480,9 +480,7 @@ private: bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size); - void GpuMemFree(void *ptr, size_t size = 0) { - delete reinterpret_cast(ptr); - } + void GpuMemFree(void *ptr, size_t size = 0); ORCAHSALoaderContext(const ORCAHSALoaderContext &c); diff --git a/rocclr/runtime/device/gpu/gpuscsi.cpp b/rocclr/runtime/device/gpu/gpuscsi.cpp index 9f38b9fdc7..1c7859f5cd 100644 --- a/rocclr/runtime/device/gpu/gpuscsi.cpp +++ b/rocclr/runtime/device/gpu/gpuscsi.cpp @@ -153,26 +153,27 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { return false; } - code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); - // Initialize kernel ISA code - if (code_ && code_->create(Resource::Shader)) { - address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); - // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); - code_->unmap(NULL); - } - else { - LogError("Failed to allocate ISA code!"); - return false; + + // Allocate HW resources for the real program only + if (!prog().isNull()) { + code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); + // Initialize kernel ISA code + if (code_ && code_->create(Resource::Shader)) { + address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); + // Copy only amd_kernel_code_t + memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); + code_->unmap(NULL); + } + else { + LogError("Failed to allocate ISA code!"); + return false; + } } assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); workGroupInfo_.scratchRegs_ = amd::alignUp(akc->workitem_private_segment_byte_size, 16) / sizeof(uint); - workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); - workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); - workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; workGroupInfo_.privateMemSize_ = akc->workitem_private_segment_byte_size; workGroupInfo_.availableLDSSize_ = dev().info().localMemSize_; workGroupInfo_.localMemSize_ = @@ -180,8 +181,19 @@ HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) workGroupInfo_.usedSGPRs_ = akc->wavefront_sgpr_count; workGroupInfo_.usedStackSize_ = 0; workGroupInfo_.usedVGPRs_ = akc->workitem_vgpr_count; - workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; + if (!prog().isNull()) { + workGroupInfo_.availableSGPRs_ = dev().gslCtx()->getNumSGPRsAvailable(); + workGroupInfo_.availableVGPRs_ = dev().gslCtx()->getNumVGPRsAvailable(); + workGroupInfo_.preferredSizeMultiple_ = dev().getAttribs().wavefrontSize; + workGroupInfo_.wavefrontPerSIMD_ = dev().getAttribs().wavefrontSize; + } + else { + workGroupInfo_.availableSGPRs_ = 104; + workGroupInfo_.availableVGPRs_ = 256; + workGroupInfo_.preferredSizeMultiple_ = + workGroupInfo_.wavefrontPerSIMD_ = 64; + } return true; } } // namespace gpu