diff --git a/projects/clr/rocclr/compiler/lib/amdoclcl.def.in b/projects/clr/rocclr/compiler/lib/amdoclcl.def.in index e63d96bfe5..a4f923e8c8 100644 --- a/projects/clr/rocclr/compiler/lib/amdoclcl.def.in +++ b/projects/clr/rocclr/compiler/lib/amdoclcl.def.in @@ -50,6 +50,3 @@ aclJITObjectImageDisassembleKernel #endif aclJITObjectImageIterateSymbols aclJITObjectImageGetGlobalsSize -#if defined(WITH_TARGET_HSAIL) -_aclHsaLoader -#endif diff --git a/projects/clr/rocclr/compiler/lib/amdoclcl.map.in b/projects/clr/rocclr/compiler/lib/amdoclcl.map.in index 981529d8c9..a56ae07c08 100644 --- a/projects/clr/rocclr/compiler/lib/amdoclcl.map.in +++ b/projects/clr/rocclr/compiler/lib/amdoclcl.map.in @@ -53,9 +53,6 @@ global: #endif aclJITObjectImageIterateSymbols; aclJITObjectImageGetGlobalsSize; -#if defined(WITH_TARGET_HSAIL) - _aclHsaLoader; -#endif #if defined(OPENCL_MAINLINE) local: *; diff --git a/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp b/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp index b910ad34c6..100b4397f3 100644 --- a/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp +++ b/projects/clr/rocclr/compiler/lib/backends/common/v0_8/if_acl.cpp @@ -2092,11 +2092,17 @@ if_aclGetDeviceBinary(aclCompiler *cl, size_t *size, acl_error *error_code) { - const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symISABinary); - assert(symbol && "symbol not found"); - std::string kernelName = symbol->str[PRE] + std::string(kernel) + symbol->str[POST]; - return cl->clAPI.extSym(cl, bin, size, - symbol->sections[0], kernelName.c_str(), error_code); +#ifdef WITH_TARGET_HSAIL + if (isHSAILTarget(bin->target)) { + return cl->clAPI.extSec(cl, bin, size, aclTEXT, error_code); + } else +#endif + { + const oclBIFSymbolStruct* sym = findBIF30SymStruct(symISABinary); + assert(sym && "symbol not found"); + std::string name = sym->str[PRE] + std::string(kernel) + sym->str[POST]; + return cl->clAPI.extSym(cl, bin, size, sym->sections[0], name.c_str(), error_code); + } } acl_error ACL_API_ENTRY diff --git a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp index 963b5ab158..0342406dbd 100644 --- a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp +++ b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.cpp @@ -17,6 +17,12 @@ extern aclBinary* constructBinary(size_t struct_version, const aclTargetInfo *target, const aclBinaryOptions *opts); +static const std::string sgfx700 = "AMD:AMDGPU:7:0:0"; +static const std::string sgfx701 = "AMD:AMDGPU:7:0:1"; +static const std::string sgfx800 = "AMD:AMDGPU:8:0:0"; +static const std::string sgfx801 = "AMD:AMDGPU:8:0:1"; +static const std::string sgfx900 = "AMD:AMDGPU:9:0:0"; + // Utility function to set a flag in option structure // of the aclDevCaps. void @@ -501,6 +507,54 @@ unsigned getChipEnum(const aclTargetInfo *target) return Mapping.chip_enum; } +/*! Function that returns isa type name (compute capability) from + *the TargetMapping table for the specific target device id. + */ +const std::string &getIsaTypeName(const aclTargetInfo *target) +{ + const TargetMapping& Mapping = getTargetMapping(*target); + switch (Mapping.family_enum) { + default: return sgfx700; + case FAMILY_KV: + switch (Mapping.chip_enum) { + default: return sgfx700; + case KV_SPECTRE_A0: + case KV_SPOOKY_A0: + case KB_KALINDI_A0: + // ??? + case ML_GODAVARI_A0: return sgfx700; + } + case FAMILY_CI: + switch (Mapping.chip_enum) { + default: return sgfx700; + case CI_BONAIRE_M_A0: + case CI_BONAIRE_M_A1: return sgfx700; + case CI_HAWAII_P_A0: return sgfx701; + case CI_TIRAN_P_A0: + case CI_MAUI_P_A0: return sgfx700; + } + case FAMILY_VI: + switch (Mapping.chip_enum) { + default: return sgfx800; + case VI_ICELAND_M_A0: + case VI_TONGA_P_A0: return sgfx800; + case VI_ELLESMERE_P_A0: + case VI_BAFFIN_M_A0: + case VI_FIJI_P_A0: return sgfx801; + } + case FAMILY_CZ: + switch (Mapping.chip_enum) { + default: return sgfx801; + case CARRIZO_A0: return sgfx801; + } + case FAMILY_AI: + switch (Mapping.chip_enum) { + default: return sgfx900; + case AI_GREENLAND_P_A0: return sgfx900; + } + } +} + void appendLogToCL(aclCompiler *cl, const std::string &logStr) { diff --git a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h index d90ee94a2e..d3a068573d 100644 --- a/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h +++ b/projects/clr/rocclr/compiler/lib/utils/v0_8/libUtils.h @@ -36,16 +36,19 @@ initElfDeviceCaps(aclBinary *elf); void appendLogToCL(aclCompiler *cl, const std::string &logStr); -const char *getDeviceName(const aclTargetInfo &Target); +const char *getDeviceName(const aclTargetInfo &target); // Select the correct library from the target information. -amd::LibrarySelector getLibraryType(const aclTargetInfo *Target); +amd::LibrarySelector getLibraryType(const aclTargetInfo *target); // get family_enum from the target information. -unsigned getFamilyEnum(const aclTargetInfo *Target); +unsigned getFamilyEnum(const aclTargetInfo *target); // get chip_enum from the target information. -unsigned getChipEnum(const aclTargetInfo *Target); +unsigned getChipEnum(const aclTargetInfo *target); + +// get isa type name (compute capability) from the target information. +const std::string &getIsaTypeName(const aclTargetInfo *target); // Create a copy of an ELF and duplicate all sections/symbols aclBinary* diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index 6f229b7ddf..6bff20f27a 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -940,6 +940,8 @@ public: //! Return the build log const std::string& buildLog() const { return buildLog_; } + static std::string openclMangledName(const std::string& name) { return "&__OpenCL_" + name + "_kernel"; } + protected: std::string name_; //!< kernel name WorkGroupInfo workGroupInfo_; //!< device kernel info structure diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp index 0fb0e9b133..d9e8e18b6c 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudefs.hpp @@ -128,6 +128,13 @@ static const AMDDeviceInfo DeviceInfo[] = { /* CAL_TARGET_GREENLAND */ { ED_ATI_CAL_MACHINE_GREENLAND_ISA, "", "", 4, 16, 1, 256, 64 * Ki, 32, 900 }, }; +static const char* Gfx700 = "AMD:AMDGPU:7:0:0"; +static const char* Gfx701 = "AMD:AMDGPU:7:0:1"; +static const char* Gfx800 = "AMD:AMDGPU:8:0:0"; +static const char* Gfx801 = "AMD:AMDGPU:8:0:1"; +static const char* Gfx810 = "AMD:AMDGPU:8:1:0"; +static const char* Gfx900 = "AMD:AMDGPU:9:0:0"; + // Supported OpenCL versions enum OclVersion { OpenCL10, diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index e2d61d0982..a5f3e8af4f 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3552,12 +3552,12 @@ HSAILKernel::~HSAILKernel() } bool -HSAILKernel::init(bool finalize) +HSAILKernel::init(amd::hsa::loader::Symbol *sym, bool finalize) { acl_error error; - const oclBIFSymbolStruct* sym = findBIF30SymStruct(symOpenclKernel); - assert(sym && "symbol not found"); - std::string openClKernelName(std::string("&") + sym->str[PRE] + name() + sym->str[POST]); + const oclBIFSymbolStruct* bifSym = findBIF30SymStruct(symOpenclKernel); + assert(bifSym && "symbol not found"); + std::string openClKernelName(std::string("&") + bifSym->str[PRE] + name() + bifSym->str[POST]); //compile kernel down to ISA if (finalize) { std::string options(compileOptions_.c_str()); @@ -3578,19 +3578,10 @@ HSAILKernel::init(bool finalize) return false; } } - // Get the ISA out - size_t size_isa; - void* shader_isa = NULL; - shader_isa = const_cast(aclGetDeviceBinary(dev().hsaCompiler(), - prog().binaryElf(), openClKernelName.c_str(), &size_isa, &error)); - if (shader_isa == NULL) { - LogError("Failed find the ISA"); - return false; - } // Allocate HW resources for the real program only if (!prog().isNull()) { - aqlCreateHWInfo(shader_isa, size_isa); + aqlCreateHWInfo(sym); } // Pull out metadata from the ELF @@ -4131,8 +4122,8 @@ HSAILKernel::loadArguments( memList.push_back(cb); memList.push_back(gpuAqlCode()); - if (NULL != prog().globalStore()) { - memList.push_back(prog().globalStore()); + for (gpu::Memory * mem : prog().globalStores()) { + memList.push_back(mem); } if (AMD_HSA_BITS_GET(cpuAqlCode_->kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index 7147128ff9..870f0313de 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -17,6 +17,15 @@ #include "device/gpu/gpuprintf.hpp" #include "device/gpu/gpuwavelimiter.hpp" #include "hsa.h" + +namespace amd { +namespace hsa { +namespace loader { +class Symbol; +} // loader +} // hsa +} // amd + //! \namespace gpu GPU Device Implementation namespace gpu { @@ -847,7 +856,7 @@ public: //! Initializes the metadata required for this kernel, //! finalizes the kernel if needed - bool init(bool finalize = true); + bool init(amd::hsa::loader::Symbol *sym, bool finalize = false); //! Returns true if memory is valid for execution virtual bool validateMemory(uint idx, amd::Memory* amdMem) const; @@ -927,10 +936,7 @@ private: HSAILKernel& operator=(const HSAILKernel&); //! Creates AQL kernel HW info - bool aqlCreateHWInfo( - const void* kernel, //!< Kernel's packed binary info and code - size_t kernelSize //!< Size of the kernel's packed binary - ); + bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym); //! Initializes arguments_ and the abstraction layer kernel parameters void initArgList( diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp index cd38fbae9c..5780243b8b 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.cpp @@ -18,135 +18,6 @@ #include "hsa.h" #include "hsa_ext_image.h" -extern "C" bool -ACL_API_ENTRY _aclHsaLoader( - aclCompiler* compiler_handle, - aclBinary* bin, - void* userData, - void (*allocateGPUMemory)(void* userData, size_t size, uint64_t* GPUMemory), - bool (*DmaMemoryCopy)(void* userData, uint64_t offset, const void* pSrc, size_t size), - void (*getSamplerObjectParam)(uint32_t* size, uint32_t* alignment), - void (*initializeSamplerObject)(void* userData, uint64_t offset, bool unnormalize, - uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW)); - -bool -DmaMemoryCopy(void* userData, uint64_t offset, const void* pSrc, size_t size) -{ - gpu::HSAILProgram* prog = reinterpret_cast(userData); - gpu::Memory* mem = const_cast(prog->globalStore()); - if (mem == NULL) { - return false; - } - size_t maxCopySize = prog->globalVariableTotalSize(); - if (maxCopySize >= size) { - maxCopySize = size; - } - amd::Coord3D origin(offset); - amd::Coord3D region(maxCopySize); - // memcpy mode - if (pSrc) { - const bool Entire = true; - return prog->dev().xferMgr().writeBuffer(pSrc, *mem, origin, region, Entire); - } - // memset mode - else { - char pattern = 0; - return prog->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), - origin, region); - } -} - -void -AllocateGPUMemory(void* userData, size_t size, uint64_t* GPUMemory) -{ - gpu::Memory* mem = NULL; - void* cpuPtr = NULL; - gpu::HSAILProgram* prog = reinterpret_cast(userData); - - mem = new gpu::Memory(prog->dev(), amd::alignUp(size, gpu::ConstBuffer::VectorSize)); - - // Initialize constant buffer - if ((mem == NULL) || !mem->create(gpu::Resource::Local)) { - delete mem; - *GPUMemory = 0; - return; - } - *GPUMemory = mem->vmAddress(); - prog->setGlobalStore(mem); - prog->setGlobalVariableTotalSize(size); -} - -void -GetSamplerObjectParams(uint32_t* size, uint32_t* alignment) -{ - if (GPU_DIRECT_SRD) { - *size = gpu::HsaSamplerObjectSize; - *alignment = gpu::HsaSamplerObjectAlignment; - } - else { - *size = sizeof(uint64_t); - *alignment = sizeof(uint64_t); - } -} - -void -InitializeSamplerObject(void* userData, uint64_t offset, bool unnormalize, - uint8_t fltr, uint8_t addrU, uint8_t addrV, uint8_t addrW) -{ - assert((addrU == addrV && addrV == addrW) && "GSL supports single address mode"); - hsa_ext_sampler_filter_mode_t filter = - static_cast(fltr); - hsa_ext_sampler_addressing_mode_t boundaryU = - static_cast(addrU); - - uint32_t state = (unnormalize) ? - amd::Sampler::StateNormalizedCoordsFalse : amd::Sampler::StateNormalizedCoordsTrue; - if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) { - state |= amd::Sampler::StateFilterNearest; - } - else if (filter == HSA_EXT_SAMPLER_FILTER_MODE_LINEAR) { - state |= amd::Sampler::StateFilterLinear; - } - switch (boundaryU) { - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: - state |= amd::Sampler::StateAddressClampToEdge; - break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: - state |= amd::Sampler::StateAddressClamp; - break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: - state |= amd::Sampler::StateAddressRepeat; - break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: - state |= amd::Sampler::StateAddressMirroredRepeat; - break; - case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: - default: - break; - } - - gpu::HSAILProgram* prog = reinterpret_cast(userData); - if (prog->dev().settings().hsailDirectSRD_) { - char *pCPUbuf = new char[gpu::HsaSamplerObjectSize]; - if (!pCPUbuf) { - assert(false); - return; - } - prog->dev().fillHwSampler(state, pCPUbuf, gpu::HsaSamplerObjectSize); - DmaMemoryCopy(userData, offset, pCPUbuf, gpu::HsaSamplerObjectSize); - delete pCPUbuf; - } - else { - gpu::Sampler* sampler = new gpu::Sampler(prog->dev()); - if ((sampler != NULL) && sampler->create(state)) { - uint64_t hwSrd = sampler->hwSrd(); - DmaMemoryCopy(userData, offset, &hwSrd, sizeof(uint64_t)); - prog->addSampler(sampler); - } - } - return; -} - namespace gpu { bool @@ -1768,10 +1639,11 @@ HSAILProgram::HSAILProgram(Device& device) , llvmBinary_() , binaryElf_(NULL) , rawBinary_(NULL) - , globalStore_(NULL) , kernels_(NULL) , maxScratchRegs_(0) , isNull_(false) + , executable_(NULL) + , loaderContext_(this) { memset(&binOpts_, 0, sizeof(binOpts_)); binOpts_.struct_size = sizeof(binOpts_); @@ -1786,10 +1658,11 @@ HSAILProgram::HSAILProgram(NullDevice& device) , llvmBinary_() , binaryElf_(NULL) , rawBinary_(NULL) - , globalStore_(NULL) , kernels_(NULL) , maxScratchRegs_(0) , isNull_(true) + , executable_(NULL) + , loaderContext_(this) { memset(&binOpts_, 0, sizeof(binOpts_)); binOpts_.struct_size = sizeof(binOpts_); @@ -1817,7 +1690,9 @@ HSAILProgram::~HSAILProgram() } } releaseClBinary(); - delete globalStore_; + if (executable_ != NULL) { + Executable::Destroy(executable_); + } delete kernels_; } @@ -2163,21 +2038,46 @@ HSAILProgram::linkImpl(amd::option::Options* options) break; } case ACL_TYPE_CG: - hsaLoad = false; break; case ACL_TYPE_ISA: - hsaLoad = false; finalize = false; break; default: buildLog_ += "Error while BRIG Codegen phase: the binary is incomplete \n" ; return false; } + if (finalize) { + std::string fin_options(options->origOptionStr + hsailOptions()); + // Append an option so that we can selectively enable a SCOption on CZ + // whenever IOMMUv2 is enabled. + if (dev().settings().svmFineGrainSystem_) { + fin_options.append(" -sc-xnack-iommu"); + } + errorCode = aclCompile(dev().hsaCompiler(), binaryElf_, + fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, NULL); + buildLog_ += aclGetCompilerLog(dev().hsaCompiler()); + if (errorCode != ACL_SUCCESS) { + LogError("Failed to finalize"); + return false; + } + } // ACL_TYPE_CG stage is not performed for offline compilation + hsa_agent_t agent; + agent.handle = 1; if (!isNull() && hsaLoad) { - if (!_aclHsaLoader(dev().hsaCompiler(), binaryElf_, this, &AllocateGPUMemory, - &DmaMemoryCopy, &GetSamplerObjectParams, &InitializeSamplerObject)) { - buildLog_ += "Error while BRIG Codegen phase: loading BRIG globals in the ELF \n"; + executable_ = Executable::Create(HSA_PROFILE_BASE, &loaderContext_, NULL); + if (executable_ == NULL) { + return false; + } + size_t size = 0; + hsa_code_object_t code_object; + code_object.handle = reinterpret_cast(aclExtractSection(dev().hsaCompiler(), binaryElf_, &size, aclTEXT, &errorCode)); + if (errorCode != ACL_SUCCESS) { + return false; + } + hsa_status_t status = executable_->LoadCodeObject(agent, code_object, NULL); + if (status != HSA_STATUS_SUCCESS) { + buildLog_ += "Error while HSA Loader phase: loading HSA Code Object \n"; return false; } } @@ -2187,7 +2087,7 @@ HSAILProgram::linkImpl(amd::option::Options* options) buildLog_ += "Error while Finalization phase: kernel names query from the ELF failed\n"; return false; } - if (kernelNamesSize > 0) { + if (!isNull() && kernelNamesSize > 0) { char* kernelNames = new char[kernelNamesSize]; errorCode = aclQueryInfo(dev().hsaCompiler(), binaryElf_, RT_KERNEL_NAMES, NULL, kernelNames, &kernelNamesSize); if (errorCode != ACL_SUCCESS) { @@ -2202,12 +2102,18 @@ HSAILProgram::linkImpl(amd::option::Options* options) for (it; it != vKernels.end(); ++it) { std::string kernelName = *it; HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions()); - if (!aKernel->init(finalize)) { + kernels()[kernelName] = aKernel; + amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", Kernel::openclMangledName(kernelName).c_str(), agent, 0); + if (!sym) { + LogError("Failed to get kernel ISA code"); + return false; + } + if (!aKernel->init(sym, false)) { + LogError("Failed to init HSAILKernel"); return false; } buildLog_ += aKernel->buildLog(); aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize); - kernels()[kernelName] = aKernel; dynamicParallelism |= aKernel->dynamicParallelism(); // Find max scratch regs used in the program. It's used for scratch buffer preallocation // with dynamic parallelism, since runtime doesn't know which child kernel will be called @@ -2333,5 +2239,208 @@ HSAILProgram::info(const char * str) { return info_; } +hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) { + hsa_isa_t isa = {0}; + if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; } + if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; } + if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; } + if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; } + if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; } + if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; } + return isa; +} + +bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) { + switch (program_->dev().hwInfo()->gfxipVersion_) { + default: + LogError("Unsupported gfxip version"); + return false; + case gfx700: + case gfx701: + case gfx702: + // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device. + return isa.handle == gfx700 || isa.handle == gfx701; + case gfx800: + if (ED_ATI_CAL_MACHINE_ICELAND_ISA == program_->dev().hwInfo()->machine_ || + ED_ATI_CAL_MACHINE_TONGA_ISA == program_->dev().hwInfo()->machine_ ) { + return isa.handle == gfx800; + } else { + // gfx800 has only sgrps limited and can be loaded on later chips. + return isa.handle == gfx800 || isa.handle == gfx801; + } + case gfx900: + return isa.handle == gfx900; + } +} + +void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, size_t size, size_t align, bool zero) { + assert(size); + assert(align); + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + return AgentGlobalAlloc(agent, size, align, zero); + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + return KernelCodeAlloc(agent, size, align, zero); + default: + assert(false); return 0; + } +} + +bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) { + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: + return AgentGlobalCopy(dst, offset, src, size); + case AMDGPU_HSA_SEGMENT_CODE_AGENT: + return KernelCodeCopy(dst, offset, src, size); + default: + assert(false); return false; + } +} + +void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t size) { + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break; + case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break; + default: + assert(false); return; + } +} + +void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t offset) { + assert(seg); + switch (segment) { + case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM: + case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT: + case AMDGPU_HSA_SEGMENT_READONLY_AGENT: { + gpu::Memory *gpuMem = reinterpret_cast(seg); + return reinterpret_cast(gpuMem->vmAddress()); + } + case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset; + default: + assert(false); return NULL; + } +} + +hsa_status_t ORCAHSALoaderContext::SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_descriptor || !sampler_handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + uint32_t state = 0; + switch (sampler_descriptor->coordinate_mode) { + case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break; + case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + switch (sampler_descriptor->filter_mode) { + case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break; + case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break; + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + + } + switch (sampler_descriptor->address_mode) { + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break; + case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: + default: + assert(false); + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + assert(!program_->dev().settings().hsailDirectSRD_); + gpu::Sampler* sampler = new gpu::Sampler(program_->dev()); + if (!sampler || !sampler->create(state)) { + delete sampler; + return HSA_STATUS_ERROR; + } + program_->addSampler(sampler); + sampler_handle->handle = sampler->hwSrd(); + return HSA_STATUS_SUCCESS; +} + +hsa_status_t ORCAHSALoaderContext::SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) { + if (!agent.handle) { + return HSA_STATUS_ERROR_INVALID_AGENT; + } + if (!sampler_handle.handle) { + return HSA_STATUS_ERROR_INVALID_ARGUMENT; + } + return HSA_STATUS_SUCCESS; +} + +void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) { + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + void* ptr = amd::Os::alignedMalloc(size, align); + if (zero) { + memset(ptr, 0, size); + } + return ptr; +} + +bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) { + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + amd::Os::fastMemcpy((char*)dst + offset, src, size); + return true; +} + +void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) { + assert(size); + assert(align); + assert(sizeof(void*) == 8 || sizeof(void*) == 4); + gpu::Memory* mem = new gpu::Memory(program_->dev(), amd::alignUp(size, align)); + if (!mem || !mem->create(gpu::Resource::Local)) { + delete mem; + return NULL; + } + assert(program_->dev().xferQueue()); + if (zero) { + char pattern = 0; + program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size)); + } + program_->addGlobalStore(mem); + program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size); + return mem; +} + +bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) { + if (!dst || !src || dst == src) { + return false; + } + if (0 == size) { + return true; + } + assert(program_->dev().xferQueue()); + gpu::Memory* mem = reinterpret_cast(dst); + return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true); + return true; +} } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp index 8945c5383f..916c293b06 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuprogram.hpp @@ -7,11 +7,18 @@ #include "device/gpu/gpukernel.hpp" #include "device/gpu/gpubinary.hpp" +#include "amd_hsa_loader.hpp" namespace amd { namespace option { class Options; } // option +namespace hsa { +namespace loader { +class Executable; +class Context; +} // loader +} // hsa } // amd //! \namespace gpu GPU Device Implementation @@ -369,6 +376,121 @@ private: gpu::Memory* glbData_; //!< Global data store }; +using namespace amd::hsa::loader; +class HSAILProgram; + +class ORCAHSALoaderContext final: public Context { +public: + ORCAHSALoaderContext(HSAILProgram* program): program_(program) {} + + virtual ~ORCAHSALoaderContext() {} + + hsa_isa_t IsaFromName(const char *name) override; + + bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override; + + void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, size_t size, size_t align, bool zero) override; + + bool SegmentCopy(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* dst, size_t offset, + const void* src, size_t size) override; + + void SegmentFree(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t size = 0) override; + + void* SegmentAddress(amdgpu_hsa_elf_segment_t segment, + hsa_agent_t agent, void* seg, size_t offset) override; + + bool ImageExtensionSupported() override { return false; } + + hsa_status_t ImageCreate( + hsa_agent_t agent, + hsa_access_permission_t image_permission, + const hsa_ext_image_descriptor_t *image_descriptor, + const void *image_data, + hsa_ext_image_t *image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } + + hsa_status_t ImageDestroy( + hsa_agent_t agent, hsa_ext_image_t image_handle) override { + // not supported + assert(false); + return HSA_STATUS_ERROR; + } + + hsa_status_t SamplerCreate( + hsa_agent_t agent, + const hsa_ext_sampler_descriptor_t *sampler_descriptor, + hsa_ext_sampler_t *sampler_handle) override; + + //! All samplers are owned by HSAILProgram and are deleted in its destructor. + hsa_status_t SamplerDestroy( + hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override; + +private: + + void* AgentGlobalAlloc( + hsa_agent_t agent, size_t size, size_t align, bool zero) { + return GpuMemAlloc(size, align, zero); + } + + bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) { + return GpuMemCopy(dst, offset, src, size); + } + + void AgentGlobalFree(void *ptr, size_t size) { + GpuMemFree(ptr, size); + } + + void* KernelCodeAlloc( + hsa_agent_t agent, size_t size, size_t align, bool zero) { + return CpuMemAlloc(size, align, zero); + } + + bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) { + return CpuMemCopy(dst, offset, src, size); + } + + void KernelCodeFree(void *ptr, size_t size) { + CpuMemFree(ptr, size); + } + + void* CpuMemAlloc(size_t size, size_t align, bool zero); + + bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size); + + void CpuMemFree(void *ptr, size_t size) { + amd::Os::alignedFree(ptr); + } + + void* GpuMemAlloc(size_t size, size_t align, bool zero); + + bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size); + + void GpuMemFree(void *ptr, size_t size = 0) { + delete reinterpret_cast(ptr); + } + + ORCAHSALoaderContext(const ORCAHSALoaderContext &c); + + ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c); + + enum gfx_handle { + gfx700 = 700, + gfx701 = 701, + gfx702 = 702, + gfx800 = 800, + gfx801 = 801, + gfx810 = 810, + gfx900 = 900 + }; + + gpu::HSAILProgram* program_; +}; //! \class HSAIL program class HSAILProgram : public device::Program @@ -385,9 +507,9 @@ public: aclBinary* binaryElf() const { return static_cast(binaryElf_); } - void setGlobalStore(Memory* mem) { globalStore_ = mem; } + void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); } - const Memory* globalStore() const { return globalStore_; } + const std::vector& globalStores() const { return globalStores_; } //! Return a typecasted GPU device gpu::Device& dev() @@ -497,11 +619,13 @@ private: aclBinary* binaryElf_; //!< Binary for the new compiler library void* rawBinary_; //!< Pointer to the raw binary aclBinaryOptions binOpts_; //!< Binary options to create aclBinary - Memory* globalStore_; //!< Global memory for the program + std::vector globalStores_; //!< Global memory for the program Memory* kernels_; //!< Table with kernel object pointers uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel std::list staticSamplers_; //!< List od internal static samplers bool isNull_; //!< Null program no memory allocations + amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader + ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader }; /*@}*/} // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp index 5f5e548b84..a8e5524fa1 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp @@ -14,6 +14,7 @@ #include #include #include +#include "amd_hsa_loader.hpp" namespace gpu { @@ -137,54 +138,36 @@ NullKernel::siCreateHwInfo(const void* shader, AMUabiAddEncoding& encoding) } bool -HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize) +HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol *sym) { - // Copy the shader_isa into a buffer - hwMetaData_ = new char[shaderSize]; - if (hwMetaData_ == NULL) { + if (!sym) { return false; } - memcpy(hwMetaData_, shader, shaderSize); - - SC_SI_HWSHADER_CS* siMetaData = reinterpret_cast(hwMetaData_); - - // Code to patch the pointers in the shader object. - // Must be preferably done in the compiler library - size_t offset = siMetaData->common.uSizeInBytes; - if (siMetaData->common.u32PvtDataSizeInBytes > 0) { - siMetaData->common.pPvtData = - reinterpret_cast( - reinterpret_cast(siMetaData) + offset); - offset += siMetaData->common.u32PvtDataSizeInBytes; + uint64_t akc_addr = 0; + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&akc_addr))) { + return false; } - if (siMetaData->common.codeLenInByte > 0) { - siMetaData->common.hShaderMemHandle = - reinterpret_cast(siMetaData) + offset; - offset += siMetaData->common.codeLenInByte; + amd_kernel_code_t *akc = reinterpret_cast(akc_addr); + cpuAqlCode_ = akc; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { + return false; } - - char* headerBaseAddress = - reinterpret_cast(siMetaData->common.hShaderMemHandle); - amd_kernel_code_t* akc = reinterpret_cast( - headerBaseAddress); - - address codeStartAddress = reinterpret_cast
(akc); - address codeEndAddress = reinterpret_cast
(akc) + siMetaData->common.codeLenInByte; - codeSize_ = codeEndAddress - codeStartAddress; - code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize)); - + size_t akc_align = 0; + if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_ALIGN, reinterpret_cast(&akc_align))) { + return false; + } + code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, akc_align)); // Initialize kernel ISA code - if ((code_ != NULL) && code_->create(Resource::Shader)) { + if (code_ && code_->create(Resource::Shader)) { address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, codeStartAddress, codeSize_); + memcpy(cpuCodePtr, reinterpret_cast
(akc), codeSize_); code_->unmap(NULL); } else { LogError("Failed to allocate ISA code!"); return false; } - cpuAqlCode_ = akc; assert((akc->workitem_private_segment_byte_size & 3) == 0 && "Scratch must be DWORD aligned"); diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index 13f8d8495a..3ee22c2da4 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -3293,10 +3293,10 @@ VirtualGPU::processMemObjectsHSA( } } - if (hsaKernel.prog().globalStore() != NULL) { + for (gpu::Memory* mem : hsaKernel.prog().globalStores()) { const static bool IsReadOnly = false; // Validate global store for a dependency in the queue - memoryDependency().validate(*this, hsaKernel.prog().globalStore(), IsReadOnly); + memoryDependency().validate(*this, mem, IsReadOnly); } }