// // Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved. // #include "device/pal/palkernel.hpp" #include "device/pal/palprogram.hpp" #include "device/pal/palblit.hpp" #include "device/pal/palconstbuf.hpp" #include "device/pal/palsched.hpp" #include "platform/commandqueue.hpp" #include "utils/options.hpp" #include "acl.h" #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #include "llvm/Support/AMDGPUMetadata.h" typedef llvm::AMDGPU::HSAMD::Kernel::Metadata KernelMD; #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #include #include #include #include #include #include #include namespace pal { void HSAILKernel::setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize, const uint16_t numSGPRs, const uint16_t numVGPRs) { workGroupInfo_.scratchRegs_ = amd::alignUp(privateSegmentSize, 16) / sizeof(uint32_t); // Make sure runtime matches HW alignment, which is 256 scratch regs (DWORDs) per wave constexpr uint32_t ScratchRegAlignment = 256; workGroupInfo_.scratchRegs_ = amd::alignUp((workGroupInfo_.scratchRegs_ * dev().info().wavefrontWidth_), ScratchRegAlignment) / dev().info().wavefrontWidth_; workGroupInfo_.privateMemSize_ = workGroupInfo_.scratchRegs_ * sizeof(uint32_t); workGroupInfo_.localMemSize_ = workGroupInfo_.usedLDSSize_ = groupSegmentSize; workGroupInfo_.usedSGPRs_ = numSGPRs; workGroupInfo_.usedStackSize_ = 0; workGroupInfo_.usedVGPRs_ = numVGPRs; if (!prog().isNull()) { workGroupInfo_.availableLDSSize_ = dev().properties().gfxipProperties.shaderCore.ldsSizePerCu; workGroupInfo_.availableSGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableSgprs; workGroupInfo_.availableVGPRs_ = dev().properties().gfxipProperties.shaderCore.numAvailableVgprs; workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = dev().info().wavefrontWidth_; } else { workGroupInfo_.availableLDSSize_ = 64 * Ki; workGroupInfo_.availableSGPRs_ = 104; workGroupInfo_.availableVGPRs_ = 256; workGroupInfo_.preferredSizeMultiple_ = workGroupInfo_.wavefrontPerSIMD_ = 64; } } bool HSAILKernel::setKernelCode(amd::hsa::loader::Symbol* sym, amd_kernel_code_t* akc) { if (!sym) { return false; } if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast(&code_))) { return false; } // Copy code object of this kernel from the program CPU segment memcpy(akc, reinterpret_cast(prog().findHostKernelAddress(code_)), sizeof(amd_kernel_code_t)); return true; } bool HSAILKernel::aqlCreateHWInfo(amd::hsa::loader::Symbol* sym) { amd_kernel_code_t* akc = &akc_; if (!setKernelCode(sym, akc)) { return false; } if (!sym->GetInfo(HSA_EXT_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT_SIZE, reinterpret_cast(&codeSize_))) { return false; } // Setup the the workgroup info setWorkGroupInfo(akc->workitem_private_segment_byte_size, akc->workgroup_group_segment_byte_size, akc->wavefront_sgpr_count, akc->workitem_vgpr_count); workgroupGroupSegmentByteSize_ = workGroupInfo_.usedLDSSize_; kernargSegmentByteSize_ = akc->kernarg_segment_byte_size; return true; } HSAILKernel::HSAILKernel(std::string name, HSAILProgram* prog, std::string compileOptions) : device::Kernel(prog->dev(), name, *prog), compileOptions_(compileOptions), index_(0), code_(0), codeSize_(0), workgroupGroupSegmentByteSize_(0), kernargSegmentByteSize_(0) { flags_.hsa_ = true; } HSAILKernel::~HSAILKernel() {} bool HSAILKernel::init(amd::hsa::loader::Symbol* sym, bool finalize) { #if defined(WITH_COMPILER_LIB) acl_error error = ACL_SUCCESS; std::string openClKernelName = openclMangledName(name()); flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; // compile kernel down to ISA if (finalize) { std::string options(compileOptions_.c_str()); options.append(" -just-kernel="); options.append(openClKernelName.c_str()); // Append an option so that we can selectively enable a SCOption on CZ // whenever IOMMUv2 is enabled. if (dev().settings().svmFineGrainSystem_) { options.append(" -sc-xnack-iommu"); } error = aclCompile(dev().compiler(), prog().binaryElf(), options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr); buildLog_ += aclGetCompilerLog(dev().compiler()); if (error != ACL_SUCCESS) { LogError("Failed to finalize kernel"); return false; } } aqlCreateHWInfo(sym); // Pull out metadata from the ELF size_t sizeOfArgList; error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; } char* aclArgList = new char[sizeOfArgList]; if (nullptr == aclArgList) { return false; } error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_ARGUMENT_ARRAY, openClKernelName.c_str(), aclArgList, &sizeOfArgList); if (error != ACL_SUCCESS) { return false; } // Set the argList InitParameters(reinterpret_cast(aclArgList), argsBufferSize()); delete[] aclArgList; size_t sizeOfWorkGroupSize; error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), nullptr, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE, openClKernelName.c_str(), workGroupInfo_.compileSize_, &sizeOfWorkGroupSize); if (error != ACL_SUCCESS) { return false; } // Copy wavefront size workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; // Find total workgroup size if (workGroupInfo_.compileSize_[0] != 0) { workGroupInfo_.size_ = workGroupInfo_.compileSize_[0] * workGroupInfo_.compileSize_[1] * workGroupInfo_.compileSize_[2]; } else { workGroupInfo_.size_ = dev().info().preferredWorkGroupSize_; } // Pull out printf metadata from the ELF size_t sizeOfPrintfList; error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), nullptr, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; } // Make sure kernel has any printf info if (0 != sizeOfPrintfList) { char* aclPrintfList = new char[sizeOfPrintfList]; if (nullptr == aclPrintfList) { return false; } error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_GPU_PRINTF_ARRAY, openClKernelName.c_str(), aclPrintfList, &sizeOfPrintfList); if (error != ACL_SUCCESS) { return false; } // Set the PrintfList InitPrintf(reinterpret_cast(aclPrintfList)); delete[] aclPrintfList; } aclMetadata md; md.enqueue_kernel = false; size_t sizeOfDeviceEnqueue = sizeof(md.enqueue_kernel); error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_DEVICE_ENQUEUE, openClKernelName.c_str(), &md.enqueue_kernel, &sizeOfDeviceEnqueue); if (error != ACL_SUCCESS) { return false; } flags_.dynamicParallelism_ = md.enqueue_kernel; md.kernel_index = -1; size_t sizeOfIndex = sizeof(md.kernel_index); error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_KERNEL_INDEX, openClKernelName.c_str(), &md.kernel_index, &sizeOfIndex); if (error != ACL_SUCCESS) { return false; } index_ = md.kernel_index; size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); if (error != ACL_SUCCESS) { return false; } waveLimiter_.enable(); size_t sizeOfWorkGroupSizeHint = sizeof(workGroupInfo_.compileSizeHint_); error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WORK_GROUP_SIZE_HINT, openClKernelName.c_str(), workGroupInfo_.compileSizeHint_, &sizeOfWorkGroupSizeHint); if (error != ACL_SUCCESS) { return false; } size_t sizeOfVecTypeHint; error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, openClKernelName.c_str(), NULL, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; } if (0 != sizeOfVecTypeHint) { char* VecTypeHint = new char[sizeOfVecTypeHint + 1]; if (NULL == VecTypeHint) { return false; } error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_VEC_TYPE_HINT, openClKernelName.c_str(), VecTypeHint, &sizeOfVecTypeHint); if (error != ACL_SUCCESS) { return false; } VecTypeHint[sizeOfVecTypeHint] = '\0'; workGroupInfo_.compileVecTypeHint_ = std::string(VecTypeHint); delete[] VecTypeHint; } #endif // defined(WITH_COMPILER_LIB) return true; } const Device& HSAILKernel::dev() const { return reinterpret_cast(dev_); } const HSAILProgram& HSAILKernel::prog() const { return reinterpret_cast(prog_); } hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments(VirtualGPU& gpu, const amd::Kernel& kernel, const amd::NDRangeContainer& sizes, const_address params, size_t ldsAddress, uint64_t vmDefQueue, uint64_t* vmParentWrap) const { const_address parameters = params; uint64_t argList; address aqlArgBuf = gpu.managedBuffer().reserve( argsBufferSize() + sizeof(hsa_kernel_dispatch_packet_t), &argList); gpu.addVmMemory(gpu.managedBuffer().activeMemory()); if (dynamicParallelism()) { // Provide the host parent AQL wrap object to the kernel AmdAqlWrap wrap = {}; wrap.state = AQL_WRAP_BUSY; *vmParentWrap = gpu.cb(1)->UploadDataToHw(&wrap, sizeof(AmdAqlWrap)); gpu.addVmMemory(gpu.cb(1)->ActiveMemory()); } // The check below handles a special case of single context with multiple devices // when the devices use different compilers(HSAIL and LC) and have different signatures const amd::KernelSignature& signature = (this->signature().version() == kernel.signature().version()) ? kernel.signature() : this->signature(); // If signatures don't match, then patch the parameters if (signature.version() != kernel.signature().version()) { WriteAqlArgAt(aqlArgBuf, parameters, signature.paramsSize() - signature.at(0).offset_, signature.at(0).offset_); parameters = aqlArgBuf; } // Check if runtime has to setup hidden arguments for (uint32_t i = signature.numParameters(); i < signature.numParametersAll(); ++i) { const auto it = signature.at(i); size_t offset; switch (it.info_.oclObject_) { case amd::KernelParameterDescriptor::HiddenNone: // void* zero = 0; // WriteAqlArgAt(const_cast
(parameters), &zero, it.size_, it.offset_); break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetX: offset = sizes.offset()[0]; WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetY: if (sizes.dimensions() >= 2) { offset = sizes.offset()[1]; WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenGlobalOffsetZ: if (sizes.dimensions() >= 3) { offset = sizes.offset()[2]; WriteAqlArgAt(const_cast
(parameters), &offset, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenPrintfBuffer: if ((printfInfo().size() > 0) && // and printf buffer was allocated (gpu.printfDbgHSA().dbgBuffer() != nullptr)) { // and set the fourth argument as the printf_buffer pointer size_t bufferPtr = static_cast(gpu.printfDbgHSA().dbgBuffer()->vmAddress()); gpu.addVmMemory(gpu.printfDbgHSA().dbgBuffer()); WriteAqlArgAt(const_cast
(parameters), &bufferPtr, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenDefaultQueue: if (vmDefQueue != 0) { WriteAqlArgAt(const_cast
(parameters), &vmDefQueue, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenCompletionAction: if (*vmParentWrap != 0) { WriteAqlArgAt(const_cast
(parameters), vmParentWrap, it.size_, it.offset_); } break; case amd::KernelParameterDescriptor::HiddenMultiGridSync: break; } } // Load all kernel arguments if (signature.version() == kernel.signature().version()) { WriteAqlArgAt(aqlArgBuf, parameters, argsBufferSize(), 0); } // Note: In a case of structs the size won't match, // since HSAIL compiler expects a reference... assert(argsBufferSize() <= signature.paramsSize() && "A mismatch of sizes of arguments between compiler and runtime!"); // hsa_kernel_dispatch_packet_t disp; hsa_kernel_dispatch_packet_t* hsaDisp = reinterpret_cast(gpu.cb(0)->SysMemCopy()); amd::NDRange local(sizes.local()); const amd::NDRange& global = sizes.global(); // Check if runtime has to find local workgroup size FindLocalWorkSize(sizes.dimensions(), sizes.global(), local); constexpr uint16_t kDispatchPacketHeader = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | (1 << HSA_PACKET_HEADER_BARRIER) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); hsaDisp->header = kDispatchPacketHeader; hsaDisp->setup = sizes.dimensions(); hsaDisp->workgroup_size_x = local[0]; hsaDisp->workgroup_size_y = (sizes.dimensions() > 1) ? local[1] : 1; hsaDisp->workgroup_size_z = (sizes.dimensions() > 2) ? local[2] : 1; hsaDisp->grid_size_x = global[0]; hsaDisp->grid_size_y = (sizes.dimensions() > 1) ? global[1] : 1; hsaDisp->grid_size_z = (sizes.dimensions() > 2) ? global[2] : 1; hsaDisp->reserved2 = 0; // Initialize kernel ISA and execution buffer requirements hsaDisp->private_segment_size = spillSegSize(); hsaDisp->group_segment_size = ldsAddress; hsaDisp->kernel_object = gpuAqlCode(); hsaDisp->kernarg_address = reinterpret_cast(argList); hsaDisp->reserved2 = 0; hsaDisp->completion_signal.handle = 0; memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t)); if (AMD_HSA_BITS_GET(akc_.kernel_code_properties, AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { gpu.addVmMemory(gpu.hsaQueueMem()); } return hsaDisp; } const LightningProgram& LightningKernel::prog() const { return reinterpret_cast(prog_); } #if defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) static const KernelMD* FindKernelMetadata(const CodeObjectMD* programMD, const std::string& name) { for (const KernelMD& kernelMD : programMD->mKernels) { if (kernelMD.mName == name) { return &kernelMD; } } return nullptr; } #endif // defined(WITH_LIGHTNING_COMPILER) || defined(USE_COMGR_LIBRARY) #if defined(USE_COMGR_LIBRARY) bool LightningKernel::init() { flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; const amd_comgr_metadata_node_t* kernelMetaNode = prog().getKernelMetadata(name()); if (kernelMetaNode == nullptr) { return false; } KernelMD kernelMD; if (!GetAttrCodePropMetadata(*kernelMetaNode, &kernelMD)) { return false; } symbolName_ = (codeObjectVer() == 2) ? name() : kernelMD.mSymbolName; workgroupGroupSegmentByteSize_ = kernelMD.mCodeProps.mGroupSegmentFixedSize; kernargSegmentByteSize_ = kernelMD.mCodeProps.mKernargSegmentSize; // Copy codeobject of this kernel from the program CPU segment hsa_agent_t agent; agent.handle = 1; auto sym = prog().GetSymbol(symbolName().c_str(), const_cast(&agent)); if (!setKernelCode(sym, &akc_)) { return false; } if (!prog().isNull()) { codeSize_ = prog().codeSegGpu().owner()->getSize(); // handle device enqueue if (!kernelMD.mAttrs.mRuntimeHandle.empty()) { hsa_agent_t agent; agent.handle = 1; amd::hsa::loader::Symbol* rth_symbol; // Get the runtime handle symbol GPU address rth_symbol = prog().GetSymbol(const_cast(kernelMD.mAttrs.mRuntimeHandle.c_str()), const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); // Copy the kernel_object pointer to the runtime handle symbol GPU address const Memory& codeSegGpu = prog().codeSegGpu(); uint64_t offset = symbol_address - codeSegGpu.vmAddress(); uint64_t kernel_object = gpuAqlCode(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()}; codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true); } } // Setup the the workgroup info setWorkGroupInfo(kernelMD.mCodeProps.mPrivateSegmentFixedSize, kernelMD.mCodeProps.mGroupSegmentFixedSize, kernelMD.mCodeProps.mNumSGPRs, kernelMD.mCodeProps.mNumVGPRs); // Copy wavefront size workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; workGroupInfo_.size_ = kernelMD.mCodeProps.mMaxFlatWorkGroupSize; if (workGroupInfo_.size_ == 0) { return false; } // handle the printf metadata if any std::vector printfStr; if (!GetPrintfStr(&printfStr)) { return false; } if (!printfStr.empty()) { InitPrintf(printfStr); } return true; } #endif // defined(USE_COMGR_LIBRARY) bool LightningKernel::init(amd::hsa::loader::Symbol* symbol) { #if defined(WITH_LIGHTNING_COMPILER) && !defined(USE_COMGR_LIBRARY) flags_.internalKernel_ = (compileOptions_.find("-cl-internal-kernel") != std::string::npos) ? true : false; aqlCreateHWInfo(symbol); const CodeObjectMD* programMD = prog().metadata(); assert(programMD != nullptr); const KernelMD* kernelMD = FindKernelMetadata(programMD, name()); if (kernelMD == nullptr) { return false; } // Set the argList InitParameters(*kernelMD, argsBufferSize()); if (!kernelMD->mAttrs.mReqdWorkGroupSize.empty()) { const auto& requiredWorkgroupSize = kernelMD->mAttrs.mReqdWorkGroupSize; workGroupInfo_.compileSize_[0] = requiredWorkgroupSize[0]; workGroupInfo_.compileSize_[1] = requiredWorkgroupSize[1]; workGroupInfo_.compileSize_[2] = requiredWorkgroupSize[2]; } if (!kernelMD->mAttrs.mWorkGroupSizeHint.empty()) { const auto& workgroupSizeHint = kernelMD->mAttrs.mWorkGroupSizeHint; workGroupInfo_.compileSizeHint_[0] = workgroupSizeHint[0]; workGroupInfo_.compileSizeHint_[1] = workgroupSizeHint[1]; workGroupInfo_.compileSizeHint_[2] = workgroupSizeHint[2]; } if (!kernelMD->mAttrs.mVecTypeHint.empty()) { workGroupInfo_.compileVecTypeHint_ = kernelMD->mAttrs.mVecTypeHint.c_str(); } if (!kernelMD->mAttrs.mRuntimeHandle.empty()) { hsa_agent_t agent; agent.handle = 1; amd::hsa::loader::Symbol* rth_symbol; // Get the runtime handle symbol GPU address rth_symbol = prog().GetSymbol(const_cast(kernelMD->mAttrs.mRuntimeHandle.c_str()), const_cast(&agent)); uint64_t symbol_address; rth_symbol->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &symbol_address); // Copy the kernel_object pointer to the runtime handle symbol GPU address const Memory& codeSegGpu = prog().codeSegGpu(); uint64_t offset = symbol_address - codeSegGpu.vmAddress(); VirtualGPU* gpu = codeSegGpu.dev().xferQueue(); const struct RuntimeHandle runtime_handle = {gpuAqlCode(), spillSegSize(), ldsSize()}; codeSegGpu.writeRawData(*gpu, offset, sizeof(runtime_handle), &runtime_handle, true); } // Copy wavefront size workGroupInfo_.wavefrontSize_ = dev().info().wavefrontWidth_; workGroupInfo_.size_ = kernelMD->mCodeProps.mMaxFlatWorkGroupSize; if (workGroupInfo_.size_ == 0) { return false; } InitPrintf(programMD->mPrintf); /*FIXME_lmoriche: size_t sizeOfWavesPerSimdHint = sizeof(workGroupInfo_.wavesPerSimdHint_); error = aclQueryInfo(dev().compiler(), prog().binaryElf(), RT_WAVES_PER_SIMD_HINT, openClKernelName.c_str(), &workGroupInfo_.wavesPerSimdHint_, &sizeOfWavesPerSimdHint); if (error != ACL_SUCCESS) { return false; } waveLimiter_.enable(); */ #endif // defined(WITH_LIGHTNING_COMPILER) && ! defined(USE_COMGR_LIBRARY) return true; } } // namespace pal