diff --git a/projects/clr/rocclr/cmake/ROCclrPAL.cmake b/projects/clr/rocclr/cmake/ROCclrPAL.cmake index 5702b7877a..94fc2e74b5 100644 --- a/projects/clr/rocclr/cmake/ROCclrPAL.cmake +++ b/projects/clr/rocclr/cmake/ROCclrPAL.cmake @@ -20,7 +20,7 @@ set(PAL_CLIENT "OCL") -set(PAL_CLIENT_INTERFACE_MAJOR_VERSION 916) +set(PAL_CLIENT_INTERFACE_MAJOR_VERSION 932) set(GPUOPEN_CLIENT_INTERFACE_MAJOR_VERSION 42) set(GPUOPEN_CLIENT_INTERFACE_MINOR_VERSION 0) set(AMD_DK_ROOT $ENV{DK_ROOT}) diff --git a/projects/clr/rocclr/device/devkernel.hpp b/projects/clr/rocclr/device/devkernel.hpp index 0a4fc1edd6..eca2963a17 100644 --- a/projects/clr/rocclr/device/devkernel.hpp +++ b/projects/clr/rocclr/device/devkernel.hpp @@ -336,6 +336,7 @@ class Kernel : public amd::HeapObject { const uint32_t WorkitemPrivateSegmentByteSize() const { return workitemPrivateSegmentByteSize_; } void SetWorkitemPrivateSegmentByteSize(uint32_t size) { workitemPrivateSegmentByteSize_ = size; } + const bool KernalHasDynamicCallStack() const { return kernelHasDynamicCallStack_; } /*!< Returns the kernelHasDynamicCallStack_ flag */ const uint32_t KernargSegmentByteSize() const { return kernargSegmentByteSize_; } diff --git a/projects/clr/rocclr/device/pal/palkernel.cpp b/projects/clr/rocclr/device/pal/palkernel.cpp index 470851282d..8ae1ea278d 100644 --- a/projects/clr/rocclr/device/pal/palkernel.cpp +++ b/projects/clr/rocclr/device/pal/palkernel.cpp @@ -483,8 +483,10 @@ hsa_kernel_dispatch_packet_t* HSAILKernel::loadArguments( hsaDisp->completion_signal.handle = 0; memcpy(aqlArgBuf + argsBufferSize(), hsaDisp, sizeof(hsa_kernel_dispatch_packet_t)); - if (AMD_HSA_BITS_GET(akc_.kernel_code_properties, - AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_QUEUE_PTR)) { + static_assert(offsetof(amd_kernel_code_t, kernel_code_properties) == + offsetof(llvm::amdhsa::kernel_descriptor_t, kernel_code_properties)); /* layout guard: kernel_code_properties must sit at the same offset in both structs; presumably needed because the check below always reads it through akd_ */ + if (AMD_HSA_BITS_GET(akd_.kernel_code_properties, + llvm::amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR)) { gpu.addVmMemory(gpu.hsaQueueMem()); } @@ -511,7
+513,7 @@ bool LightningKernel::postLoad() { auto sym = prog().getSymbol(symbolName().c_str(), &agent); - if (!setKernelCode(sym, &akc_)) { + if (!setKernelDescriptor(sym, &akd_)) { return false; } if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_DYNAMIC_CALLSTACK, @@ -571,6 +573,26 @@ bool LightningKernel::postLoad() { return true; } + +//! Queries the kernel object address of \p sym into code_ and copies +//! sizeof(kernel_descriptor_t) bytes from prog().findHostKernelAddress(code_) into \p akd. +//! Returns false when \p sym is null or the kernel object query fails, true otherwise. +bool LightningKernel::setKernelDescriptor(amd::hsa::loader::Symbol* sym, + llvm::amdhsa::kernel_descriptor_t* akd) { + if (!sym) { + return false; + } + if (!sym->GetInfo(HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, reinterpret_cast<void*>(&code_))) { + return false; + } + + // Copy the kernel descriptor of this kernel from the program CPU segment + memcpy(akd, reinterpret_cast<const void*>(prog().findHostKernelAddress(code_)), + sizeof(llvm::amdhsa::kernel_descriptor_t)); + + return true; +} + #endif // defined(USE_COMGR_LIBRARY) } // namespace amd::pal diff --git a/projects/clr/rocclr/device/pal/palkernel.hpp b/projects/clr/rocclr/device/pal/palkernel.hpp index e2625f817f..6cc5859e10 100644 --- a/projects/clr/rocclr/device/pal/palkernel.hpp +++ b/projects/clr/rocclr/device/pal/palkernel.hpp @@ -29,6 +29,7 @@ #include "device/pal/paldevice.hpp" #include "device/pal/palvirtual.hpp" #include "amd_hsa_kernel_code.h" +#include "AMDHSAKernelDescriptor.h" #include "device/pal/palprintf.hpp" #include "hsa.h" @@ -87,6 +88,9 @@ class HSAILKernel : public device::Kernel { //! Returns pointer on CPU to AQL code info const amd_kernel_code_t* cpuAqlCode() const { return &akc_; } + //! Returns pointer on CPU to AQL kernel descriptor info + const llvm::amdhsa::kernel_descriptor_t* cpuAqlKd() const { return &akd_; } + //!
Returns memory object with AQL code uint64_t gpuAqlCode() const { return code_; } @@ -130,8 +134,11 @@ class HSAILKernel : public device::Kernel { void setWorkGroupInfo(const uint32_t privateSegmentSize, const uint32_t groupSegmentSize, const uint16_t numSGPRs, const uint16_t numVGPRs); - amd_kernel_code_t akc_; //!< AQL kernel code on CPU - uint index_; //!< Kernel index in the program + union { /* anonymous union: akc_ and akd_ occupy the same storage */ + amd_kernel_code_t akc_; //!< AQL kernel code on CPU, used by HSAIL + llvm::amdhsa::kernel_descriptor_t akd_; //!< AQL kernel descriptor on CPU, used by LC + }; + uint index_; //!< Kernel index in the program uint64_t code_; //!< GPU memory pointer to the kernel size_t codeSize_; //!< Size of ISA code @@ -146,6 +153,8 @@ class LightningKernel : public HSAILKernel { const LightningProgram& prog() const; #if defined(USE_COMGR_LIBRARY) + //! Copies the kernel descriptor of sym from the program CPU segment into akd; false on failure + bool setKernelDescriptor(amd::hsa::loader::Symbol* sym, llvm::amdhsa::kernel_descriptor_t* akd); //! Initializes the metadata required for this kernel bool init(); diff --git a/projects/clr/rocclr/device/pal/palvirtual.cpp b/projects/clr/rocclr/device/pal/palvirtual.cpp index 3bd4e60889..8f813a22a2 100644 --- a/projects/clr/rocclr/device/pal/palvirtual.cpp +++ b/projects/clr/rocclr/device/pal/palvirtual.cpp @@ -2718,7 +2718,7 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes, dispatchParam.scratchOffset = scratch->offset_; dispatchParam.workitemPrivateSegmentSize = privateMemSize; } - dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlCode(); + dispatchParam.pCpuAqlCode = hsaKernel.cpuAqlKd(); dispatchParam.hsaQueueVa = hsaQueueMem_->vmAddress(); if (!hsaKernel.prog().isLC() && hsaKernel.workGroupInfo()->wavesPerSimdHint_ != 0) { constexpr uint32_t kWavesPerSimdLimit = 4;