diff --git a/rocclr/runtime/device/gpu/gpudebugger.hpp b/rocclr/runtime/device/gpu/gpudebugger.hpp index 34a78b50d0..2098808c6e 100644 --- a/rocclr/runtime/device/gpu/gpudebugger.hpp +++ b/rocclr/runtime/device/gpu/gpudebugger.hpp @@ -18,6 +18,7 @@ #include "sc-hsa/Interface/SCHSAInterface.h" #include "device/device.hpp" #include "device/hwdebug.hpp" +#include "acl.h" static const int NumberReserveVgprs = 4; @@ -93,6 +94,8 @@ struct DebugToolInfo amd::Memory* trapHandler_; //! Trap handler address amd::Memory* trapBuffer_; //! Trap buffer address bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled + aclBinary* aclBinary_; //! pointer of the kernel ACL binary + amd::Event* event_; //! pointer of the kernel event in the enqueue command }; /*! \brief Message used by the KFD wave control for CI diff --git a/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/rocclr/runtime/device/gpu/gpudebugmanager.cpp index 426f58e13e..c08ffcf1ce 100644 --- a/rocclr/runtime/device/gpu/gpudebugmanager.cpp +++ b/rocclr/runtime/device/gpu/gpudebugmanager.cpp @@ -85,6 +85,9 @@ GpuDebugManager::executePreDispatchCallBack(void* aqlPacket, // // for invalidate cache (BuildEndOfKernelNotifyCommands) // aqlPacket->release_fence_scope = 2; + aclBinary_ = reinterpret_cast(info->aclBinary_); + oclEventHandle_ = reinterpret_cast(as_cl(info->event_)); + cl_device_id clDeviceId = as_cl(device_); preDispatchCallBackFunc_(clDeviceId, oclEventHandle_, diff --git a/rocclr/runtime/device/gpu/gpuvirtual.cpp b/rocclr/runtime/device/gpu/gpuvirtual.cpp index cad8d7e4dc..f4e656413e 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -1627,7 +1627,8 @@ VirtualGPU::submitKernel(amd::NDRangeKernelCommand& vcmd) profilingBegin(vcmd); // Submit kernel to HW - if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false)) { + if (!submitKernelInternal(vcmd.sizes(), vcmd.kernel(), vcmd.parameters(), false, + &vcmd.event())) { vcmd.setStatus(CL_INVALID_OPERATION); } @@ -1639,7 +1640,8 @@ VirtualGPU::submitKernelInternalHSA( const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, - bool nativeMem) + bool nativeMem, + amd::Event* enqueueEvent) { uint64_t vmParentWrap = 0; uint64_t vmDefQueue = 0; @@ -1766,7 +1768,7 @@ VirtualGPU::submitKernelInternalHSA( HwDbgKernelInfo *pKernelInfo = NULL; if (useHwDebug_) { - buildKernelInfo(hsaKernel, aqlPkt, kernelInfo); + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo, enqueueEvent); pKernelInfo = &kernelInfo; } @@ -1982,7 +1984,8 @@ VirtualGPU::submitKernelInternal( const amd::NDRangeContainer& sizes, const amd::Kernel& kernel, const_address parameters, - bool nativeMem) + bool nativeMem, + amd::Event* enqueueEvent) { bool result = true; uint i; @@ -1999,7 +2002,7 @@ VirtualGPU::submitKernelInternal( Kernel& gpuKernelOpt = static_cast(*devKernel); if (gpuKernelOpt.hsa()) { - return submitKernelInternalHSA(sizes, kernel, parameters, nativeMem); + return submitKernelInternalHSA(sizes, kernel, parameters, nativeMem, enqueueEvent); } else if (state_.hsailKernel_) { // Reload GSL state to HW, so runtime could run AMDIL kernel @@ -3458,7 +3461,8 @@ VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) void VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, hsa_kernel_dispatch_packet_t* aqlPkt, - HwDbgKernelInfo& kernelInfo) + HwDbgKernelInfo& kernelInfo, + amd::Event* enqueueEvent) { amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); assert (dbgManager && "No HW Debug Manager!"); @@ -3517,6 +3521,8 @@ VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; + dbgSetting.aclBinary_ = hsaKernel.prog().binaryElf(); + dbgSetting.event_ = enqueueEvent; // Call the predispatch callback function & set the trap info AqlCodeInfo aqlCodeInfo; diff --git a/rocclr/runtime/device/gpu/gpuvirtual.hpp b/rocclr/runtime/device/gpu/gpuvirtual.hpp index 5585f51823..76d156cfac 100644 --- a/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -223,13 +223,15 @@ public: const amd::NDRangeContainer& sizes, //!< Workload sizes const amd::Kernel& kernel, //!< Kernel for execution const_address parameters, //!< Parameters for the kernel - bool nativeMem = true //!< Native memory objects + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command ); bool submitKernelInternalHSA( const amd::NDRangeContainer& sizes, //!< Workload sizes const amd::Kernel& kernel, //!< Kernel for execution const_address parameters, //!< Parameters for the kernel - bool nativeMem = true //!< Native memory objects + bool nativeMem = true, //!< Native memory objects + amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command ); void submitNativeFn(amd::NativeFnCommand& vcmd); void submitFillMemory(amd::FillMemoryCommand& vcmd); @@ -505,7 +507,8 @@ private: void buildKernelInfo( const HSAILKernel& hsaKernel, //!< hsa kernel hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch - HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch + amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command ); void assignTrapHandler(