diff --git a/projects/clr/rocclr/runtime/device/device.cpp b/projects/clr/rocclr/runtime/device/device.cpp index 674611d5d6..22fb11b9d5 100644 --- a/projects/clr/rocclr/runtime/device/device.cpp +++ b/projects/clr/rocclr/runtime/device/device.cpp @@ -521,6 +521,7 @@ Settings::Settings() waitCommand_ = AMD_OCL_WAIT_COMMAND; supportDepthsRGB_ = false; assumeAliases_ = false; + enableHwDebug_ = false; } bool diff --git a/projects/clr/rocclr/runtime/device/device.hpp b/projects/clr/rocclr/runtime/device/device.hpp index f9e191f28c..9d2e67cae7 100644 --- a/projects/clr/rocclr/runtime/device/device.hpp +++ b/projects/clr/rocclr/runtime/device/device.hpp @@ -63,7 +63,6 @@ class SvmFillMemoryCommand; class SvmMapMemoryCommand; class SvmUnmapMemoryCommand; class HwDebugManager; -class RunHwDbgCommand; class Device; struct KernelParameterDescriptor; struct Coord3D; @@ -500,7 +499,7 @@ struct Info : public amd::EmbeddedObject //! List of supported video attributes (profile/format pairs) cl_video_attrib_amd* videoAttribs_; cl_uint numVideoAttribs_; - //Encoder + //Encoder cl_video_attrib_encode_amd* videoEncAttribs_; cl_uint numVideoEncAttribs_; #endif //cl_amd_open_video @@ -574,9 +573,6 @@ struct Info : public amd::EmbeddedObject //! The maximum size of global scope variables size_t maxGlobalVariableSize_; size_t globalVariablePreferredTotalSize_; - - //! Enable HW Debug support - cl_bool enableHwDebug_; }; //! 
Device settings @@ -586,7 +582,7 @@ public: uint64_t extensions_; //!< Supported OCL extensions union { struct { - uint partialDispatch_: 1; //!< Enables partial dispatch + uint partialDispatch_: 1; //!< Enables partial dispatch uint supportRA_: 1; //!< Support RA channel order format uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc) uint waitCommand_: 1; //!< Enables a wait for every submitted command @@ -594,7 +590,8 @@ public: // that replaces generic OS allocation routines uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format uint assumeAliases_: 1; //!< Assume aliases in the compilation process - uint reserved_: 25; + uint enableHwDebug_: 1; //!< Enable HW debug support + uint reserved_: 24; }; uint value_; }; @@ -776,8 +773,8 @@ protected: volatile size_t version_; //!< The version we're currently shadowing - //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer), - //! not a physical map. When a memory object does not use USE_HOST_PTR we + //! NB, the map data below is for an API-level map (from clEnqueueMapBuffer), + //! not a physical map. When a memory object does not use USE_HOST_PTR we //! can use a remote resource and DMA, avoiding the additional CPU memcpy. amd::Memory* mapMemory_; //!< Memory used as map target buffer volatile size_t indirectMapCount_; //!< Number of maps @@ -898,7 +895,7 @@ public: workGroupInfo_.compileSize_[1] = y; workGroupInfo_.compileSize_[2] = z; } - + size_t getReqdWorkGroupSize(int dim) { return workGroupInfo_.compileSize_[dim]; } @@ -1139,11 +1136,11 @@ public: never called in storing routines */ bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false); - //! setin elfIn_ + //! setin elfIn_ bool setElfIn(unsigned char eclass); void resetElfIn(); - //! set out elf + //! 
set out elf bool setElfOut(unsigned char eclass, const char* outFile); void resetElfOut(); @@ -1232,7 +1229,7 @@ public: // Return the encrypt code for this input binary ( "> 0" means encrypted) int getEncryptCode() { return encryptCode_; } - + // Returns TRUE of binary file is SPIR bool isSPIR() const; protected: @@ -1413,9 +1410,6 @@ public: virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0; virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0; virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0; -#if 0 // exclude this until more HW DEBUG codes are submitted - virtual void submitHwDbgCommand(amd::RunHwDbgCommand& cmd) = 0; -#endif //! Get the blit manager object device::BlitManager& blitMgr() const { return *blitMgr_; } @@ -1698,6 +1692,9 @@ public: //! Initialize the Hardware Debug Manager virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; } + //! Remove the Hardware Debug Manager + virtual void hwDebugManagerRemove() {} + protected: //! Enable the specified extension char* getExtensionString(); diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp new file mode 100644 index 0000000000..34a78b50d0 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugger.hpp @@ -0,0 +1,127 @@ +/******************************************************************************* + * + * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished) + * + * All rights reserved. This notice is intended as a precaution against + * inadvertent publication and does not imply publication or any waiver + * of confidentiality. The year included in the foregoing notice is the + * year of creation of the work. 
+ * + ******************************************************************************/ + +#ifndef HWDBG_GPUDEBGGER_H_ +#define HWDBG_GPUDEBGGER_H_ + +#include +#include +#include "hsa.h" +#include "sc-hsa/Interface/SCHSAInterface.h" +#include "device/device.hpp" +#include "device/hwdebug.hpp" + +static const int NumberReserveVgprs = 4; + +namespace gpu { + +/** + * \defgroup Services_API OCL Runtime Services API + * @{ + */ + + +/*! \brief Dispatch packet information + * + * This structure contains the packet information for kernel dispatch + */ +struct PacketAmdInfo +{ + uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid + uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer + void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA + size_t sizeOfIsaBuffer_; //!< size of the ISA buffer + uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel + uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel + size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel +}; + +/*! \brief Cache mask for invalidation + */ +struct HwDbgGpuCacheMask +{ + HwDbgGpuCacheMask() :ui32All_(0) {} + + HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {} + + union { + struct { + uint32_t sqICache_ : 1; //!< Instruction cache + uint32_t sqKCache_ : 1; //!< Data cache + uint32_t tcL1_ : 1; //!< tcL1 cache + uint32_t tcL2_ : 1; //!< tcL2 cache + uint32_t reserved_ : 28; + }; + uint32_t ui32All_; + }; +}; + +/*! \brief Address watch information + * + * Information about each watch point - address, mask, mode and event + */ +struct HwDbgAddressWatch +{ + void* watchAddress_; //!< The address of watch point + uint64_t watchMask_; //!< The mask for watch point (lower 24 bits) + cl_dbg_address_watch_mode_amd watchMode_; //!< The watch mode for this watch + DebugEvent event_; //!< Event of the watch point (not used for now) +}; + +/*! 
\brief Runtime structure used to communicate debug information + * between Ocl services and core for a kernel dispatch. + */ +struct DebugToolInfo +{ + uint64_t scratchAddress_; //!< Scratch memory address + size_t scratchSize_; //!< Scratch memory size + uint64_t globalAddress_; //!< Global memory address + uint32_t cacheDisableMask_; //!< Cache mask, indicating caches disabled + uint32_t exceptionMask_; //!< Exception mask + uint32_t reservedCuNum_; //!< Number of reserved CUs for display, + //!< which ranges from 0 to 7 in the current implementation. + bool monitorMode_; //!< Debug or profiler mode + bool gpuSingleStepMode_; //!< SQ debug mode + amd::Memory* trapHandler_; //!< Trap handler address + amd::Memory* trapBuffer_; //!< Trap buffer address + bool sqPerfcounterEnable_; //!< whether SQ perf counters are enabled +}; + +/*! \brief Message used by the KFD wave control for CI + * + * Structure indicates the various information used by the wave control function. + */ +struct HwDebugWaveAddr +{ + uint32_t VMID_ : 4; //!< Virtual memory id + uint32_t wave_ : 4; //!< Wave id + uint32_t SIMD_ : 2; //!< SIMD id + uint32_t CU_ : 4; //!< Compute unit + uint32_t SH_ : 1; //!< Shader array + uint32_t SE_ : 1; //!< Shader engine +}; + +/*! \brief Kernel code information +* +* This structure contains the pointer of mapped kernel code for host access +* and its size (in bytes) +*/ +struct AqlCodeInfo +{ + amd_kernel_code_t * aqlCode_; //!< pointer of AQL code to allow host access + uint32_t aqlCodeSize_; //!< 
size of AQL code +}; + +/**@}*/ + +} // namespace gpu + +#endif // HWDBG_GPUDEBGGER_H_ diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp new file mode 100644 index 0000000000..426f58e13e --- /dev/null +++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.cpp @@ -0,0 +1,361 @@ +/******************************************************************************* + * + * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished) + * + * All rights reserved. This notice is intended as a precaution against + * inadvertent publication and does not imply publication or any waiver + * of confidentiality. The year included in the foregoing notice is the + * year of creation of the work. + * + ******************************************************************************/ + +#include "gpudebugmanager.hpp" +#include "gpudevice.hpp" +#include "platform/commandqueue.hpp" + +#include "device/device.hpp" +#include "device/gpu/gpumemory.hpp" +#include +#include +#include + +namespace gpu { + +class VirtualGPU; +class Device; +class Memory; + +/* + *************************************************************************** + * Implementation of GPU Debug Manager class + *************************************************************************** + */ + +GpuDebugManager::GpuDebugManager(amd::Device* device) + : HwDebugManager(device) + , vGpu_(NULL) + , debugMessages_(0) + , addressWatch_(NULL) + , addressWatchSize_(0) + , oclEventHandle_(NULL) +{ + // Initialize the exception info and the kernel execution mode + excpPolicy_.exceptionMask = 0x0; + excpPolicy_.waveAction = CL_DBG_WAVES_RESUME; + excpPolicy_.hostAction = CL_DBG_HOST_IGNORE; + excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST; + + execMode_.ui32All = 0; + + rtTrapHandlerInfo_.trap_.trapHandler_ = NULL; + rtTrapHandlerInfo_.trap_.trapBuffer_ = NULL; + + aqlPacket_ = (hsa_kernel_dispatch_packet_t *) NULL; + + return; +} + 
+//! Release the address-watch storage owned by the manager
+GpuDebugManager::~GpuDebugManager()
+{
+    // deleting a NULL array pointer is a no-op, so no guard is required
+    delete [] addressWatch_;
+}
+
+//! Record the AQL packet of the dispatch, run the pre-dispatch callback
+//! (when one is set) and copy the current trap settings into \a toolInfo
+//!
+//! \param aqlPacket  the dispatch packet; kept in aqlPacket_ so that the
+//!                   post-dispatch callback can reach its completion signal
+//! \param toolInfo   treated as a DebugToolInfo*: its scratch/global addresses
+//!                   are read to build the memory descriptors, then the
+//!                   structure is filled in by setupTrapInformation()
+void
+GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
+                                            void* toolInfo)
+{
+    DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
+
+    aqlPacket_ = reinterpret_cast<const hsa_kernel_dispatch_packet_t*>(aqlPacket);
+
+    // Only if the pre-dispatch callback is set, will we update cache
+    // flush configuration and build the memory descriptor.
+    if (NULL != preDispatchCallBackFunc_) {
+        // Build the scratch memory descriptor
+        device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
+                                                       info->scratchAddress_,
+                                                       info->scratchSize_);
+
+        // Build the global memory descriptor
+        device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
+                                                    info->globalAddress_);
+
+//        // for invalidate cache (BuildEndOfKernelNotifyCommands)
+//        aqlPacket->release_fence_scope = 2;
+
+        cl_device_id clDeviceId = as_cl(device_);
+        preDispatchCallBackFunc_(clDeviceId,
+                                 oclEventHandle_,
+                                 aqlPacket_,
+                                 aclBinary_,
+                                 deviceTrapInfo_,
+                                 preDispatchCallBackArgs_);
+    }
+
+    // Copy the various info set by the debugger/profiler to the tool info structure
+    setupTrapInformation(info);
+}
+
+//! Run the post-dispatch callback, handing it the completion signal of the
+//! packet recorded by executePreDispatchCallBack()
+void
+GpuDebugManager::executePostDispatchCallBack()
+{
+    // aqlPacket_ starts out NULL and is only assigned by
+    // executePreDispatchCallBack(); without a recorded packet there is no
+    // completion signal to report, so do not dereference it.
+    if ((NULL == postDispatchCallBackFunc_) || (NULL == aqlPacket_)) {
+        return;
+    }
+
+    cl_device_id clDeviceId = as_cl(device_);
+    postDispatchCallBackFunc_(clDeviceId,
+                              aqlPacket_->completion_signal.handle,
+                              postDispatchCallBackArgs_);
+}
+
+
+//! Store the debugger's message buffer and context
+//!
+//! \return CL_SUCCESS, or CL_DEBUGGER_REGISTER_FAILURE_AMD when the
+//!         enableHwDebug_ device setting is off
+cl_int
+GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
+{
+    //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist
+
+    if (!device()->settings().enableHwDebug_) {
+        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // first time register - set the message storage, flush queue and enable hw debug
+    if (!isRegistered()) {
+        debugMessages_ = messageStorage;
+        dbgMsgBufferReady_ = true;
+        // stays false here: registerDebuggerOnQueue() sets the flag once a
+        // queue has accepted the message buffer
+        isRegistered_ = false;
+    }
+
+    context_ = context;
+
+    return CL_SUCCESS;
+}
+
+//! Drop the registration made by registerDebuggerOnQueue()
+void
+GpuDebugManager::unregisterDebugger()
+{
+    if (isRegistered()) {
+        //! @todo: release the global mutex of HW debug
+
+        // reset the debugger registration flag
+        isRegistered_ = false;
+        dbgMsgBufferReady_ = false;
+
+        context_ = NULL;
+    }
+}
+
+//! Hand the message buffer to the given queue, once per manager
+//!
+//! \return CL_SUCCESS when the debugger is (or already was) registered,
+//!         CL_DEBUGGER_REGISTER_FAILURE_AMD otherwise
+cl_int
+GpuDebugManager::registerDebuggerOnQueue(device::VirtualDevice* vDevice)
+{
+    if (!isMsgBufferReady()) {
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    if (isRegistered()) {       // The debugger has already been registered,
+        return CL_SUCCESS;      // nothing to be done
+    }
+
+    VirtualGPU* vGpu = reinterpret_cast<VirtualGPU*>(vDevice);
+    if (NULL == vGpu) {         // no queue to register on
+        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+    }
+
+    // populate the fields in the debugMessages structure used by the GPU exception notification
+    if (vGpu->RegisterHwDebugger(debugMessages_)) {
+        vGpu_ = vGpu;
+        isRegistered_ = true;
+        return CL_SUCCESS;
+    }
+
+    return CL_DEBUGGER_REGISTER_FAILURE_AMD;
+}
+
+//! Flush the caches selected by \a mask (bit layout of HwDbgGpuCacheMask)
+//! on the device's transfer queue
+void
+GpuDebugManager::flushCache(uint32_t mask)
+{
+    HwDbgGpuCacheMask cacheMask(mask);
+    device()->xferQueue()->flushCuCaches(cacheMask);
+}
+
+
+void +GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo) +{ + toolInfo->scratchAddress_ = 0; + toolInfo->scratchSize_ = 0; + toolInfo->globalAddress_ = 0; + toolInfo->sqPerfcounterEnable_ = false; + + // Set up trap related info in the kernel info structure to be + // used in the kernel dispatch. 
+ toolInfo->exceptionMask_ = excpPolicy_.exceptionMask; + toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode; + toolInfo->monitorMode_ = execMode_.monitorMode; + + // The order of these three bits is determined by the definition + // of the register COMPUTE_DISPATCH_INITIATOR + toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2) + | (execMode_.disableL2Cache << 1) + | (execMode_.disableL1Vector)); + + toolInfo->reservedCuNum_ = execMode_.reservedCuNum; + + toolInfo->trapHandler_ = + as_amd(reinterpret_cast(deviceTrapInfo_[kDebugTrapHandlerLocation])); + toolInfo->trapBuffer_ = + as_amd(reinterpret_cast(deviceTrapInfo_[kDebugTrapBufferLocation])); +}
+
+
+//! Fill a PacketAmdInfo from the host-mapped kernel code
+//!
+//! \param aqlCodeInfo  treated as a const AqlCodeInfo* (host pointer to the
+//!                     amd_kernel_code_t and the code size)
+//! \param packetInfo   treated as a PacketAmdInfo*; receives the register
+//!                     counts, the ISA location/size and the static group
+//!                     memory size
+void
+GpuDebugManager::getPacketAmdInfo(
+    const void* aqlCodeInfo,
+    void* packetInfo) const
+{
+    const AqlCodeInfo* codeInfo = reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
+    PacketAmdInfo* packet = reinterpret_cast<PacketAmdInfo*>(packetInfo);
+
+    if ((NULL == codeInfo) || (NULL == codeInfo->aqlCode_) || (NULL == packet)) {
+        LogError("debugmanager: Failed to get packet info - invalid argument");
+        return;
+    }
+
+    const amd_kernel_code_t* akc = codeInfo->aqlCode_;
+
+    packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
+    packet->numberOfVgprs_ = akc->workitem_vgpr_count;
+
+    // use mapped kernel_object_address for host accessing of ISA buffer
+    packet->pointerToIsaBuffer_ =
+        reinterpret_cast<char*>(const_cast<amd_kernel_code_t*>(akc))
+        + akc->kernel_code_entry_byte_offset;
+
+    packet->scratchBufferWaveOffset_ =
+        akc->debug_wavefront_private_segment_offset_sgpr;
+
+    packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
+
+    packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
+
+    // The trap_reserved_vgpr_index will be 4 less the original
+    // This value must be used only by the debugger.
+    // A kernel that uses fewer VGPRs than the reserved count has no valid
+    // index: report -1 (the "not valid" value documented on PacketAmdInfo)
+    // instead of letting the unsigned subtraction wrap around.
+    packet->trapReservedVgprIndex_ =
+        (akc->workitem_vgpr_count >= static_cast<uint32_t>(NumberReserveVgprs))
+            ? (akc->workitem_vgpr_count - NumberReserveVgprs)
+            : static_cast<uint32_t>(-1);
+}
+
+//! Create the OS event used for GPU exception notification and attach it
+//! to the registered queue
+//!
+//! \param autoReset  the event is created with manual reset when false
+//! \return the event, or 0 when the debugger is not registered, the event
+//!         cannot be created or the queue rejects it
+DebugEvent
+GpuDebugManager::createDebugEvent(
+    const bool autoReset)
+{
+    if (!isRegistered()) {
+        LogError("debugmanager: Failed to create debug event - hw debug is not available");
+        return 0;
+    }
+
+    // create the event object
+    osEventHandle shaderEvent = osEventCreate(!autoReset);
+    if (shaderEvent == 0) {
+        return 0;
+    }
+
+    // event object has been created, set the initial state
+    osEventReset(shaderEvent);      // initial state is non-signaled
+
+    if (!vGpu_->ExceptionNotification(shaderEvent)) {
+        // the caller only ever sees 0, so nobody else can release this handle
+        osEventDestroy(shaderEvent);
+        return 0;
+    }
+
+    isRegistered_ = true;
+    return shaderEvent;
+}
+
+//! Wait on a debug event for at most \a timeOut
+//!
+//! \return CL_SUCCESS when the timed wait succeeds, CL_EVENT_TIMEOUT_AMD otherwise
+cl_int
+GpuDebugManager::waitDebugEvent(
+    DebugEvent pEvent,
+    uint32_t timeOut) const
+{
+    return osEventTimedWait(pEvent, timeOut) ? CL_SUCCESS : CL_EVENT_TIMEOUT_AMD;
+}
+
+//! Detach the exception-notification event from the queue and destroy it
+//!
+//! \param pEvent  event to destroy; \a *pEvent is reset to 0. A NULL pointer
+//!                only detaches the notification.
+void
+GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
+{
+    // Detach first so that the queue no longer refers to a handle that is
+    // about to be destroyed. vGpu_ is NULL until registerDebuggerOnQueue()
+    // has succeeded.
+    if (NULL != vGpu_) {
+        vGpu_->ExceptionNotification(0);
+    }
+
+    if (NULL != pEvent) {
+        osEventDestroy(*pEvent);
+        *pEvent = 0;
+    }
+}
+
+//! Forward a wavefront control request to the GSL context
+void
+GpuDebugManager::wavefrontControl(
+    uint32_t waveAction,
+    uint32_t waveMode,
+    uint32_t trapId,
+    void* waveAddr) const
+{
+    device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
+}
+
+void +GpuDebugManager::setAddressWatch( + uint32_t numWatchPoints, + void** watchAddress, + uint64_t* watchMask, + uint64_t* watchMode, + DebugEvent* event) +{ + size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch); + + // previously allocated size is not big enough, allocate new memory + if (addressWatchSize_ < requiredSize) { + if (NULL != addressWatch_) { // free the smaller address watch storage + delete [] addressWatch_; + } + addressWatch_ = new HwDbgAddressWatch[numWatchPoints]; + addressWatchSize_ = requiredSize; + } + + // fill in the address watch structure + memset(addressWatch_, 0, addressWatchSize_); + + for (uint32_t i = 0; i < numWatchPoints; i++) + { + amd::Memory* watchMem = as_amd(reinterpret_cast(watchAddress[i])); + Memory* watchMemAddress = device()->getGpuMemory(watchMem); + + addressWatch_[i].watchAddress_ = reinterpret_cast(watchMemAddress->vmAddress()); + addressWatch_[i].watchMask_ = watchMask[i]; + addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i]; + addressWatch_[i].event_ = (0 != event) ? 
event[i] : 0; + } + + // setup the watch addresses + device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_); + +}
+
+//! Copy \a size bytes from \a srcPtr into \a memObj, starting at byte
+//! \a offset, through a CPU mapping of the buffer
+//!
+//! Invalid arguments, a range that does not fit in the buffer and a failed
+//! mapping are logged and leave the buffer untouched.
+void
+GpuDebugManager::setGlobalMemory(
+    amd::Memory* memObj,
+    uint32_t offset,
+    void* srcPtr,
+    uint32_t size)
+{
+    if ((NULL == memObj) || (NULL == srcPtr)) {
+        LogError("debugmanager: Failed to set global memory - invalid argument");
+        return;
+    }
+
+    gpu::Memory* globalMem = device()->getGpuMemory(memObj);
+    if (NULL == globalMem) {
+        LogError("debugmanager: Failed to set global memory - no GPU memory object");
+        return;
+    }
+
+    // 64-bit sum: offset + size must not wrap before it is compared
+    if ((static_cast<uint64_t>(offset) + size) > globalMem->size()) {
+        LogError("debugmanager: Failed to set global memory - range exceeds the buffer");
+        return;
+    }
+
+    address mappedMem = static_cast<address>(globalMem->map(NULL, 0));
+    if (NULL == mappedMem) {
+        // assert() is compiled out of release builds, so check at run time
+        LogError("debugmanager: Failed to set global memory - cannot map the buffer");
+        return;
+    }
+
+    memcpy(mappedMem + offset, srcPtr, size);
+
+    globalMem->unmap(NULL);
+}
+
+
+} // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp new file mode 100644 index 0000000000..ddda1e27d4 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/gpu/gpudebugmanager.hpp @@ -0,0 +1,132 @@ +/******************************************************************************* + * + * Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished) + * + * All rights reserved. This notice is intended as a precaution against + * inadvertent publication and does not imply publication or any waiver + * of confidentiality. The year included in the foregoing notice is the + * year of creation of the work. + * + ******************************************************************************/ +#ifndef HWDBG_DEBUGMANAGER_H__ +#define HWDBG_DEBUGMANAGER_H__ + +#include "gpuvirtual.hpp" +#include "gpudebugger.hpp" + +namespace gpu { + +class GpuDebugManager; +class Device; +class Memory; + + +/*! \brief Debug Manager Class + * + * The debug manager class is used to pass all the trap info to the + * kernel dispatch and then the kernel execution can use such trap information + * for kernel execution. This class contains the trap handler and shader event + * objects. The trap handler is setup by users and passed to the kernel dispatch. + * The shader event is to receive interrupts from the GPU and then users can + * perform various operations. + * + * This class also provides the interface for setting up the pre-dispatch + * callback functions used by the profiler and debugger. It also provides + * a way to retrieve various debug information for the kernel execution. + * + */ +class GpuDebugManager : public amd::HwDebugManager { +public: + + //! 
Constructor of the debug manager class + GpuDebugManager(amd::Device* device); + + //! Destructor of the debug manager class + ~GpuDebugManager(); + + //! Get the single instance of the GpuDebugManager class + //! NOTE(review): no definition of this function is visible in gpudebugmanager.cpp of this change - confirm it exists + static GpuDebugManager* getDefaultInstance(); + + //! Destroy the GpuDebugManager class object + //! NOTE(review): no definition of this function is visible in gpudebugmanager.cpp of this change - confirm it exists + static void destroyInstances(); + + //! Flush the caches selected by \a mask (interpreted as a HwDbgGpuCacheMask) + void flushCache(uint32_t mask); + + //! Create the debug event; returns 0 when the debugger is not registered or the event cannot be set up + DebugEvent createDebugEvent(const bool autoReset); + + //! Wait for the debug event; returns CL_SUCCESS when the timed wait succeeds, CL_EVENT_TIMEOUT_AMD otherwise + cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const; + + //! Destroy the debug event and reset \a *pEvent to 0 + void destroyDebugEvent(DebugEvent* pEvent); + + //! Register the debugger: store the message buffer and the context (fails when the enableHwDebug_ setting is off) + cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage); + + //! Register the debugger with KMD after command queue has been created + cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice); + + //! Unregister the debugger + void unregisterDebugger(); + + //! Send the wavefront control command + void wavefrontControl(uint32_t waveAction, + uint32_t waveMode, + uint32_t trapId, + void* waveAddr) const; + + //! Set the address watch points (one entry of each array per watch point) + void setAddressWatch(uint32_t numWatchPoints, + void** watchAddress, + uint64_t* watchMask, + uint64_t* watchMode, + DebugEvent* pEvent); + + //! Get the packet information for dispatch (\a aqlCodeInfo is read as an AqlCodeInfo, \a packetInfo is written as a PacketAmdInfo) + void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const; + + //! Set global memory values: copy \a size bytes from \a srcPtr into \a memObj at \a offset + void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size); + + //! Execute the post-dispatch callback function + void executePostDispatchCallBack(); + + //! Execute the pre-dispatch callback function + void executePreDispatchCallBack(void* aqlPacket, + void* toolInfo); + +private: + + //! 
Setup trap handler info for kernel execution + void setupTrapInformation(DebugToolInfo* toolInfo); + + +protected: + + const VirtualGPU* vGpu() const { return vGpu_; } + +private: + + const gpu::Device* device() const { + return reinterpret_cast(device_); } + + VirtualGPU* vGpu_; //!< the virtual GPU + + uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD + + HwDbgAddressWatch* addressWatch_; //!< Address watch data + size_t addressWatchSize_; //!< Size of address watch data + + //! Arguments used by the callback function + void* oclEventHandle_; //!< event handler + const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet +}; + + + + +} // namespace gpu + +#endif // HWDBG_DEBUGMANAGER_H__ diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp index e9fef63720..586fea9129 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.cpp @@ -38,6 +38,8 @@ #include #include +#include "gpudebugmanager.hpp" + bool DeviceLoad() { bool ret = false; @@ -890,6 +892,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices) } } + #ifdef DEBUG std::stringstream message; if (settings().remoteAlloc_) { @@ -1225,7 +1228,7 @@ Device::init() { CALuint numDevices = 0; bool result = false; - bool useDeviceList = false; + bool useDeviceList = false; requestedDevices_t requestedDevices; const char *library = getenv("COMPILER_LIBRARY"); @@ -2662,4 +2665,27 @@ Device::SrdManager::fillResourceList(std::vector& memList) } }
+//! Create the GPU debug manager (if there is none yet) and register the
+//! debugger with it
+//!
+//! \return CL_SUCCESS, CL_OUT_OF_HOST_MEMORY when the manager cannot be
+//!         allocated, or the error returned by registerDebugger(); on a
+//!         registration error the manager is destroyed again
+cl_int
+Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
+{
+    // Reuse a manager left by an earlier call: overwriting the pointer
+    // with a fresh object would leak the previous one.
+    if (NULL == hwDebugMgr_) {
+        hwDebugMgr_ = new GpuDebugManager(this);
+        if (NULL == hwDebugMgr_) {
+            return CL_OUT_OF_HOST_MEMORY;
+        }
+    }
+
+    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);
+
+    if (CL_SUCCESS != status) {
+        delete hwDebugMgr_;
+        hwDebugMgr_ = NULL;
+    }
+
+    return status;
+}
+
+//! Unregister the debugger and destroy the debug manager; does nothing
+//! when no manager exists
+void
+Device::hwDebugManagerRemove()
+{
+    // Init may never have run, or may have failed and reset the pointer
+    if (NULL == hwDebugMgr_) {
+        return;
+    }
+
+    hwDebugMgr_->unregisterDebugger();
+
+    delete hwDebugMgr_;
+
hwDebugMgr_ = NULL; +} + } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp index e51b3dcd30..f2d3732cfc 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpudevice.hpp @@ -560,6 +560,12 @@ public: //! Returns SRD manger object SrdManager& srds() const { return *srdManager_; } + //! Initial the Hardware Debug Manager + cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage); + + //! Remove the Hardware Debug Manager + void hwDebugManagerRemove(); + private: //! Disable copy constructor Device(const Device&); diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp index ba28c4675e..5c7be0c394 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.cpp @@ -3510,6 +3510,7 @@ HSAILKernel::HSAILKernel(std::string name, , prog_(*prog) , index_(0) , code_(NULL) + , codeSize_(0) , hwMetaData_(NULL) { hsa_ = true; @@ -3924,6 +3925,11 @@ HSAILKernel::loadArguments( mem->signalWrite(&dev()); } memList.push_back(gpuMem); + + // save the memory object pointer to allow global memory access + if (NULL != dev().hwDebugMgr()) { + dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner()); + } } // If it is a local pointer else { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp index 2c2d5a15b6..0be944897b 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpukernel.hpp @@ -862,7 +862,10 @@ public: const void* cpuAqlCode() const { return cpuAqlCode_; } //! Returns memory object with AQL code - const gpu::Memory* gpuAqlCode() const { return code_; } + gpu::Memory* gpuAqlCode() const { return code_; } + + //! 
Returns size of AQL code + size_t aqlCodeSize() const { return codeSize_; } //! Returns the size of argument buffer size_t argsBufferSize() const @@ -883,7 +886,7 @@ public: amd::NDRange& lclWorkSize //!< Local work size ) const; - //! Returns AQL packet in CPU memory + //! Returns AQL packet in CPU memory //! if the kerenl arguments were successfully loaded, otherwise NULL hsa_kernel_dispatch_packet_t* loadArguments( VirtualGPU& gpu, //!< Running GPU context @@ -939,6 +942,8 @@ private: uint index_; //!< Kernel index in the program gpu::Memory* code_; //!< Memory object with ISA code + size_t codeSize_; //!< Size of ISA code + char* hwMetaData_; //!< SI metadata union Flags { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp index f826cc0a8e..cf776edff9 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuresource.cpp @@ -363,7 +363,8 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) elementSize_ = static_cast(memoryFormatSize(cal()->format_).size_); cal_.type_ = memType; if (memType == Scratch) { - cal_.type_ = Local; + // use local memory for scratch buffer unless it is using HW DEBUG + cal_.type_ = (!dev().settings().enableHwDebug_) ? 
Local : RemoteUSWC; cal_.scratch_ = true; } @@ -463,7 +464,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap) else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) { // Make sure runtime didn't pick a resource with > 4GB address if ((cal()->dimension_ == GSL_MOA_BUFFER) && - (static_cast(gslRef_->gslResource()->getSurfaceAddress() + + (static_cast(gslRef_->gslResource()->getSurfaceAddress() + gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) { gslRef_->release(); gslRef_ = NULL; diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp index 7cbc8f3bfc..444871f6d6 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuscsi.cpp @@ -172,13 +172,17 @@ HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize) address codeStartAddress = reinterpret_cast
(akc); address codeEndAddress = reinterpret_cast
(hcd) + siMetaData->common.codeLenInByte; - uint64_t codeSize = codeEndAddress - codeStartAddress; - code_ = new gpu::Memory(dev(), amd::alignUp(codeSize, gpu::ConstBuffer::VectorSize)); + codeSize_ = codeEndAddress - codeStartAddress; + code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize)); + + // force to use remote memory for HW DEBUG + Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC; + // Initialize kernel ISA code - if ((code_ != NULL) && code_->create(Resource::Local)) { + if ((code_ != NULL) && code_->create(resMemType)) { address cpuCodePtr = static_cast
(code_->map(NULL, Resource::WriteOnly)); // Copy only amd_kernel_code_t - memcpy(cpuCodePtr, codeStartAddress, codeSize); + memcpy(cpuCodePtr, codeStartAddress, codeSize_); code_->unmap(NULL); } else { diff --git a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp index 813f9ddc4d..c435491771 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpusettings.cpp @@ -134,6 +134,7 @@ Settings::Settings() // Use host queue for device enqueuing by default useDeviceQueue_ = GPU_USE_DEVICE_QUEUE; + } bool @@ -311,7 +312,7 @@ Settings::create( calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR; } else { - if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ + if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_ || (oclVersion_ >= OpenCL20)))) { use64BitPtr_ = true; } @@ -440,6 +441,11 @@ Settings::create( if (oclVersion_ >= OpenCL20) { enableExtension(ClKhrSubGroups); enableExtension(ClKhrDepthImages); + + // Enable HW debug + if (GPU_ENABLE_HW_DEBUG) { + enableHwDebug_ = true; + } } if (apuSystem_ && diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp index affaaaf85c..cad8d7e4dc 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.cpp @@ -14,6 +14,7 @@ #include "device/gpu/gputhreadtrace.hpp" #include "device/gpu/gputimestamp.hpp" #include "device/gpu/gpublit.hpp" +#include "device/gpu/gpudebugger.hpp" #include "hsa.h" #include "sc-hsa/Interface/SCHSAInterface.h" #include @@ -402,6 +403,7 @@ VirtualGPU::VirtualGPU( , schedParamIdx_(0) , deviceQueueSize_(0) , hsaQueueMem_(NULL) + , useHwDebug_(false) { memset(&cal_, 0, sizeof(CalVirtualDesc)); for (uint i = 0; i < AllEngines; ++i) { @@ -585,6 +587,14 @@ VirtualGPU::create( return false; } + // Check if HW Debug is used and register the debugger if not done 
yet + amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); + + if ( dbgManager && dbgManager->isMsgBufferReady() ) { + if ( dbgManager->registerDebuggerOnQueue(this) == CL_SUCCESS ) { + useHwDebug_ = true; + } + } return true; } @@ -1720,6 +1730,12 @@ VirtualGPU::submitKernelInternalHSA( hsaKernel.prog().kernelTable()->vmAddress()); } + // setup the storage for the memory pointers of the kernel parameters + uint numParams = kernel.signature().numParameters(); + if (useHwDebug_) { + dev().hwDebugMgr()->allocParamMemList(numParams); + } + // Program the kernel arguments for the GPU execution hsa_kernel_dispatch_packet_t* aqlPkt = hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem, @@ -1745,10 +1761,25 @@ VirtualGPU::submitKernelInternalHSA( addVmMemory(memList[i]); } + // HW Debug for the kernel? + HwDbgKernelInfo kernelInfo; + HwDbgKernelInfo *pKernelInfo = NULL; + + if (useHwDebug_) { + buildKernelInfo(hsaKernel, aqlPkt, kernelInfo); + pKernelInfo = &kernelInfo; + } + GpuEvent gpuEvent; // Run AQL dispatch in HW runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_, - scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress()); + scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo); + + if (useHwDebug_) { + if (NULL != dev().hwDebugMgr()->postDispatchCallBackFunc()) { + dev().hwDebugMgr()->executePostDispatchCallBack(); + } + } if (hsaKernel.dynamicParallelism()) { // Make sure exculsive access to the device queue @@ -3410,4 +3441,155 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable) virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait); } +void +VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask) +{ + //! 
@todo: fix issue of no event available for the flush/invalidate cache command + InvalidateSqCaches(cache_mask.sqICache_, + cache_mask.sqKCache_, + cache_mask.tcL1_, + cache_mask.tcL2_); + + flushDMA(engineID_); + + return; +} + +void +VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel, + hsa_kernel_dispatch_packet_t* aqlPkt, + HwDbgKernelInfo& kernelInfo) +{ + amd::HwDebugManager * dbgManager = dev().hwDebugMgr(); + assert (dbgManager && "No HW Debug Manager!"); + + // Initialize structure with default values + + if (hsaKernel.prog().maxScratchRegs() > 0) { + gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObjs_[0]; + kernelInfo.scratchBufAddr = scratchBuf->vmAddress(); + kernelInfo.scratchBufferSizeInBytes = scratchBuf->size(); + + // Get the address of the scratch buffer and its size for CPU access + address scratchRingAddr = NULL; + scratchRingAddr = static_cast
(scratchBuf->map(NULL, 0)); + dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size()); + scratchBuf->unmap(NULL); + } + else { + kernelInfo.scratchBufAddr = 0; + kernelInfo.scratchBufferSizeInBytes = 0; + dbgManager->setScratchRing(NULL, 0); + } + + + //! @todo: need to verify what is wanted for the global memory + kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress(); + + kernelInfo.pAqlDispatchPacket = aqlPkt; + kernelInfo.pAqlQueuePtr = reinterpret_cast(hsaQueueMem_->vmAddress()); + + // Get the address of the kernel code and its size for CPU access + gpu::Memory* aqlCode = hsaKernel.gpuAqlCode(); + if (NULL != aqlCode) { + address aqlCodeAddr = static_cast
(aqlCode->map(NULL, 0)); + dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize()); + aqlCode->unmap(NULL); + } + else { + dbgManager->setKernelCodeInfo(NULL, 0); + } + + kernelInfo.trapPresent = false; + kernelInfo.trapHandler = NULL; + kernelInfo.trapHandlerBuffer = NULL; + + kernelInfo.excpEn = 0; + kernelInfo.cacheDisableMask = 0; + kernelInfo.sqDebugMode = 0; + + kernelInfo.mgmtSe0Mask = 0xFFFFFFFF; + kernelInfo.mgmtSe1Mask = 0xFFFFFFFF; + + // set kernel info for HW debug and call the callback function + if (NULL != dbgManager->preDispatchCallBackFunc()) { + DebugToolInfo dbgSetting; + dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr; + dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes; + dbgSetting.globalAddress_ = kernelInfo.heapBufAddr; + + // Call the predispatch callback function & set the trap info + AqlCodeInfo aqlCodeInfo; + aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode(); + aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize(); + + // Execute the pre-dispatch call back function + dbgManager->executePreDispatchCallBack(reinterpret_cast(aqlPkt), &dbgSetting); + + // assign the TMA and TBA for kernel dispatch + if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) { + assignTrapHandler(dbgSetting, kernelInfo); + } + + kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false; + + // Execption policy + kernelInfo.excpEn = dbgSetting.exceptionMask_; + kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_; + kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_; + + // Compute the mask for reserved CUs. These two dwords correspond to + // two registers used for reserving CUs for display. In the current + // implementation, the number of CUs reserved can be 0 to 7, and it + // is set by debugger users. 
+ if (dbgSetting.monitorMode_) { + uint32_t i = dbgSetting.reservedCuNum_ / 2; + kernelInfo.mgmtSe0Mask <<= i; + i = dbgSetting.reservedCuNum_ - i; + kernelInfo.mgmtSe1Mask <<= i; + } + + // flush/invalidate the instruction, data, L1 and L2 caches + InvalidateSqCaches(); + } +} + +void +VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting, + HwDbgKernelInfo& kernelInfo) +{ + + Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_); + Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_); + + addVmMemory(trapHandlerMem); + addVmMemory(trapBufferMem); + + // Handle TMA corruption hw bug workaround - + // The trap handler buffer has extra 256 bytes allocated, the TMA address + // is stored in the first two DWORDs and the actual trap handler code + // is stored starting at the location of 256 bytes. + // + // - kernelInfo.trapHandler points directly to the trap handler code + // - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA) + // + kernelInfo.trapHandler = reinterpret_cast(trapHandlerMem->vmAddress() + TbaStartOffset); + kernelInfo.trapHandlerBuffer = reinterpret_cast(trapBufferMem->vmAddress()); + + // Address of the trap handler code/buffer should be 256-byte aligned + uint64_t tmaAddress = reinterpret_cast(kernelInfo.trapHandlerBuffer); + if ((reinterpret_cast(kernelInfo.trapHandler) & 0xFF) != 0 + || (tmaAddress & 0xFF) != 0) { + assert(false && "Trap handler/buffer is not 256-byte aligned"); + } + + // map the trap handler buffer address for host access, and store the trap + // buffer address at the beginning of the allocated buffer + address trapHandlerAddress = static_cast
(trapHandlerMem->map(NULL,0)); + uint32_t * tmaStorage = reinterpret_cast(trapHandlerAddress); + tmaStorage[0] = tmaAddress & 0xFFFFFFFF; + tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF; + trapHandlerMem->unmap(NULL); +} + } // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp index daa6433e0e..5585f51823 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp +++ b/projects/clr/rocclr/runtime/device/gpu/gpuvirtual.hpp @@ -12,6 +12,9 @@ #include "device/gpu/gpusched.hpp" #include "device/blit.hpp" +#include "device/gpu/gpudebugger.hpp" + + /*! \addtogroup GPU GPU Resource Implementation * @{ */ @@ -28,6 +31,7 @@ class VirtualGPU; class Program; class BlitManager; class ThreadTrace; +class HSAILKernel; //! Virtual GPU class VirtualGPU : public device::VirtualDevice, public CALGSLContext @@ -400,6 +404,8 @@ public: State state_; //!< virtual GPU current state CalVirtualDesc cal_; //!< CAL virtual device descriptor + void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache + protected: virtual void profileEvent(EngineType engine, bool type) const; @@ -496,6 +502,17 @@ private: const amd::BufferRect& dstRect //!< region of destination for copy ); + void buildKernelInfo( + const HSAILKernel& hsaKernel, //!< hsa kernel + hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch + HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + ); + + void assignTrapHandler( + const DebugToolInfo& dbgSetting, //!< debug settings + HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch + ); + GslKernels gslKernels_; //!< GSL kernel descriptors GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors GpuEvents gpuEvents_; //!< GPU events @@ -534,6 +551,8 @@ private: uint deviceQueueSize_; //!< Device queue size Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object + + bool useHwDebug_; //!< Flag of using HW debug }; 
/*@}*/} // namespace gpu diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp index 7889b46b1b..00a0878d8e 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp +++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp @@ -440,7 +440,7 @@ CALGSLContext::isDone(GpuEvent* event) if (m_eventQueue[event->engineId_].isDone(event->id)) { event->invalidate(); - return true; + return true; } return false; } @@ -1269,10 +1269,10 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons void CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset, - const void* cpuKernelCode, uint64 hsaQueueVA) + const void* cpuKernelCode, uint64 hsaQueueVA, const void* kernelInfo) { eventBegin(MainEngine); - m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA); + m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA, kernelInfo); eventEnd(MainEngine, event); } @@ -1299,3 +1299,30 @@ CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mc m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue); eventEnd(MainEngine, event); } + +bool +CALGSLContext::RegisterHwDebugger(uint64 debugMessages) +{ + return m_cs->registerHwDebugger(debugMessages); +} + +bool +CALGSLContext::ExceptionNotification(osEventHandle debugEvent) +{ + return m_cs->exceptionNotification(debugEvent); +} + +void +CALGSLContext::InvalidateSqCaches(bool instInvalidate, bool dataInvalidate, bool tcL1, bool tcL2) +{ + // invalidating instruction/data L1 caches using Escape + if (instInvalidate || dataInvalidate) { + m_cs->invalidateSqCaches(instInvalidate, dataInvalidate); + } + + if (tcL1) { + flushCUCaches(tcL2); + } + 
+} + diff --git a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h index 7310fd6266..84d662c09d 100644 --- a/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h +++ b/projects/clr/rocclr/runtime/device/gpu/gslbe/src/rt/GSLContext.h @@ -44,7 +44,8 @@ public: bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems); bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode); void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems, - uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA); + uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, + uint64 hsaQueueVA, const void* kernelInfo); mcaddr virtualQueueDispatcherStart(); void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems, mcaddr signal, mcaddr loopStart, uint32 numTemplates); @@ -140,6 +141,11 @@ public: void writeTimer(bool sdma, const gslMemObject mem, uint32 offset) const; void writeSurfRaw(GpuEvent& event, gslMemObject mem, size_t size, const void* data); + /// HW Debug support functions + bool RegisterHwDebugger(uint64 debugMessages); + bool ExceptionNotification(osEventHandle debugEvent); + void InvalidateSqCaches(bool instInvalidate = true, bool dataInvalidate = true, bool tcL1 = true, bool tcL2 = true); + protected: void setScratchBuffer(gslMemObject mem, int32 engineId); virtual void profileEvent(EngineType engine, bool type) const {} diff --git a/projects/clr/rocclr/runtime/device/hwdebug.cpp b/projects/clr/rocclr/runtime/device/hwdebug.cpp new file mode 100644 index 0000000000..8cfa01fa21 --- /dev/null +++ b/projects/clr/rocclr/runtime/device/hwdebug.cpp @@ -0,0 +1,175 @@ +/******************************************************************************* + * + * 
Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished) + * + * All rights reserved. This notice is intended as a precaution against + * inadvertent publication and does not imply publication or any waiver + * of confidentiality. The year included in the foregoing notice is the + * year of creation of the work. + * + ******************************************************************************/ + +#include "hwdebug.hpp" + +#include +#include +#include + +namespace amd { + +class Device; + +/* + *************************************************************************** + * Implementation of GPU Debug Manager class + *************************************************************************** + */ + +//! Constructor of the debug manager class +HwDebugManager::HwDebugManager(amd::Device* device) + : context_(NULL) + , device_(device) + , preDispatchCallBackFunc_(NULL) + , postDispatchCallBackFunc_(NULL) + , preDispatchCallBackArgs_(NULL) + , postDispatchCallBackArgs_(NULL) + , paramMemory_(NULL) + , numParams_(0) + , aclBinary_(NULL) + , aqlCodeAddr_(NULL) + , aqlCodeSize_(0) + , scratchRingAddr_(NULL) + , scratchRingSize_(0) + , isRegistered_(false) + , dbgMsgBufferReady_(false) +{ + memset(&debugInfo_, 0, sizeof(debugInfo_)); + + memset(deviceTrapInfo_, 0, sizeof(uint64_t) * kDebugTrapLocationMax); +} + +HwDebugManager::~HwDebugManager() +{ + if (NULL != paramMemory_) { + delete[] paramMemory_; + } +} + +//! Setup the call back function pointer +void +HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction, + cl_PostDispatchCallBackFunctionAMD postDispatchFunction) +{ + preDispatchCallBackFunc_ = preDispatchFunction; + postDispatchCallBackFunc_ = postDispatchFunction; +} + +//! Setup the call back argument pointers +void +HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs) +{ + preDispatchCallBackArgs_ = preDispatchArgs; + postDispatchCallBackArgs_ = postDispatchArgs; +} + +//! 
Get dispatch debug info +void +HwDebugManager::getDispatchDebugInfo(void* debugInfo) const +{ + memcpy(debugInfo, (void*) &debugInfo_, sizeof(DispatchDebugInfo)); +} + + +//! Set the kernel code address and its size +void +HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) +{ + aqlCodeAddr_ = aqlCodeAddr; + aqlCodeSize_ = aqlCodeSize; +} + +//! Get the scratch ring +void +HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) +{ + scratchRingAddr_ = scratchRingAddr; + scratchRingSize_ = scratchRingSize; +} + +//! Map the shader (AQL code) for host access +void +HwDebugManager::mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const +{ + *aqlCodeAddr = reinterpret_cast(aqlCodeAddr_); + *aqlCodeSize = aqlCodeSize_; +} + +//! Map the scratch ring for host access +void +HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const +{ + *scratchRingAddr = reinterpret_cast(scratchRingAddr_); + *scratchRingSize = scratchRingSize_; +} + +void +HwDebugManager::setExceptionPolicy(void* exceptionPolicy) +{ + memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd)); +} + +void +HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const +{ + memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd)); +} + +void +HwDebugManager::setKernelExecutionMode(void* mode) +{ + cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); + execMode_.ui32All = execMode->ui32All; +} + + +void +HwDebugManager::getKernelExecutionMode(void* mode) const +{ + cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast(mode); + execMode->ui32All = execMode_.ui32All; +} + +void +HwDebugManager::setAclBinary(void* aclBinary) +{ + aclBinary_ = aclBinary; +} + +void +HwDebugManager::allocParamMemList(uint32_t numParams) +{ + if (NULL != paramMemory_) { + delete [] paramMemory_; + } + + numParams_ = numParams; + paramMemory_ = new amd::Memory*[numParams]; +} + +cl_mem 
+HwDebugManager::getKernelParamMem(uint32_t paramIdx) const +{ + assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); + + return as_cl(paramMemory_[paramIdx]); +} + +void +HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem) +{ + assert((paramIdx < numParams_) && "Invalid kernel parameter index too big"); + + paramMemory_[paramIdx] = mem; +} + +} // namespace amd diff --git a/projects/clr/rocclr/runtime/device/hwdebug.hpp b/projects/clr/rocclr/runtime/device/hwdebug.hpp index 090fb0faf9..bca608a75f 100644 --- a/projects/clr/rocclr/runtime/device/hwdebug.hpp +++ b/projects/clr/rocclr/runtime/device/hwdebug.hpp @@ -5,42 +5,67 @@ #ifndef HWDEBUG_H_ #define HWDEBUG_H_ +#include "device.hpp" #include "amdocl/cl_debugger_amd.h" -#define TBA_START_OFFSET 256 +static const int TbaStartOffset = 256; -/** - ******************************************************************************* - * @brief Debug information required by the AMD debugger - * This might have to be moved to a private header. We could provide - * these services as a seperate dll. - * @details The information is populated by the function oclGetDebugInfo - ******************************************************************************* +static const int RtTrapBufferWaveSize = 64; +static const int RtTrapBufferSeNum = 4; +static const int RtTrapBufferShNum = 2; +static const int RtTrapBufferCuNum = 16; +static const int RtTrapBufferSimdNum = 4; +static const int RtTrapBufferWaveNum = 16; +static const int RtTrapBufferTotalWaveNum = + ((RtTrapBufferSeNum) * \ + (RtTrapBufferShNum) * \ + (RtTrapBufferCuNum) * \ + (RtTrapBufferSimdNum) * \ + (RtTrapBufferWaveNum)); + + +/*! \brief Debug trap handler location in the runtime trap buffer + * + * This enumeration is used to indicate the location where the debug + * trap handler and debug trap buffer are set in the device trap buffer. 
*/ -struct PacketAmdInfo +enum DebugTrapLocation { - uint32_t trapReservedVgprIndex; //!< reserved VGPR index, -1 when they are not valid - uint32_t scratchBufferWaveOffset; //!< scratch buffer wave offset, -1 when no scratch buffer - void *pointerToIsaBuffer; //!< pointer to the buffer containing ISA - size_t sizeOfIsaBuffer; //!< size of the ISA buffer - uint32_t numberOfVgprs; //!< number of VGPRs used by the kernel - uint32_t numberOfSgprs; //!< number of SGPRs used by the kernel - size_t sizeOfStaticGroupMemory; //!< Static local memory used by the kernel + kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0 + kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1 + kDebugTrapLocationMax = 2 }; -//! Cache mask for invalidation -struct HwDbgGpuCacheMask + +/*! \brief This structure is for the debug info in each kernel dispatch. + * + * Contains the memory descriptor information of the scratch memory and the global + * memory + */ +struct DispatchDebugInfo { - union { - struct { - uint32_t sqICache : 1; //!< Instruction cache - uint32_t sqKCache : 1; //!< Data cache - uint32_t tcL1 : 1; //!< tcL1 cache - uint32_t tcL2 : 1; //!< tcL2 cache - uint32_t reserved : 28; - }; - uint32_t ui32All; - }; + uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor + uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor +}; + +/*! \brief Trap handler descriptor + * + * The trap handler descriptor contains the details of a given trap handler. + */ +struct TrapHandlerInfo { + amd::Memory* trapHandler_; //!< Device memory for the trap handler + amd::Memory* trapBuffer_; //!< Device memory for the trap buffer +}; + +/*! \brief Structure of the runtime trap handler buffer, which includes the following + * information: information of the runtime trap handler and buffer, information of + * the level-2 trap handlers and buffers. 
+ */ +struct RuntimeTrapInfo { + TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers + uint32_t dispatchId_; //!< Dispatch ID that signals the shader event + uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize]; + //!< Buffer to backup the VGPR used by the runtime trap handler }; @@ -48,10 +73,16 @@ struct HwDbgGpuCacheMask /** * Opaque pointer to trap event */ -typedef uint64_t DebugEvent; //! opaque pointer to trap event +typedef uintptr_t DebugEvent; namespace amd { + +class Context; +class Device; +class HostQueue; + + /*! \class HwDebugManager * * \brief The device interface class for the hardware debug manager @@ -61,32 +92,73 @@ class HwDebugManager public: //! Constructor for the Hardware Debug Manager - HwDebugManager() : isRegistered_(false), useHwDebug_(false) {} + HwDebugManager(amd::Device* device); //! Destructor for Hardware Debug Manager - ~HwDebugManager() {}; + virtual ~HwDebugManager(); //! Setup the call back function pointer - virtual void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn, - cl_PostDispatchCallBackFunctionAMD postDispatchFn) = 0; + void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn, + cl_PostDispatchCallBackFunctionAMD postDispatchFn); //! Setup the call back argument pointers - virtual void setCallBackArguments(void *preDispatchArgs, void *postDispatchArgs) = 0; + void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs); - //! Flush cache - virtual cl_int flushCache(uint32_t mask) = 0; + //! Get dispatch debug info + void getDispatchDebugInfo(void* debugInfo) const; + + //! Set the kernel code address and its size + void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize); + + //! Get the scratch ring + void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize); + + //! Map the shader (AQL code) for host access + void mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const; + + //! 
Map the scratch ring for host access + void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const; + + //! Retrieve the pre-dispatch callback function + cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const + { return preDispatchCallBackFunc_; } + + //! Retrieve the post-dispatch callback function + cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const + { return postDispatchCallBackFunc_; } + + //! Retrieve the pre-dispatch callback function arguments + void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; } + + //! Retrieve the post-dispatch callback function arguments + void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; } //! Set exception policy - virtual cl_int setExceptionPolicy(void *policy) = 0; + void setExceptionPolicy(void* policy); //! Get exception policy - virtual cl_int getExceptionPolicy(void *policy) const = 0; + void getExceptionPolicy(void* policy) const; //! Set the kernel execution mode - virtual cl_int setKernelExecutionMode(void *mode) = 0; + void setKernelExecutionMode(void* mode); //! Get the kernel execution mode - virtual cl_int getKernelExecutionMode(void *mode) const = 0; + void getKernelExecutionMode(void* mode) const; + + //! Setup the pointer to the aclBinary within the debug manager + void setAclBinary(void* aclBinary); + + //! Allocate storage to keep the memory pointers of the kernel parameters + void allocParamMemList(uint32_t numParams); + + //! Assign the kernel parameter memory + void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem); + + //! Get kernel parameter memory object + cl_mem getKernelParamMem(uint32_t paramIdx) const; + + //! Flush cache + virtual void flushCache(uint32_t mask) = 0; //! Create the debug event virtual DebugEvent createDebugEvent(const bool autoReset) = 0; @@ -95,95 +167,99 @@ public: virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0; //! 
Destroy the debug event - virtual cl_int destroyDebugEvent(DebugEvent pEvent) = 0; + virtual void destroyDebugEvent(DebugEvent* pEvent) = 0; //! Register the debugger - virtual cl_int registerDebugger(amd::Context *context, uintptr_t pMessageStorage) = 0; + virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0; //! Call KMD to register the debugger - virtual cl_int registerDebuggerOnQueue(device::VirtualDevice *vDevice) = 0; + virtual cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice) = 0; //! Unregister the debugger - virtual cl_int unregisterDebugger() = 0; + virtual void unregisterDebugger() = 0; - //! Setup the pointer to the aclBinary within the debug manager - virtual void setAclBinary(void *aclBinary) = 0; //! Send the wavefront control cmmand - virtual cl_int wavefrontControl(uint32_t waveAction, + virtual void wavefrontControl(uint32_t waveAction, uint32_t waveMode, uint32_t trapId, - void * waveAddr) const = 0; + void* waveAddr) const = 0; //! Set address watching point - virtual cl_int setAddressWatch(uint32_t numWatchPoints, - void ** watchAddress, - uint64_t * watchMask, - uint64_t * watchMode, - DebugEvent * event) = 0; + virtual void setAddressWatch(uint32_t numWatchPoints, + void** watchAddress, + uint64_t* watchMask, + uint64_t* watchMode, + DebugEvent* event) = 0; //! Get the packet information for dispatch - virtual cl_int getPacketAmdInfo(const void * aqlCodeInfo, - void * packetInfo) const = 0; - - //! Get dispatch debug info - virtual cl_int getDispatchDebugInfo(void * debugInfo) const = 0; - - //! Map the AQL code for host access - virtual cl_int mapKernelCode(uint64_t *aqlCode, uint32_t *aqlCodeSize) const = 0; - - //! Map the scratch ring for host access - virtual cl_int mapScratchRing(uint64_t *scratchRingAddr, uint32_t *scratchRingSize) const = 0; + virtual void getPacketAmdInfo(const void* aqlCodeInfo, + void* packetInfo) const = 0; //! 
Set global memory values - virtual cl_int setGlobalMemory(void * memObj, - uint32_t offset, - void * srcPtr, - uint32_t size) = 0; + virtual void setGlobalMemory(amd::Memory* memObj, + uint32_t offset, + void* srcPtr, + uint32_t size) = 0; - //! Set kernel parameter memory object list - virtual cl_int setKernelParamMemList(void ** paramMem, uint32_t numParams) = 0; + //! Execute the post-dispatch callback function + virtual void executePostDispatchCallBack() = 0; - //! Get kernel parameter memory object - virtual uint64_t getKernelParamMem(uint32_t paramIdx) const = 0; + //! Execute the pre-dispatch callback function + virtual void executePreDispatchCallBack(void* aqlPacket, + void* toolInfo) = 0; - //! Set the kernel code address and its size - virtual void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) = 0; + //! Return the use of HW DEBUG flag + bool isMsgBufferReady() const { return dbgMsgBufferReady_; } - //! Get the scratch ring - virtual void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) = 0; +protected: + //! Return the context + const amd::Context* context() const { return context_; } - //! Retrieve the pre-dispatch callback function - virtual cl_PreDispatchCallBackFunctionAMD getPreDispatchCallBackFunction() const = 0; - - //! Retrieve the post-dispatch callback function - virtual void * getPreDispatchCallBackArguments() const = 0; - - //! Retrieve the pre-dispatch callback function arguments - virtual cl_PostDispatchCallBackFunctionAMD getPostDispatchCallBackFunction() const = 0; - - //! Retrieve the post-dispatch callback function arguments - virtual void * getPostDispatchCallBackArguments() const = 0; - - //! Set the register flag - void setRegisterFlag(bool regFlag) { isRegistered_ = regFlag; } - - //! Set the use of HW DEBUG flag - void setUseHwDebugFlag(bool flag) { useHwDebug_ = flag; } + //! Get the debug device + const amd::Device* device() const { return device_; } //! 
Return the register flag bool isRegistered() const { return isRegistered_; } - //! Return the use of HW DEBUG flag - bool useHwDebug() const { return useHwDebug_; } - + //! Return the device trap handler information + const uint64_t* deviceTrapInfo() const { return deviceTrapInfo_; } protected: - bool isRegistered_; //! flag to indicate the debugger has been registered - bool useHwDebug_; //! flag to indicate the HW DEBUG is using + + const amd::Context* context_; ///< context that used to create host queue for the debugger + amd::Device* device_; ///< Device to run the debugger + + cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function + cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc_; //!< post-dispatch callback function + void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments + void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments + + DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch + uint64_t deviceTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap handlers on the device + + amd::Memory** paramMemory_; //!< list of memory pointers for kernel parameters + uint32_t numParams_; //!< number of kernel parameters + + void* aclBinary_; //!< ACL binary + + address aqlCodeAddr_; //!< The mapped AQL code to allow host access + uint32_t aqlCodeSize_; //!< The size of the AQL code info + + address scratchRingAddr_; //!< The mapped address of the scratch buffer + uint32_t scratchRingSize_; //!< The size of the scratch ring + + bool isRegistered_; //! flag to indicate the debugger has been registered + bool dbgMsgBufferReady_; //! 
flag to indicate the HW DEBUG is using + + cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy + cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode + RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information + }; + /**@}*/ /** diff --git a/projects/clr/rocclr/runtime/utils/flags.hpp b/projects/clr/rocclr/runtime/utils/flags.hpp index 136872f37a..5cdee5b26c 100644 --- a/projects/clr/rocclr/runtime/utils/flags.hpp +++ b/projects/clr/rocclr/runtime/utils/flags.hpp @@ -174,6 +174,8 @@ debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false, \ "Forces reporting CL_FP_DENORM bit for single precision") \ debug(bool, OCL_FORCE_CPU_SVM, false, \ "force svm support for CPU") \ +debug(bool, GPU_ENABLE_HW_DEBUG, false, \ + "Enable HW DEBUG for GPU")