P4 to Git Change 1110409 by wchau@wchau_WINDOWS7_OCL on 2015/01/09 15:46:34
ECR #399840 - re-checkin of CL1109955 with the fix of OpenCL sanity check timeout (hw debug flag initialization)
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.h#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#174 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#238 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugger.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#490 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#137 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#275 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#200 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#297 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#346 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#124 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#223 edit
[ROCm/clr commit: 647aba6ed2]
This commit is contained in:
@@ -521,6 +521,7 @@ Settings::Settings()
|
||||
waitCommand_ = AMD_OCL_WAIT_COMMAND;
|
||||
supportDepthsRGB_ = false;
|
||||
assumeAliases_ = false;
|
||||
enableHwDebug_ = false;
|
||||
}
|
||||
|
||||
bool
|
||||
|
||||
@@ -63,7 +63,6 @@ class SvmFillMemoryCommand;
|
||||
class SvmMapMemoryCommand;
|
||||
class SvmUnmapMemoryCommand;
|
||||
class HwDebugManager;
|
||||
class RunHwDbgCommand;
|
||||
class Device;
|
||||
struct KernelParameterDescriptor;
|
||||
struct Coord3D;
|
||||
@@ -500,7 +499,7 @@ struct Info : public amd::EmbeddedObject
|
||||
//! List of supported video attributes (profile/format pairs)
|
||||
cl_video_attrib_amd* videoAttribs_;
|
||||
cl_uint numVideoAttribs_;
|
||||
//Encoder
|
||||
//Encoder
|
||||
cl_video_attrib_encode_amd* videoEncAttribs_;
|
||||
cl_uint numVideoEncAttribs_;
|
||||
#endif //cl_amd_open_video
|
||||
@@ -574,9 +573,6 @@ struct Info : public amd::EmbeddedObject
|
||||
//! The maximum size of global scope variables
|
||||
size_t maxGlobalVariableSize_;
|
||||
size_t globalVariablePreferredTotalSize_;
|
||||
|
||||
//! Enable HW Debug support
|
||||
cl_bool enableHwDebug_;
|
||||
};
|
||||
|
||||
//! Device settings
|
||||
@@ -586,7 +582,7 @@ public:
|
||||
uint64_t extensions_; //!< Supported OCL extensions
|
||||
union {
|
||||
struct {
|
||||
uint partialDispatch_: 1; //!< Enables partial dispatch
|
||||
uint partialDispatch_: 1; //!< Enables partial dispatch
|
||||
uint supportRA_: 1; //!< Support RA channel order format
|
||||
uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc)
|
||||
uint waitCommand_: 1; //!< Enables a wait for every submitted command
|
||||
@@ -594,7 +590,8 @@ public:
|
||||
// that replaces generic OS allocation routines
|
||||
uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format
|
||||
uint assumeAliases_: 1; //!< Assume aliases in the compilation process
|
||||
uint reserved_: 25;
|
||||
uint enableHwDebug_: 1; //!< Enable HW debug support
|
||||
uint reserved_: 24;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
@@ -776,8 +773,8 @@ protected:
|
||||
|
||||
volatile size_t version_; //!< The version we're currently shadowing
|
||||
|
||||
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
|
||||
//! not a physical map. When a memory object does not use USE_HOST_PTR we
|
||||
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
|
||||
//! not a physical map. When a memory object does not use USE_HOST_PTR we
|
||||
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
|
||||
amd::Memory* mapMemory_; //!< Memory used as map target buffer
|
||||
volatile size_t indirectMapCount_; //!< Number of maps
|
||||
@@ -898,7 +895,7 @@ public:
|
||||
workGroupInfo_.compileSize_[1] = y;
|
||||
workGroupInfo_.compileSize_[2] = z;
|
||||
}
|
||||
|
||||
|
||||
size_t getReqdWorkGroupSize(int dim) {
|
||||
return workGroupInfo_.compileSize_[dim];
|
||||
}
|
||||
@@ -1139,11 +1136,11 @@ public:
|
||||
never called in storing routines */
|
||||
bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false);
|
||||
|
||||
//! setin elfIn_
|
||||
//! setin elfIn_
|
||||
bool setElfIn(unsigned char eclass);
|
||||
void resetElfIn();
|
||||
|
||||
//! set out elf
|
||||
//! set out elf
|
||||
bool setElfOut(unsigned char eclass, const char* outFile);
|
||||
void resetElfOut();
|
||||
|
||||
@@ -1232,7 +1229,7 @@ public:
|
||||
|
||||
// Return the encrypt code for this input binary ( "> 0" means encrypted)
|
||||
int getEncryptCode() { return encryptCode_; }
|
||||
|
||||
|
||||
// Returns TRUE if the binary file is SPIR
|
||||
bool isSPIR() const;
|
||||
protected:
|
||||
@@ -1413,9 +1410,6 @@ public:
|
||||
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
|
||||
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
|
||||
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
|
||||
#if 0 // exclude this until more HW DEBUG codes are submitted
|
||||
virtual void submitHwDbgCommand(amd::RunHwDbgCommand& cmd) = 0;
|
||||
#endif
|
||||
|
||||
//! Get the blit manager object
|
||||
device::BlitManager& blitMgr() const { return *blitMgr_; }
|
||||
@@ -1698,6 +1692,9 @@ public:
|
||||
//! Initialize the Hardware Debug Manager
|
||||
//! Default implementation: creates nothing and always reports CL_SUCCESS
virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; }
|
||||
|
||||
//! Remove the Hardware Debug Manager
|
||||
//! Default implementation: no-op
virtual void hwDebugManagerRemove() {}
|
||||
|
||||
protected:
|
||||
//! Enable the specified extension
|
||||
char* getExtensionString();
|
||||
|
||||
@@ -0,0 +1,127 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
|
||||
*
|
||||
* All rights reserved. This notice is intended as a precaution against
|
||||
* inadvertent publication and does not imply publication or any waiver
|
||||
* of confidentiality. The year included in the foregoing notice is the
|
||||
* year of creation of the work.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef HWDBG_GPUDEBGGER_H_
|
||||
#define HWDBG_GPUDEBGGER_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include "hsa.h"
|
||||
#include "sc-hsa/Interface/SCHSAInterface.h"
|
||||
#include "device/device.hpp"
|
||||
#include "device/hwdebug.hpp"
|
||||
|
||||
//! Number of VGPRs subtracted from a kernel's VGPR count to obtain the trap reserved VGPR index
static const int NumberReserveVgprs = 4;
|
||||
|
||||
namespace gpu {
|
||||
|
||||
/**
|
||||
* \defgroup Services_API OCL Runtime Services API
|
||||
* @{
|
||||
*/
|
||||
|
||||
|
||||
/*! \brief Kernel dispatch packet information
 *
 *  Describes the registers, ISA buffer and static local memory of a kernel
 *  that is about to be dispatched.
 */
struct PacketAmdInfo
{
    //! Reserved VGPR index, -1 when it is not valid
    uint32_t    trapReservedVgprIndex_;

    //! Scratch buffer wave offset, -1 when there is no scratch buffer
    uint32_t    scratchBufferWaveOffset_;

    //! Pointer to the buffer containing the ISA
    void*       pointerToIsaBuffer_;

    //! Size of the ISA buffer
    size_t      sizeOfIsaBuffer_;

    //! Number of VGPRs used by the kernel
    uint32_t    numberOfVgprs_;

    //! Number of SGPRs used by the kernel
    uint32_t    numberOfSgprs_;

    //! Static local memory used by the kernel
    size_t      sizeOfStaticGroupMemory_;
};
|
||||
|
||||
/*! \brief Selects which GPU caches take part in an invalidation
 *
 *  The individual cache bits and the raw 32-bit value share storage, so the
 *  mask can be built either bit by bit or from a plain integer.
 */
struct HwDbgGpuCacheMask
{
    //! Builds an empty mask (no cache selected)
    HwDbgGpuCacheMask()
        : ui32All_(0)
    {
    }

    //! Builds the mask from its raw 32-bit representation
    HwDbgGpuCacheMask(uint32_t mask)
        : ui32All_(mask)
    {
    }

    union {
        struct {
            //! Instruction cache
            uint32_t    sqICache_ : 1;
            //! Data cache
            uint32_t    sqKCache_ : 1;
            //! tcL1 cache
            uint32_t    tcL1_     : 1;
            //! tcL2 cache
            uint32_t    tcL2_     : 1;
            //! Unused bits
            uint32_t    reserved_ : 28;
        };
        //! All bits of the mask as one value
        uint32_t    ui32All_;
    };
};
|
||||
|
||||
/*! \brief Address watch information
|
||||
*
|
||||
* Information about each watch point - address, mask, mode and event
|
||||
*/
|
||||
struct HwDbgAddressWatch
|
||||
{
|
||||
void* watchAddress_; //! The address of watch point
|
||||
uint64_t watchMask_; //! The mask for watch point (lower 24 bits)
|
||||
cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch
|
||||
DebugEvent event_; //! Event of the watch point (not used for now)
|
||||
};
|
||||
|
||||
/*! \brief Runtime structure used to communicate debug information
|
||||
* between Ocl services and core for a kernel dispatch.
|
||||
*/
|
||||
struct DebugToolInfo
|
||||
{
|
||||
uint64_t scratchAddress_; //! Scratch memory address
|
||||
size_t scratchSize_; //! Scratch memory size
|
||||
uint64_t globalAddress_; //! Global memory address
|
||||
uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled
|
||||
uint32_t exceptionMask_; //! Exception mask
|
||||
uint32_t reservedCuNum_; //! Number of reserved CUs for display,
|
||||
//! which ranges from 0 to 7 in the current implementation.
|
||||
bool monitorMode_; //! Debug or profiler mode
|
||||
bool gpuSingleStepMode_; //! SQ debug mode
|
||||
amd::Memory* trapHandler_; //! Trap handler address
|
||||
amd::Memory* trapBuffer_; //! Trap buffer address
|
||||
bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled
|
||||
};
|
||||
|
||||
/*! \brief Wave address message used by the KFD wave control for CI
 *
 *  Bit-packed location of a wave, consumed by the wave control function.
 */
struct HwDebugWaveAddr
{
    //! Virtual memory id
    uint32_t    VMID_ : 4;
    //! Wave id
    uint32_t    wave_ : 4;
    //! SIMD id
    uint32_t    SIMD_ : 2;
    //! Compute unit
    uint32_t    CU_   : 4;
    //! Shader array
    uint32_t    SH_   : 1;
    //! Shader engine
    uint32_t    SE_   : 1;
};
|
||||
|
||||
/*! \brief Kernel code information
|
||||
*
|
||||
* This structure contains the pointer of mapped kernel code for host access
|
||||
* and its size (in bytes)
|
||||
*/
|
||||
struct AqlCodeInfo
|
||||
{
|
||||
amd_kernel_code_t * aqlCode_; //! pointer of AQL code to allow host access
|
||||
uint32_t aqlCodeSize_; //! size of AQL code
|
||||
};
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
#endif // HWDBG_GPUDEBGGER_H_
|
||||
@@ -0,0 +1,361 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
|
||||
*
|
||||
* All rights reserved. This notice is intended as a precaution against
|
||||
* inadvertent publication and does not imply publication or any waiver
|
||||
* of confidentiality. The year included in the foregoing notice is the
|
||||
* year of creation of the work.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include "gpudebugmanager.hpp"
|
||||
#include "gpudevice.hpp"
|
||||
#include "platform/commandqueue.hpp"
|
||||
|
||||
#include "device/device.hpp"
|
||||
#include "device/gpu/gpumemory.hpp"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
namespace gpu {
|
||||
|
||||
class VirtualGPU;
|
||||
class Device;
|
||||
class Memory;
|
||||
|
||||
/*
|
||||
***************************************************************************
|
||||
* Implementation of GPU Debug Manager class
|
||||
***************************************************************************
|
||||
*/
|
||||
|
||||
//! Initializes the GPU debug manager state for \a device: no virtual GPU,
//! no message storage, no address watch storage and no AQL packet yet.
GpuDebugManager::GpuDebugManager(amd::Device* device)
    : HwDebugManager(device)
    , vGpu_(NULL)
    , debugMessages_(0)
    , addressWatch_(NULL)
    , addressWatchSize_(0)
    , oclEventHandle_(NULL)
    , aqlPacket_(NULL)
{
    // Default exception policy: nothing masked, waves resume, the host
    // ignores exceptions and wave commands are broadcast
    excpPolicy_.exceptionMask = 0x0;
    excpPolicy_.waveAction    = CL_DBG_WAVES_RESUME;
    excpPolicy_.hostAction    = CL_DBG_HOST_IGNORE;
    excpPolicy_.waveMode      = CL_DBG_WAVEMODE_BROADCAST;

    // Clear every kernel execution mode bit
    execMode_.ui32All = 0;

    // No runtime trap handler or trap buffer is installed yet
    rtTrapHandlerInfo_.trap_.trapHandler_ = NULL;
    rtTrapHandlerInfo_.trap_.trapBuffer_  = NULL;
}
|
||||
|
||||
//! Releases the address watch storage owned by the manager
GpuDebugManager::~GpuDebugManager()
{
    // delete [] on a null pointer is a no-op, so no explicit guard is needed
    delete [] addressWatch_;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
|
||||
void* toolInfo)
|
||||
{
|
||||
DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
|
||||
|
||||
aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
|
||||
|
||||
// Only if the pre-dispatch callback is set, will we update cache
|
||||
// flush configuration and build the memory descriptor.
|
||||
if (NULL != preDispatchCallBackFunc_) {
|
||||
// Build the scratch memory descriptor
|
||||
device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
|
||||
info->scratchAddress_,
|
||||
info->scratchSize_);
|
||||
|
||||
// Build the global memory descriptor
|
||||
device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
|
||||
info->globalAddress_);
|
||||
|
||||
// // for invalidate cache (BuildEndOfKernelNotifyCommands)
|
||||
// aqlPacket->release_fence_scope = 2;
|
||||
|
||||
cl_device_id clDeviceId = as_cl(device_);
|
||||
preDispatchCallBackFunc_(clDeviceId,
|
||||
oclEventHandle_,
|
||||
aqlPacket_,
|
||||
aclBinary_,
|
||||
deviceTrapInfo_,
|
||||
preDispatchCallBackArgs_);
|
||||
}
|
||||
|
||||
// Copy the various info set by the debugger/profiler to the tool info structure
|
||||
setupTrapInformation(info);
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::executePostDispatchCallBack()
|
||||
{
|
||||
if (NULL != postDispatchCallBackFunc_) {
|
||||
cl_device_id clDeviceId = as_cl(device_);
|
||||
postDispatchCallBackFunc_(clDeviceId,
|
||||
aqlPacket_->completion_signal.handle,
|
||||
postDispatchCallBackArgs_);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*! \brief Records the debugger's context and message storage
 *
 *  \param context         context the debugger is attached to
 *  \param messageStorage  message storage later handed to the virtual GPU by
 *                         registerDebuggerOnQueue()
 *  \return CL_SUCCESS, or CL_DEBUGGER_REGISTER_FAILURE_AMD when HW debug is
 *          not enabled in the device settings
 */
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
    //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist

    const bool hwDebugEnabled = device()->settings().enableHwDebug_;
    if (!hwDebugEnabled) {
        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    // Only the first registration stores the message buffer; the registered
    // flag itself is raised later by registerDebuggerOnQueue()
    if (!isRegistered()) {
        isRegistered_      = false;
        dbgMsgBufferReady_ = true;
        debugMessages_     = messageStorage;
    }

    context_ = context;

    return CL_SUCCESS;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::unregisterDebugger()
|
||||
{
|
||||
if (isRegistered()) {
|
||||
//! @todo: release the global mutex of HW debug
|
||||
|
||||
// reset the debugger registration flag
|
||||
isRegistered_ = false;
|
||||
dbgMsgBufferReady_ = false;
|
||||
|
||||
context_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief Completes the debugger registration on a command queue
 *
 *  \param vDevice  virtual device of the queue; treated as a gpu::VirtualGPU
 *  \return CL_SUCCESS when the debugger is (or already was) registered,
 *          CL_DEBUGGER_REGISTER_FAILURE_AMD when the message buffer is not
 *          ready or the virtual GPU rejects the registration
 */
cl_int
GpuDebugManager::registerDebuggerOnQueue(device::VirtualDevice* vDevice)
{
    // registerDebugger() must have supplied the message storage first
    if (!isMsgBufferReady()) {
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    // Already registered - nothing to be done
    if (isRegistered()) {
        return CL_SUCCESS;
    }

    VirtualGPU* const queueGpu = reinterpret_cast<gpu::VirtualGPU*>(vDevice);

    // Hand the debug message storage to the virtual GPU; it is used by the
    // GPU exception notification
    if (!queueGpu->RegisterHwDebugger(debugMessages_)) {
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    vGpu_         = queueGpu;
    isRegistered_ = true;
    return CL_SUCCESS;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::flushCache(uint32_t mask)
|
||||
{
|
||||
HwDbgGpuCacheMask cacheMask(mask);
|
||||
device()->xferQueue()->flushCuCaches(cacheMask);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
|
||||
{
|
||||
toolInfo->scratchAddress_ = 0;
|
||||
toolInfo->scratchSize_ = 0;
|
||||
toolInfo->globalAddress_ = 0;
|
||||
toolInfo->sqPerfcounterEnable_ = false;
|
||||
|
||||
// Set up trap related info in the kernel info structure to be
|
||||
// used in the kernel dispatch.
|
||||
toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
|
||||
toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
|
||||
toolInfo->monitorMode_ = execMode_.monitorMode;
|
||||
|
||||
// The order of these three bits is determined by the definition
|
||||
// of the register COMPUTE_DISPATCH_INITIATOR
|
||||
toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
|
||||
| (execMode_.disableL2Cache << 1)
|
||||
| (execMode_.disableL1Vector));
|
||||
|
||||
toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
|
||||
|
||||
toolInfo->trapHandler_ =
|
||||
as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapHandlerLocation]));
|
||||
toolInfo->trapBuffer_ =
|
||||
as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapBufferLocation]));
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
GpuDebugManager::getPacketAmdInfo(
|
||||
const void* aqlCodeInfo,
|
||||
void* packetInfo) const
|
||||
|
||||
{
|
||||
const AqlCodeInfo* codeInfo =
|
||||
reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
|
||||
|
||||
const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
|
||||
|
||||
PacketAmdInfo* packet =
|
||||
reinterpret_cast<PacketAmdInfo*>(packetInfo);
|
||||
|
||||
const amd_kernel_code_t* akc = hostAqlCode;
|
||||
|
||||
packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
|
||||
packet->numberOfVgprs_ = akc->workitem_vgpr_count;
|
||||
|
||||
// use mapped kernel_object_address for host accessing of ISA buffer
|
||||
packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
|
||||
akc->kernel_code_entry_byte_offset;
|
||||
|
||||
packet->scratchBufferWaveOffset_ =
|
||||
akc->debug_wavefront_private_segment_offset_sgpr;
|
||||
|
||||
packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
|
||||
|
||||
packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
|
||||
|
||||
// The trap_reserved_vgpr_index will be 4 less the original
|
||||
// This value must be used only by the debugger
|
||||
packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
|
||||
}
|
||||
|
||||
/*! \brief Creates an OS event and registers it for GPU exception notification
 *
 *  \param autoReset  requested reset behaviour; the OS event is created with
 *                    the inverted flag (presumably "manual reset" - confirm
 *                    against osEventCreate)
 *  \return the event handle, or 0 when the debugger is not registered, the OS
 *          event cannot be created, or the virtual GPU rejects the
 *          notification registration
 */
DebugEvent
GpuDebugManager::createDebugEvent(
    const bool autoReset)
{
    if (!isRegistered()) {
        LogError("debugmanager: Failed to create debug event - hw debug is not available");
        return 0;
    }

    // create the event object
    osEventHandle shaderEvent = osEventCreate(!autoReset);
    if (shaderEvent == 0) {
        return 0;
    }

    // initial state is non-signaled
    osEventReset(shaderEvent);

    if (!vGpu_->ExceptionNotification(shaderEvent)) {
        // The event was never handed out, so release it here instead of
        // leaking the OS handle
        osEventDestroy(shaderEvent);
        return 0;
    }

    isRegistered_ = true;
    return shaderEvent;
}
|
||||
|
||||
/*! \brief Waits on a debug event with a time-out
 *
 *  \param pEvent   event to wait on
 *  \param timeOut  time-out passed through to osEventTimedWait()
 *  \return CL_SUCCESS when the timed wait succeeds, otherwise
 *          CL_EVENT_TIMEOUT_AMD
 */
cl_int
GpuDebugManager::waitDebugEvent(
    DebugEvent  pEvent,
    uint32_t    timeOut) const
{
    const bool signaled = osEventTimedWait(pEvent, timeOut);

    return signaled ? CL_SUCCESS : CL_EVENT_TIMEOUT_AMD;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
|
||||
{
|
||||
osEventDestroy(*pEvent);
|
||||
*pEvent = 0;
|
||||
|
||||
vGpu_->ExceptionNotification(0);
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::wavefrontControl(
|
||||
uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
uint32_t trapId,
|
||||
void* waveAddr) const
|
||||
{
|
||||
device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::setAddressWatch(
|
||||
uint32_t numWatchPoints,
|
||||
void** watchAddress,
|
||||
uint64_t* watchMask,
|
||||
uint64_t* watchMode,
|
||||
DebugEvent* event)
|
||||
{
|
||||
size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
|
||||
|
||||
// previously allocated size is not big enough, allocate new memory
|
||||
if (addressWatchSize_ < requiredSize) {
|
||||
if (NULL != addressWatch_) { // free the smaller address watch storage
|
||||
delete [] addressWatch_;
|
||||
}
|
||||
addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
|
||||
addressWatchSize_ = requiredSize;
|
||||
}
|
||||
|
||||
// fill in the address watch structure
|
||||
memset(addressWatch_, 0, addressWatchSize_);
|
||||
|
||||
for (uint32_t i = 0; i < numWatchPoints; i++)
|
||||
{
|
||||
amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
|
||||
Memory* watchMemAddress = device()->getGpuMemory(watchMem);
|
||||
|
||||
addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
|
||||
addressWatch_[i].watchMask_ = watchMask[i];
|
||||
addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
|
||||
addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
|
||||
}
|
||||
|
||||
// setup the watch addresses
|
||||
device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::setGlobalMemory(
|
||||
amd::Memory* memObj,
|
||||
uint32_t offset,
|
||||
void* srcPtr,
|
||||
uint32_t size)
|
||||
{
|
||||
gpu::Memory* globalMem = device()->getGpuMemory(memObj);
|
||||
|
||||
address mappedMem = static_cast<address>(globalMem->map(NULL,0));
|
||||
assert(mappedMem != 0);
|
||||
|
||||
void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
|
||||
memcpy(dest_ptr, srcPtr, size);
|
||||
|
||||
globalMem->unmap(NULL);
|
||||
}
|
||||
|
||||
|
||||
} // namespace gpu
|
||||
@@ -0,0 +1,132 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
|
||||
*
|
||||
* All rights reserved. This notice is intended as a precaution against
|
||||
* inadvertent publication and does not imply publication or any waiver
|
||||
* of confidentiality. The year included in the foregoing notice is the
|
||||
* year of creation of the work.
|
||||
*
|
||||
******************************************************************************/
|
||||
#ifndef HWDBG_DEBUGMANAGER_H__
|
||||
#define HWDBG_DEBUGMANAGER_H__
|
||||
|
||||
#include "gpuvirtual.hpp"
|
||||
#include "gpudebugger.hpp"
|
||||
|
||||
namespace gpu {
|
||||
|
||||
class GpuDebugManager;
|
||||
class Device;
|
||||
class Memory;
|
||||
|
||||
|
||||
/*! \brief Debug Manager Class
|
||||
*
|
||||
* The debug manager class is used to pass all the trap info to the
|
||||
* kernel dispatch and then the kernel execution can use such trap information
|
||||
* for kernel execution. This class contains the trap handler and shader event
|
||||
* objects. The trap handler is setup by users and passed to the kernel dispatch.
|
||||
* The shader event is to receive interrupts from the GPU and then users can
|
||||
* perform various operations.
|
||||
*
|
||||
* This class also provides the interface for setting up the pre-dispatch
|
||||
* callback functions used by the profiler and debugger. It also provides
|
||||
* a way to retrieve various debug information for the kernel execution.
|
||||
*
|
||||
*/
|
||||
class GpuDebugManager : public amd::HwDebugManager {
|
||||
public:
|
||||
|
||||
//! Constructor of the debug manager class
|
||||
GpuDebugManager(amd::Device* device);
|
||||
|
||||
//! Destructor of the debug manager class
|
||||
~GpuDebugManager();
|
||||
|
||||
//! Get the single instance of the GpuDebugManager class
|
||||
static GpuDebugManager* getDefaultInstance();
|
||||
|
||||
//! Destroy the GpuDebugManager class object
|
||||
static void destroyInstances();
|
||||
|
||||
//! Flush cache
|
||||
void flushCache(uint32_t mask);
|
||||
|
||||
//! Create the debug event
|
||||
DebugEvent createDebugEvent(const bool autoReset);
|
||||
|
||||
//! Wait for the debug event
|
||||
cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
|
||||
|
||||
//! Destroy the debug event
|
||||
void destroyDebugEvent(DebugEvent* pEvent);
|
||||
|
||||
//! Register the debugger
|
||||
cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
|
||||
|
||||
//! Register the debugger with KMD after command queue has been created
|
||||
cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice);
|
||||
|
||||
//! Unregister the debugger
|
||||
void unregisterDebugger();
|
||||
|
||||
//! Send the wavefront control cmmand
|
||||
void wavefrontControl(uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
uint32_t trapId,
|
||||
void* waveAddr) const;
|
||||
|
||||
//! Set address watching point
|
||||
void setAddressWatch(uint32_t numWatchPoints,
|
||||
void** watchAddress,
|
||||
uint64_t* watchMask,
|
||||
uint64_t* watchMode,
|
||||
DebugEvent* pEvent);
|
||||
|
||||
//! Get the packet information for dispatch
|
||||
void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
|
||||
|
||||
//! Set global memory values
|
||||
void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
|
||||
|
||||
//! Execute the post-dispatch callback function
|
||||
void executePostDispatchCallBack();
|
||||
|
||||
//! Execute the pre-dispatch callback function
|
||||
void executePreDispatchCallBack(void* aqlPacket,
|
||||
void* toolInfo);
|
||||
|
||||
private:
|
||||
|
||||
//! Setup trap handler info for kernel execution
|
||||
void setupTrapInformation(DebugToolInfo* toolInfo);
|
||||
|
||||
|
||||
protected:
|
||||
|
||||
const VirtualGPU* vGpu() const { return vGpu_; }
|
||||
|
||||
private:
|
||||
|
||||
const gpu::Device* device() const {
|
||||
return reinterpret_cast<const gpu::Device *>(device_); }
|
||||
|
||||
VirtualGPU* vGpu_; //!< the virtual GPU
|
||||
|
||||
uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
|
||||
|
||||
HwDbgAddressWatch* addressWatch_; //!< Address watch data
|
||||
size_t addressWatchSize_; //!< Size of address watch data
|
||||
|
||||
//! Arguments used by the callback function
|
||||
void* oclEventHandle_; //!< event handler
|
||||
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
#endif // HWDBG_DEBUGMANAGER_H__
|
||||
@@ -38,6 +38,8 @@
|
||||
#include <iostream>
|
||||
#include <ctype.h>
|
||||
|
||||
#include "gpudebugmanager.hpp"
|
||||
|
||||
bool DeviceLoad()
|
||||
{
|
||||
bool ret = false;
|
||||
@@ -890,6 +892,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#ifdef DEBUG
|
||||
std::stringstream message;
|
||||
if (settings().remoteAlloc_) {
|
||||
@@ -1225,7 +1228,7 @@ Device::init()
|
||||
{
|
||||
CALuint numDevices = 0;
|
||||
bool result = false;
|
||||
bool useDeviceList = false;
|
||||
bool useDeviceList = false;
|
||||
requestedDevices_t requestedDevices;
|
||||
|
||||
const char *library = getenv("COMPILER_LIBRARY");
|
||||
@@ -2662,4 +2665,27 @@ Device::SrdManager::fillResourceList(std::vector<const Resource*>& memList)
|
||||
}
|
||||
}
|
||||
|
||||
/*! \brief Creates the HW debug manager and registers the debugger with it
 *
 *  \param context         context the debugger is attached to
 *  \param messageStorage  debug message storage handed to the manager
 *  \return the status of GpuDebugManager::registerDebugger(); on failure no
 *          debug manager is left installed
 *
 *  A manager left over from an earlier call is unregistered and released
 *  first, so repeated initialization does not leak it.
 */
cl_int
Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
{
    if (NULL != hwDebugMgr_) {
        hwDebugMgr_->unregisterDebugger();
        delete hwDebugMgr_;
        hwDebugMgr_ = NULL;
    }

    hwDebugMgr_ = new GpuDebugManager(this);
    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);

    if (CL_SUCCESS != status) {
        delete hwDebugMgr_;
        hwDebugMgr_ = NULL;
    }

    return status;
}
|
||||
|
||||
void
|
||||
Device::hwDebugManagerRemove()
|
||||
{
|
||||
hwDebugMgr_->unregisterDebugger();
|
||||
|
||||
delete hwDebugMgr_;
|
||||
hwDebugMgr_ = NULL;
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -560,6 +560,12 @@ public:
|
||||
//! Returns SRD manager object
|
||||
SrdManager& srds() const { return *srdManager_; }
|
||||
|
||||
//! Initialize the Hardware Debug Manager
|
||||
cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
|
||||
|
||||
//! Remove the Hardware Debug Manager
|
||||
void hwDebugManagerRemove();
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Device(const Device&);
|
||||
|
||||
@@ -3510,6 +3510,7 @@ HSAILKernel::HSAILKernel(std::string name,
|
||||
, prog_(*prog)
|
||||
, index_(0)
|
||||
, code_(NULL)
|
||||
, codeSize_(0)
|
||||
, hwMetaData_(NULL)
|
||||
{
|
||||
hsa_ = true;
|
||||
@@ -3924,6 +3925,11 @@ HSAILKernel::loadArguments(
|
||||
mem->signalWrite(&dev());
|
||||
}
|
||||
memList.push_back(gpuMem);
|
||||
|
||||
// save the memory object pointer to allow global memory access
|
||||
if (NULL != dev().hwDebugMgr()) {
|
||||
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
|
||||
}
|
||||
}
|
||||
// If it is a local pointer
|
||||
else {
|
||||
|
||||
@@ -862,7 +862,10 @@ public:
|
||||
const void* cpuAqlCode() const { return cpuAqlCode_; }
|
||||
|
||||
//! Returns memory object with AQL code
|
||||
const gpu::Memory* gpuAqlCode() const { return code_; }
|
||||
gpu::Memory* gpuAqlCode() const { return code_; }
|
||||
|
||||
//! Returns size of AQL code
|
||||
size_t aqlCodeSize() const { return codeSize_; }
|
||||
|
||||
//! Returns the size of argument buffer
|
||||
size_t argsBufferSize() const
|
||||
@@ -883,7 +886,7 @@ public:
|
||||
amd::NDRange& lclWorkSize //!< Local work size
|
||||
) const;
|
||||
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! if the kernel arguments were successfully loaded, otherwise NULL
|
||||
hsa_kernel_dispatch_packet_t* loadArguments(
|
||||
VirtualGPU& gpu, //!< Running GPU context
|
||||
@@ -939,6 +942,8 @@ private:
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
gpu::Memory* code_; //!< Memory object with ISA code
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
char* hwMetaData_; //!< SI metadata
|
||||
|
||||
union Flags {
|
||||
|
||||
@@ -363,7 +363,8 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
elementSize_ = static_cast<CALuint>(memoryFormatSize(cal()->format_).size_);
|
||||
cal_.type_ = memType;
|
||||
if (memType == Scratch) {
|
||||
cal_.type_ = Local;
|
||||
// use local memory for scratch buffer unless it is using HW DEBUG
|
||||
cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
|
||||
cal_.scratch_ = true;
|
||||
}
|
||||
|
||||
@@ -463,7 +464,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
|
||||
else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) {
|
||||
// Make sure runtime didn't pick a resource with > 4GB address
|
||||
if ((cal()->dimension_ == GSL_MOA_BUFFER) &&
|
||||
(static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() +
|
||||
(static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() +
|
||||
gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) {
|
||||
gslRef_->release();
|
||||
gslRef_ = NULL;
|
||||
|
||||
@@ -172,13 +172,17 @@ HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
|
||||
|
||||
address codeStartAddress = reinterpret_cast<address>(akc);
|
||||
address codeEndAddress = reinterpret_cast<address>(hcd) + siMetaData->common.codeLenInByte;
|
||||
uint64_t codeSize = codeEndAddress - codeStartAddress;
|
||||
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize, gpu::ConstBuffer::VectorSize));
|
||||
codeSize_ = codeEndAddress - codeStartAddress;
|
||||
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
|
||||
|
||||
// force to use remote memory for HW DEBUG
|
||||
Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC;
|
||||
|
||||
// Initialize kernel ISA code
|
||||
if ((code_ != NULL) && code_->create(Resource::Local)) {
|
||||
if ((code_ != NULL) && code_->create(resMemType)) {
|
||||
address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
|
||||
// Copy only amd_kernel_code_t
|
||||
memcpy(cpuCodePtr, codeStartAddress, codeSize);
|
||||
memcpy(cpuCodePtr, codeStartAddress, codeSize_);
|
||||
code_->unmap(NULL);
|
||||
}
|
||||
else {
|
||||
|
||||
@@ -134,6 +134,7 @@ Settings::Settings()
|
||||
|
||||
// Use host queue for device enqueuing by default
|
||||
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
|
||||
|
||||
}
|
||||
|
||||
bool
|
||||
@@ -311,7 +312,7 @@ Settings::create(
|
||||
calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR;
|
||||
}
|
||||
else {
|
||||
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|
||||
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|
||||
|| (oclVersion_ >= OpenCL20)))) {
|
||||
use64BitPtr_ = true;
|
||||
}
|
||||
@@ -440,6 +441,11 @@ Settings::create(
|
||||
if (oclVersion_ >= OpenCL20) {
|
||||
enableExtension(ClKhrSubGroups);
|
||||
enableExtension(ClKhrDepthImages);
|
||||
|
||||
// Enable HW debug
|
||||
if (GPU_ENABLE_HW_DEBUG) {
|
||||
enableHwDebug_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (apuSystem_ &&
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
#include "device/gpu/gputhreadtrace.hpp"
|
||||
#include "device/gpu/gputimestamp.hpp"
|
||||
#include "device/gpu/gpublit.hpp"
|
||||
#include "device/gpu/gpudebugger.hpp"
|
||||
#include "hsa.h"
|
||||
#include "sc-hsa/Interface/SCHSAInterface.h"
|
||||
#include <fstream>
|
||||
@@ -402,6 +403,7 @@ VirtualGPU::VirtualGPU(
|
||||
, schedParamIdx_(0)
|
||||
, deviceQueueSize_(0)
|
||||
, hsaQueueMem_(NULL)
|
||||
, useHwDebug_(false)
|
||||
{
|
||||
memset(&cal_, 0, sizeof(CalVirtualDesc));
|
||||
for (uint i = 0; i < AllEngines; ++i) {
|
||||
@@ -585,6 +587,14 @@ VirtualGPU::create(
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if HW Debug is used and register the debugger if not done yet
|
||||
amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
|
||||
|
||||
if ( dbgManager && dbgManager->isMsgBufferReady() ) {
|
||||
if ( dbgManager->registerDebuggerOnQueue(this) == CL_SUCCESS ) {
|
||||
useHwDebug_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1720,6 +1730,12 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
hsaKernel.prog().kernelTable()->vmAddress());
|
||||
}
|
||||
|
||||
// setup the storage for the memory pointers of the kernel parameters
|
||||
uint numParams = kernel.signature().numParameters();
|
||||
if (useHwDebug_) {
|
||||
dev().hwDebugMgr()->allocParamMemList(numParams);
|
||||
}
|
||||
|
||||
// Program the kernel arguments for the GPU execution
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt =
|
||||
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
|
||||
@@ -1745,10 +1761,25 @@ VirtualGPU::submitKernelInternalHSA(
|
||||
addVmMemory(memList[i]);
|
||||
}
|
||||
|
||||
// HW Debug for the kernel?
|
||||
HwDbgKernelInfo kernelInfo;
|
||||
HwDbgKernelInfo *pKernelInfo = NULL;
|
||||
|
||||
if (useHwDebug_) {
|
||||
buildKernelInfo(hsaKernel, aqlPkt, kernelInfo);
|
||||
pKernelInfo = &kernelInfo;
|
||||
}
|
||||
|
||||
GpuEvent gpuEvent;
|
||||
// Run AQL dispatch in HW
|
||||
runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
|
||||
scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
|
||||
scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
|
||||
|
||||
if (useHwDebug_) {
|
||||
if (NULL != dev().hwDebugMgr()->postDispatchCallBackFunc()) {
|
||||
dev().hwDebugMgr()->executePostDispatchCallBack();
|
||||
}
|
||||
}
|
||||
|
||||
if (hsaKernel.dynamicParallelism()) {
|
||||
// Make sure of exclusive access to the device queue
|
||||
@@ -3410,4 +3441,155 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
|
||||
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask)
|
||||
{
|
||||
//! @todo: fix issue of no event available for the flush/invalidate cache command
|
||||
InvalidateSqCaches(cache_mask.sqICache_,
|
||||
cache_mask.sqKCache_,
|
||||
cache_mask.tcL1_,
|
||||
cache_mask.tcL2_);
|
||||
|
||||
flushDMA(engineID_);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt,
|
||||
HwDbgKernelInfo& kernelInfo)
|
||||
{
|
||||
amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
|
||||
assert (dbgManager && "No HW Debug Manager!");
|
||||
|
||||
// Initialize structure with default values
|
||||
|
||||
if (hsaKernel.prog().maxScratchRegs() > 0) {
|
||||
gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObjs_[0];
|
||||
kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
|
||||
kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();
|
||||
|
||||
// Get the address of the scratch buffer and its size for CPU access
|
||||
address scratchRingAddr = NULL;
|
||||
scratchRingAddr = static_cast<address>(scratchBuf->map(NULL, 0));
|
||||
dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
|
||||
scratchBuf->unmap(NULL);
|
||||
}
|
||||
else {
|
||||
kernelInfo.scratchBufAddr = 0;
|
||||
kernelInfo.scratchBufferSizeInBytes = 0;
|
||||
dbgManager->setScratchRing(NULL, 0);
|
||||
}
|
||||
|
||||
|
||||
//! @todo: need to verify what is wanted for the global memory
|
||||
kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress();
|
||||
|
||||
kernelInfo.pAqlDispatchPacket = aqlPkt;
|
||||
kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());
|
||||
|
||||
// Get the address of the kernel code and its size for CPU access
|
||||
gpu::Memory* aqlCode = hsaKernel.gpuAqlCode();
|
||||
if (NULL != aqlCode) {
|
||||
address aqlCodeAddr = static_cast<address>(aqlCode->map(NULL, 0));
|
||||
dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
|
||||
aqlCode->unmap(NULL);
|
||||
}
|
||||
else {
|
||||
dbgManager->setKernelCodeInfo(NULL, 0);
|
||||
}
|
||||
|
||||
kernelInfo.trapPresent = false;
|
||||
kernelInfo.trapHandler = NULL;
|
||||
kernelInfo.trapHandlerBuffer = NULL;
|
||||
|
||||
kernelInfo.excpEn = 0;
|
||||
kernelInfo.cacheDisableMask = 0;
|
||||
kernelInfo.sqDebugMode = 0;
|
||||
|
||||
kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
|
||||
kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;
|
||||
|
||||
// set kernel info for HW debug and call the callback function
|
||||
if (NULL != dbgManager->preDispatchCallBackFunc()) {
|
||||
DebugToolInfo dbgSetting;
|
||||
dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
|
||||
dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
|
||||
dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;
|
||||
|
||||
// Call the predispatch callback function & set the trap info
|
||||
AqlCodeInfo aqlCodeInfo;
|
||||
aqlCodeInfo.aqlCode_ = (amd_kernel_code_t *) hsaKernel.cpuAqlCode();
|
||||
aqlCodeInfo.aqlCodeSize_ = hsaKernel.aqlCodeSize();
|
||||
|
||||
// Execute the pre-dispatch call back function
|
||||
dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);
|
||||
|
||||
// assign the TMA and TBA for kernel dispatch
|
||||
if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
|
||||
assignTrapHandler(dbgSetting, kernelInfo);
|
||||
}
|
||||
|
||||
kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;
|
||||
|
||||
// Execption policy
|
||||
kernelInfo.excpEn = dbgSetting.exceptionMask_;
|
||||
kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
|
||||
kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;
|
||||
|
||||
// Compute the mask for reserved CUs. These two dwords correspond to
|
||||
// two registers used for reserving CUs for display. In the current
|
||||
// implementation, the number of CUs reserved can be 0 to 7, and it
|
||||
// is set by debugger users.
|
||||
if (dbgSetting.monitorMode_) {
|
||||
uint32_t i = dbgSetting.reservedCuNum_ / 2;
|
||||
kernelInfo.mgmtSe0Mask <<= i;
|
||||
i = dbgSetting.reservedCuNum_ - i;
|
||||
kernelInfo.mgmtSe1Mask <<= i;
|
||||
}
|
||||
|
||||
// flush/invalidate the instruction, data, L1 and L2 caches
|
||||
InvalidateSqCaches();
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
|
||||
HwDbgKernelInfo& kernelInfo)
|
||||
{
|
||||
|
||||
Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
|
||||
Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
|
||||
|
||||
addVmMemory(trapHandlerMem);
|
||||
addVmMemory(trapBufferMem);
|
||||
|
||||
// Handle TMA corruption hw bug workaround -
|
||||
// The trap handler buffer has extra 256 bytes allocated, the TMA address
|
||||
// is stored in the first two DWORDs and the actual trap handler code
|
||||
// is stored starting at the location of 256 bytes.
|
||||
//
|
||||
// - kernelInfo.trapHandler points directly to the trap handler code
|
||||
// - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
|
||||
//
|
||||
kernelInfo.trapHandler = reinterpret_cast<void *>(trapHandlerMem->vmAddress() + TbaStartOffset);
|
||||
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(trapBufferMem->vmAddress());
|
||||
|
||||
// Address of the trap handler code/buffer should be 256-byte aligned
|
||||
uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
|
||||
if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
|
||||
|| (tmaAddress & 0xFF) != 0) {
|
||||
assert(false && "Trap handler/buffer is not 256-byte aligned");
|
||||
}
|
||||
|
||||
// map the trap handler buffer address for host access, and store the trap
|
||||
// buffer address at the beginning of the allocated buffer
|
||||
address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
|
||||
uint32_t * tmaStorage = reinterpret_cast<uint32_t *>(trapHandlerAddress);
|
||||
tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
|
||||
tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
|
||||
trapHandlerMem->unmap(NULL);
|
||||
}
|
||||
|
||||
} // namespace gpu
|
||||
|
||||
@@ -12,6 +12,9 @@
|
||||
#include "device/gpu/gpusched.hpp"
|
||||
#include "device/blit.hpp"
|
||||
|
||||
#include "device/gpu/gpudebugger.hpp"
|
||||
|
||||
|
||||
/*! \addtogroup GPU GPU Resource Implementation
|
||||
* @{
|
||||
*/
|
||||
@@ -28,6 +31,7 @@ class VirtualGPU;
|
||||
class Program;
|
||||
class BlitManager;
|
||||
class ThreadTrace;
|
||||
class HSAILKernel;
|
||||
|
||||
//! Virtual GPU
|
||||
class VirtualGPU : public device::VirtualDevice, public CALGSLContext
|
||||
@@ -400,6 +404,8 @@ public:
|
||||
State state_; //!< virtual GPU current state
|
||||
CalVirtualDesc cal_; //!< CAL virtual device descriptor
|
||||
|
||||
void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
|
||||
|
||||
protected:
|
||||
virtual void profileEvent(EngineType engine, bool type) const;
|
||||
|
||||
@@ -496,6 +502,17 @@ private:
|
||||
const amd::BufferRect& dstRect //!< region of destination for copy
|
||||
);
|
||||
|
||||
void buildKernelInfo(
|
||||
const HSAILKernel& hsaKernel, //!< hsa kernel
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
|
||||
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
||||
);
|
||||
|
||||
void assignTrapHandler(
|
||||
const DebugToolInfo& dbgSetting, //!< debug settings
|
||||
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
||||
);
|
||||
|
||||
GslKernels gslKernels_; //!< GSL kernel descriptors
|
||||
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
|
||||
GpuEvents gpuEvents_; //!< GPU events
|
||||
@@ -534,6 +551,8 @@ private:
|
||||
uint deviceQueueSize_; //!< Device queue size
|
||||
|
||||
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
|
||||
|
||||
bool useHwDebug_; //!< Flag of using HW debug
|
||||
};
|
||||
|
||||
/*@}*/} // namespace gpu
|
||||
|
||||
@@ -440,7 +440,7 @@ CALGSLContext::isDone(GpuEvent* event)
|
||||
if (m_eventQueue[event->engineId_].isDone(event->id))
|
||||
{
|
||||
event->invalidate();
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@@ -1269,10 +1269,10 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
|
||||
void
|
||||
CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
|
||||
const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
|
||||
const void* cpuKernelCode, uint64 hsaQueueVA)
|
||||
const void* cpuKernelCode, uint64 hsaQueueVA, const void* kernelInfo)
|
||||
{
|
||||
eventBegin(MainEngine);
|
||||
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
|
||||
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA, kernelInfo);
|
||||
eventEnd(MainEngine, event);
|
||||
}
|
||||
|
||||
@@ -1299,3 +1299,30 @@ CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mc
|
||||
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
|
||||
eventEnd(MainEngine, event);
|
||||
}
|
||||
|
||||
bool
|
||||
CALGSLContext::RegisterHwDebugger(uint64 debugMessages)
|
||||
{
|
||||
return m_cs->registerHwDebugger(debugMessages);
|
||||
}
|
||||
|
||||
bool
|
||||
CALGSLContext::ExceptionNotification(osEventHandle debugEvent)
|
||||
{
|
||||
return m_cs->exceptionNotification(debugEvent);
|
||||
}
|
||||
|
||||
void
|
||||
CALGSLContext::InvalidateSqCaches(bool instInvalidate, bool dataInvalidate, bool tcL1, bool tcL2)
|
||||
{
|
||||
// invalidating instruction/data L1 caches using Escape
|
||||
if (instInvalidate || dataInvalidate) {
|
||||
m_cs->invalidateSqCaches(instInvalidate, dataInvalidate);
|
||||
}
|
||||
|
||||
if (tcL1) {
|
||||
flushCUCaches(tcL2);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -44,7 +44,8 @@ public:
|
||||
bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
|
||||
bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
|
||||
void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
|
||||
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
|
||||
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode,
|
||||
uint64 hsaQueueVA, const void* kernelInfo);
|
||||
mcaddr virtualQueueDispatcherStart();
|
||||
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems,
|
||||
mcaddr signal, mcaddr loopStart, uint32 numTemplates);
|
||||
@@ -140,6 +141,11 @@ public:
|
||||
void writeTimer(bool sdma, const gslMemObject mem, uint32 offset) const;
|
||||
void writeSurfRaw(GpuEvent& event, gslMemObject mem, size_t size, const void* data);
|
||||
|
||||
/// HW Debug support functions
|
||||
bool RegisterHwDebugger(uint64 debugMessages);
|
||||
bool ExceptionNotification(osEventHandle debugEvent);
|
||||
void InvalidateSqCaches(bool instInvalidate = true, bool dataInvalidate = true, bool tcL1 = true, bool tcL2 = true);
|
||||
|
||||
protected:
|
||||
void setScratchBuffer(gslMemObject mem, int32 engineId);
|
||||
virtual void profileEvent(EngineType engine, bool type) const {}
|
||||
|
||||
@@ -0,0 +1,175 @@
|
||||
/*******************************************************************************
|
||||
*
|
||||
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
|
||||
*
|
||||
* All rights reserved. This notice is intended as a precaution against
|
||||
* inadvertent publication and does not imply publication or any waiver
|
||||
* of confidentiality. The year included in the foregoing notice is the
|
||||
* year of creation of the work.
|
||||
*
|
||||
******************************************************************************/
|
||||
|
||||
#include "hwdebug.hpp"
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
namespace amd {
|
||||
|
||||
class Device;
|
||||
|
||||
/*
|
||||
***************************************************************************
|
||||
* Implementation of GPU Debug Manager class
|
||||
***************************************************************************
|
||||
*/
|
||||
|
||||
//! Constructor of the debug manager class
|
||||
HwDebugManager::HwDebugManager(amd::Device* device)
|
||||
: context_(NULL)
|
||||
, device_(device)
|
||||
, preDispatchCallBackFunc_(NULL)
|
||||
, postDispatchCallBackFunc_(NULL)
|
||||
, preDispatchCallBackArgs_(NULL)
|
||||
, postDispatchCallBackArgs_(NULL)
|
||||
, paramMemory_(NULL)
|
||||
, numParams_(0)
|
||||
, aclBinary_(NULL)
|
||||
, aqlCodeAddr_(NULL)
|
||||
, aqlCodeSize_(0)
|
||||
, scratchRingAddr_(NULL)
|
||||
, scratchRingSize_(0)
|
||||
, isRegistered_(false)
|
||||
, dbgMsgBufferReady_(false)
|
||||
{
|
||||
memset(&debugInfo_, 0, sizeof(debugInfo_));
|
||||
|
||||
memset(deviceTrapInfo_, 0, sizeof(uint64_t) * kDebugTrapLocationMax);
|
||||
}
|
||||
|
||||
//! Destructor: releases the kernel parameter memory list
HwDebugManager::~HwDebugManager()
{
    // delete[] on a NULL pointer is a no-op, so no explicit check is needed
    delete[] paramMemory_;
}
|
||||
|
||||
//! Setup the call back function pointer
|
||||
void
|
||||
HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction,
|
||||
cl_PostDispatchCallBackFunctionAMD postDispatchFunction)
|
||||
{
|
||||
preDispatchCallBackFunc_ = preDispatchFunction;
|
||||
postDispatchCallBackFunc_ = postDispatchFunction;
|
||||
}
|
||||
|
||||
//! Setup the call back argument pointers
|
||||
void
|
||||
HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs)
|
||||
{
|
||||
preDispatchCallBackArgs_ = preDispatchArgs;
|
||||
postDispatchCallBackArgs_ = postDispatchArgs;
|
||||
}
|
||||
|
||||
//! Get dispatch debug info
|
||||
void
|
||||
HwDebugManager::getDispatchDebugInfo(void* debugInfo) const
|
||||
{
|
||||
memcpy(debugInfo, (void*) &debugInfo_, sizeof(DispatchDebugInfo));
|
||||
}
|
||||
|
||||
|
||||
//! Set the kernel code address and its size
|
||||
void
|
||||
HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize)
|
||||
{
|
||||
aqlCodeAddr_ = aqlCodeAddr;
|
||||
aqlCodeSize_ = aqlCodeSize;
|
||||
}
|
||||
|
||||
//! Get the scratch ring
|
||||
void
|
||||
HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize)
|
||||
{
|
||||
scratchRingAddr_ = scratchRingAddr;
|
||||
scratchRingSize_ = scratchRingSize;
|
||||
}
|
||||
|
||||
//! Map the shader (AQL code) for host access
|
||||
void
|
||||
HwDebugManager::mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const
|
||||
{
|
||||
*aqlCodeAddr = reinterpret_cast<uint64_t>(aqlCodeAddr_);
|
||||
*aqlCodeSize = aqlCodeSize_;
|
||||
}
|
||||
|
||||
//! Map the scratch ring for host access
|
||||
void
|
||||
HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const
|
||||
{
|
||||
*scratchRingAddr = reinterpret_cast<uint64_t>(scratchRingAddr_);
|
||||
*scratchRingSize = scratchRingSize_;
|
||||
}
|
||||
|
||||
void
|
||||
HwDebugManager::setExceptionPolicy(void* exceptionPolicy)
|
||||
{
|
||||
memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd));
|
||||
}
|
||||
|
||||
void
|
||||
HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const
|
||||
{
|
||||
memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd));
|
||||
}
|
||||
|
||||
void
|
||||
HwDebugManager::setKernelExecutionMode(void* mode)
|
||||
{
|
||||
cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
|
||||
execMode_.ui32All = execMode->ui32All;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
HwDebugManager::getKernelExecutionMode(void* mode) const
|
||||
{
|
||||
cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
|
||||
execMode->ui32All = execMode_.ui32All;
|
||||
}
|
||||
|
||||
void
|
||||
HwDebugManager::setAclBinary(void* aclBinary)
|
||||
{
|
||||
aclBinary_ = aclBinary;
|
||||
}
|
||||
|
||||
void
|
||||
HwDebugManager::allocParamMemList(uint32_t numParams)
|
||||
{
|
||||
if (NULL != paramMemory_) {
|
||||
delete [] paramMemory_;
|
||||
}
|
||||
|
||||
numParams_ = numParams;
|
||||
paramMemory_ = new amd::Memory*[numParams];
|
||||
}
|
||||
|
||||
//! Returns the memory object recorded for a kernel parameter
//!
//! \param paramIdx  index of the kernel parameter; must be below the
//!                  parameter count (checked with an assert only)
//! \return the recorded memory object converted with as_cl()
cl_mem
HwDebugManager::getKernelParamMem(uint32_t paramIdx) const
{
    assert((paramIdx < numParams_) && "Invalid kernel parameter index too big");

    amd::Memory* paramMem = paramMemory_[paramIdx];
    return as_cl(paramMem);
}
|
||||
|
||||
void
|
||||
HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem)
|
||||
{
|
||||
assert((paramIdx < numParams_) && "Invalid kernel parameter index too big");
|
||||
|
||||
paramMemory_[paramIdx] = mem;
|
||||
}
|
||||
|
||||
} // namespace amd
|
||||
@@ -5,42 +5,67 @@
|
||||
#ifndef HWDEBUG_H_
|
||||
#define HWDEBUG_H_
|
||||
|
||||
#include "device.hpp"
|
||||
#include "amdocl/cl_debugger_amd.h"
|
||||
|
||||
#define TBA_START_OFFSET 256
|
||||
static const int TbaStartOffset = 256;
|
||||
|
||||
/**
|
||||
*******************************************************************************
|
||||
* @brief Debug information required by the AMD debugger
|
||||
* This might have to be moved to a private header. We could provide
|
||||
* these services as a seperate dll.
|
||||
* @details The information is populated by the function oclGetDebugInfo
|
||||
*******************************************************************************
|
||||
static const int RtTrapBufferWaveSize = 64;
|
||||
static const int RtTrapBufferSeNum = 4;
|
||||
static const int RtTrapBufferShNum = 2;
|
||||
static const int RtTrapBufferCuNum = 16;
|
||||
static const int RtTrapBufferSimdNum = 4;
|
||||
static const int RtTrapBufferWaveNum = 16;
|
||||
static const int RtTrapBufferTotalWaveNum =
|
||||
((RtTrapBufferSeNum) * \
|
||||
(RtTrapBufferShNum) * \
|
||||
(RtTrapBufferCuNum) * \
|
||||
(RtTrapBufferSimdNum) * \
|
||||
(RtTrapBufferWaveNum));
|
||||
|
||||
|
||||
/*! \brief Debug trap handler location in the runtime trap buffer
|
||||
*
|
||||
* This enumeration is used to indicate the location where the debug
|
||||
* trap handler and debug trap buffer are set in the device trap buffer.
|
||||
*/
|
||||
struct PacketAmdInfo
|
||||
enum DebugTrapLocation
|
||||
{
|
||||
uint32_t trapReservedVgprIndex; //!< reserved VGPR index, -1 when they are not valid
|
||||
uint32_t scratchBufferWaveOffset; //!< scratch buffer wave offset, -1 when no scratch buffer
|
||||
void *pointerToIsaBuffer; //!< pointer to the buffer containing ISA
|
||||
size_t sizeOfIsaBuffer; //!< size of the ISA buffer
|
||||
uint32_t numberOfVgprs; //!< number of VGPRs used by the kernel
|
||||
uint32_t numberOfSgprs; //!< number of SGPRs used by the kernel
|
||||
size_t sizeOfStaticGroupMemory; //!< Static local memory used by the kernel
|
||||
kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0
|
||||
kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1
|
||||
kDebugTrapLocationMax = 2
|
||||
};
|
||||
|
||||
//! Cache mask for invalidation
|
||||
struct HwDbgGpuCacheMask
|
||||
|
||||
/*! \brief This structure is for the debug info in each kernel dispatch.
|
||||
*
|
||||
* Contains the memory descriptor information of the scratch memory and the global
|
||||
* memory
|
||||
*/
|
||||
struct DispatchDebugInfo
|
||||
{
|
||||
union {
|
||||
struct {
|
||||
uint32_t sqICache : 1; //!< Instruction cache
|
||||
uint32_t sqKCache : 1; //!< Data cache
|
||||
uint32_t tcL1 : 1; //!< tcL1 cache
|
||||
uint32_t tcL2 : 1; //!< tcL2 cache
|
||||
uint32_t reserved : 28;
|
||||
};
|
||||
uint32_t ui32All;
|
||||
};
|
||||
uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor
|
||||
uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor
|
||||
};
|
||||
|
||||
/*! \brief Trap handler descriptor
|
||||
*
|
||||
* The trap handler descriptor contains the details of a given trap handler.
|
||||
*/
|
||||
struct TrapHandlerInfo {
|
||||
amd::Memory* trapHandler_; //!< Device memory for the trap handler
|
||||
amd::Memory* trapBuffer_; //!< Device memory for the trap buffer
|
||||
};
|
||||
|
||||
/*! \brief Structure of the runtime trap handler buffer, which includes the following
|
||||
* information: information of the runtime trap handler and buffer, information of
|
||||
* the level-2 trap handlers and buffers.
|
||||
*/
|
||||
struct RuntimeTrapInfo {
|
||||
TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers
|
||||
uint32_t dispatchId_; //!< Dispatch ID that signals the shader event
|
||||
uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize];
|
||||
//!< Buffer to backup the VGPR used by the runtime trap handler
|
||||
};
|
||||
|
||||
|
||||
@@ -48,10 +73,16 @@ struct HwDbgGpuCacheMask
|
||||
/**
|
||||
* Opaque pointer to trap event
|
||||
*/
|
||||
typedef uint64_t DebugEvent; //! opaque pointer to trap event
|
||||
typedef uintptr_t DebugEvent;
|
||||
|
||||
namespace amd {
|
||||
|
||||
|
||||
class Context;
|
||||
class Device;
|
||||
class HostQueue;
|
||||
|
||||
|
||||
/*! \class HwDebugManager
|
||||
*
|
||||
* \brief The device interface class for the hardware debug manager
|
||||
@@ -61,32 +92,73 @@ class HwDebugManager
|
||||
public:
|
||||
|
||||
//! Constructor for the Hardware Debug Manager
|
||||
HwDebugManager() : isRegistered_(false), useHwDebug_(false) {}
|
||||
HwDebugManager(amd::Device* device);
|
||||
|
||||
//! Destructor for Hardware Debug Manager
|
||||
~HwDebugManager() {};
|
||||
virtual ~HwDebugManager();
|
||||
|
||||
//! Setup the call back function pointer
|
||||
virtual void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
|
||||
cl_PostDispatchCallBackFunctionAMD postDispatchFn) = 0;
|
||||
void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
|
||||
cl_PostDispatchCallBackFunctionAMD postDispatchFn);
|
||||
|
||||
//! Setup the call back argument pointers
|
||||
virtual void setCallBackArguments(void *preDispatchArgs, void *postDispatchArgs) = 0;
|
||||
void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs);
|
||||
|
||||
//! Flush cache
|
||||
virtual cl_int flushCache(uint32_t mask) = 0;
|
||||
//! Get dispatch debug info
|
||||
void getDispatchDebugInfo(void* debugInfo) const;
|
||||
|
||||
//! Set the kernel code address and its size
|
||||
void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize);
|
||||
|
||||
//! Set the scratch ring address and its size
|
||||
void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize);
|
||||
|
||||
//! Map the shader (AQL code) for host access
|
||||
void mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const;
|
||||
|
||||
//! Map the scratch ring for host access
|
||||
void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const;
|
||||
|
||||
//! Retrieve the pre-dispatch callback function
|
||||
cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const
|
||||
{ return preDispatchCallBackFunc_; }
|
||||
|
||||
//! Retrieve the post-dispatch callback function
|
||||
cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const
|
||||
{ return postDispatchCallBackFunc_; }
|
||||
|
||||
//! Retrieve the pre-dispatch callback function arguments
|
||||
void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; }
|
||||
|
||||
//! Retrieve the post-dispatch callback function arguments
|
||||
void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
|
||||
|
||||
//! Set exception policy
|
||||
virtual cl_int setExceptionPolicy(void *policy) = 0;
|
||||
void setExceptionPolicy(void* policy);
|
||||
|
||||
//! Get exception policy
|
||||
virtual cl_int getExceptionPolicy(void *policy) const = 0;
|
||||
void getExceptionPolicy(void* policy) const;
|
||||
|
||||
//! Set the kernel execution mode
|
||||
virtual cl_int setKernelExecutionMode(void *mode) = 0;
|
||||
void setKernelExecutionMode(void* mode);
|
||||
|
||||
//! Get the kernel execution mode
|
||||
virtual cl_int getKernelExecutionMode(void *mode) const = 0;
|
||||
void getKernelExecutionMode(void* mode) const;
|
||||
|
||||
//! Setup the pointer to the aclBinary within the debug manager
|
||||
void setAclBinary(void* aclBinary);
|
||||
|
||||
//! Allocate storage to keep the memory pointers of the kernel parameters
|
||||
void allocParamMemList(uint32_t numParams);
|
||||
|
||||
//! Assign the kernel parameter memory
|
||||
void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem);
|
||||
|
||||
//! Get kernel parameter memory object
|
||||
cl_mem getKernelParamMem(uint32_t paramIdx) const;
|
||||
|
||||
//! Flush cache
|
||||
virtual void flushCache(uint32_t mask) = 0;
|
||||
|
||||
//! Create the debug event
|
||||
virtual DebugEvent createDebugEvent(const bool autoReset) = 0;
|
||||
@@ -95,95 +167,99 @@ public:
|
||||
virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0;
|
||||
|
||||
//! Destroy the debug event
|
||||
virtual cl_int destroyDebugEvent(DebugEvent pEvent) = 0;
|
||||
virtual void destroyDebugEvent(DebugEvent* pEvent) = 0;
|
||||
|
||||
//! Register the debugger
|
||||
virtual cl_int registerDebugger(amd::Context *context, uintptr_t pMessageStorage) = 0;
|
||||
virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0;
|
||||
|
||||
//! Call KMD to register the debugger
|
||||
virtual cl_int registerDebuggerOnQueue(device::VirtualDevice *vDevice) = 0;
|
||||
virtual cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice) = 0;
|
||||
|
||||
//! Unregister the debugger
|
||||
virtual cl_int unregisterDebugger() = 0;
|
||||
virtual void unregisterDebugger() = 0;
|
||||
|
||||
//! Setup the pointer to the aclBinary within the debug manager
|
||||
virtual void setAclBinary(void *aclBinary) = 0;
|
||||
|
||||
//! Send the wavefront control command
|
||||
virtual cl_int wavefrontControl(uint32_t waveAction,
|
||||
virtual void wavefrontControl(uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
uint32_t trapId,
|
||||
void * waveAddr) const = 0;
|
||||
void* waveAddr) const = 0;
|
||||
|
||||
//! Set address watching point
|
||||
virtual cl_int setAddressWatch(uint32_t numWatchPoints,
|
||||
void ** watchAddress,
|
||||
uint64_t * watchMask,
|
||||
uint64_t * watchMode,
|
||||
DebugEvent * event) = 0;
|
||||
virtual void setAddressWatch(uint32_t numWatchPoints,
|
||||
void** watchAddress,
|
||||
uint64_t* watchMask,
|
||||
uint64_t* watchMode,
|
||||
DebugEvent* event) = 0;
|
||||
|
||||
//! Get the packet information for dispatch
|
||||
virtual cl_int getPacketAmdInfo(const void * aqlCodeInfo,
|
||||
void * packetInfo) const = 0;
|
||||
|
||||
//! Get dispatch debug info
|
||||
virtual cl_int getDispatchDebugInfo(void * debugInfo) const = 0;
|
||||
|
||||
//! Map the AQL code for host access
|
||||
virtual cl_int mapKernelCode(uint64_t *aqlCode, uint32_t *aqlCodeSize) const = 0;
|
||||
|
||||
//! Map the scratch ring for host access
|
||||
virtual cl_int mapScratchRing(uint64_t *scratchRingAddr, uint32_t *scratchRingSize) const = 0;
|
||||
virtual void getPacketAmdInfo(const void* aqlCodeInfo,
|
||||
void* packetInfo) const = 0;
|
||||
|
||||
//! Set global memory values
|
||||
virtual cl_int setGlobalMemory(void * memObj,
|
||||
uint32_t offset,
|
||||
void * srcPtr,
|
||||
uint32_t size) = 0;
|
||||
virtual void setGlobalMemory(amd::Memory* memObj,
|
||||
uint32_t offset,
|
||||
void* srcPtr,
|
||||
uint32_t size) = 0;
|
||||
|
||||
//! Set kernel parameter memory object list
|
||||
virtual cl_int setKernelParamMemList(void ** paramMem, uint32_t numParams) = 0;
|
||||
//! Execute the post-dispatch callback function
|
||||
virtual void executePostDispatchCallBack() = 0;
|
||||
|
||||
//! Get kernel parameter memory object
|
||||
virtual uint64_t getKernelParamMem(uint32_t paramIdx) const = 0;
|
||||
//! Execute the pre-dispatch callback function
|
||||
virtual void executePreDispatchCallBack(void* aqlPacket,
|
||||
void* toolInfo) = 0;
|
||||
|
||||
//! Set the kernel code address and its size
|
||||
virtual void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) = 0;
|
||||
//! Return the use of HW DEBUG flag
|
||||
bool isMsgBufferReady() const { return dbgMsgBufferReady_; }
|
||||
|
||||
//! Set the scratch ring address and its size
|
||||
virtual void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) = 0;
|
||||
protected:
|
||||
//! Return the context
|
||||
const amd::Context* context() const { return context_; }
|
||||
|
||||
//! Retrieve the pre-dispatch callback function
|
||||
virtual cl_PreDispatchCallBackFunctionAMD getPreDispatchCallBackFunction() const = 0;
|
||||
|
||||
//! Retrieve the pre-dispatch callback function arguments
|
||||
virtual void * getPreDispatchCallBackArguments() const = 0;
|
||||
|
||||
//! Retrieve the post-dispatch callback function
|
||||
virtual cl_PostDispatchCallBackFunctionAMD getPostDispatchCallBackFunction() const = 0;
|
||||
|
||||
//! Retrieve the post-dispatch callback function arguments
|
||||
virtual void * getPostDispatchCallBackArguments() const = 0;
|
||||
|
||||
//! Set the register flag
|
||||
void setRegisterFlag(bool regFlag) { isRegistered_ = regFlag; }
|
||||
|
||||
//! Set the use of HW DEBUG flag
|
||||
void setUseHwDebugFlag(bool flag) { useHwDebug_ = flag; }
|
||||
//! Get the debug device
|
||||
const amd::Device* device() const { return device_; }
|
||||
|
||||
//! Return the register flag
|
||||
bool isRegistered() const { return isRegistered_; }
|
||||
|
||||
//! Return the use of HW DEBUG flag
|
||||
bool useHwDebug() const { return useHwDebug_; }
|
||||
|
||||
//! Return the device trap handler information
|
||||
const uint64_t* deviceTrapInfo() const { return deviceTrapInfo_; }
|
||||
|
||||
protected:
|
||||
bool isRegistered_; //! flag to indicate the debugger has been registered
|
||||
bool useHwDebug_; //! flag to indicate the HW DEBUG is using
|
||||
|
||||
const amd::Context* context_; ///< context that used to create host queue for the debugger
|
||||
amd::Device* device_; ///< Device to run the debugger
|
||||
|
||||
cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function
|
||||
cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc_; //!< post-dispatch callback function
|
||||
void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments
|
||||
void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments
|
||||
|
||||
DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch
|
||||
uint64_t deviceTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap handlers on the device
|
||||
|
||||
amd::Memory** paramMemory_; //!< list of memory pointers for kernel parameters
|
||||
uint32_t numParams_; //!< number of kernel parameters
|
||||
|
||||
void* aclBinary_; //!< ACL binary
|
||||
|
||||
address aqlCodeAddr_; //!< The mapped AQL code to allow host access
|
||||
uint32_t aqlCodeSize_; //!< The size of the AQL code info
|
||||
|
||||
address scratchRingAddr_; //!< The mapped address of the scratch buffer
|
||||
uint32_t scratchRingSize_; //!< The size of the scratch ring
|
||||
|
||||
bool isRegistered_; //! flag to indicate the debugger has been registered
|
||||
bool dbgMsgBufferReady_; //! flag to indicate the HW DEBUG is using
|
||||
|
||||
cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
|
||||
cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
|
||||
RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
/**@}*/
|
||||
|
||||
/**
|
||||
|
||||
@@ -174,6 +174,8 @@ debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false, \
|
||||
"Forces reporting CL_FP_DENORM bit for single precision") \
|
||||
debug(bool, OCL_FORCE_CPU_SVM, false, \
|
||||
"force svm support for CPU") \
|
||||
debug(bool, GPU_ENABLE_HW_DEBUG, false, \
|
||||
"Enable HW DEBUG for GPU")
|
||||
|
||||
|
||||
|
||||
|
||||
مرجع در شماره جدید
Block a user