P4 to Git Change 1110409 by wchau@wchau_WINDOWS7_OCL on 2015/01/09 15:46:34

ECR #399840 - re-checkin of CL1109955 with the fix of OpenCL sanity check timeout (hw debug flag initialization)

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.cpp#4 edit
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/cl_debugger_amd.h#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#174 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.hpp#238 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugger.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudebugmanager.hpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.cpp#490 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpudevice.hpp#137 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.cpp#275 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpukernel.hpp#106 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuresource.cpp#200 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuscsi.cpp#30 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpusettings.cpp#297 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.cpp#346 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gpuvirtual.hpp#124 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.cpp#69 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/gpu/gslbe/src/rt/GSLContext.h#42 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.cpp#3 add
... //depot/stg/opencl/drivers/opencl/runtime/device/hwdebug.hpp#4 edit
... //depot/stg/opencl/drivers/opencl/runtime/utils/flags.hpp#223 edit


[ROCm/clr commit: 647aba6ed2]
This commit is contained in:
foreman
2015-01-09 15:56:52 -05:00
والد 10be094f98
کامیت dfebcaac2a
19فایلهای تغییر یافته به همراه1288 افزوده شده و 129 حذف شده
@@ -521,6 +521,7 @@ Settings::Settings()
waitCommand_ = AMD_OCL_WAIT_COMMAND;
supportDepthsRGB_ = false;
assumeAliases_ = false;
enableHwDebug_ = false;
}
bool
@@ -63,7 +63,6 @@ class SvmFillMemoryCommand;
class SvmMapMemoryCommand;
class SvmUnmapMemoryCommand;
class HwDebugManager;
class RunHwDbgCommand;
class Device;
struct KernelParameterDescriptor;
struct Coord3D;
@@ -500,7 +499,7 @@ struct Info : public amd::EmbeddedObject
//! List of supported video attributes (profile/format pairs)
cl_video_attrib_amd* videoAttribs_;
cl_uint numVideoAttribs_;
//Encoder
//Encoder
cl_video_attrib_encode_amd* videoEncAttribs_;
cl_uint numVideoEncAttribs_;
#endif //cl_amd_open_video
@@ -574,9 +573,6 @@ struct Info : public amd::EmbeddedObject
//! The maximum size of global scope variables
size_t maxGlobalVariableSize_;
size_t globalVariablePreferredTotalSize_;
//! Enable HW Debug support
cl_bool enableHwDebug_;
};
//! Device settings
@@ -586,7 +582,7 @@ public:
uint64_t extensions_; //!< Supported OCL extensions
union {
struct {
uint partialDispatch_: 1; //!< Enables partial dispatch
uint partialDispatch_: 1; //!< Enables partial dispatch
uint supportRA_: 1; //!< Support RA channel order format
uint largeHostMemAlloc_: 1; //!< Allow large host mem allocations (> maxSingleAlloc)
uint waitCommand_: 1; //!< Enables a wait for every submitted command
@@ -594,7 +590,8 @@ public:
// that replaces generic OS allocation routines
uint supportDepthsRGB_: 1; //!< Support DEPTH and sRGB channel order format
uint assumeAliases_: 1; //!< Assume aliases in the compilation process
uint reserved_: 25;
uint enableHwDebug_: 1; //!< Enable HW debug support
uint reserved_: 24;
};
uint value_;
};
@@ -776,8 +773,8 @@ protected:
volatile size_t version_; //!< The version we're currently shadowing
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
//! not a physical map. When a memory object does not use USE_HOST_PTR we
//! NB, the map data below is for an API-level map (from clEnqueueMapBuffer),
//! not a physical map. When a memory object does not use USE_HOST_PTR we
//! can use a remote resource and DMA, avoiding the additional CPU memcpy.
amd::Memory* mapMemory_; //!< Memory used as map target buffer
volatile size_t indirectMapCount_; //!< Number of maps
@@ -898,7 +895,7 @@ public:
workGroupInfo_.compileSize_[1] = y;
workGroupInfo_.compileSize_[2] = z;
}
//! Returns the stored compile-time work-group size for dimension \a dim.
//! \a dim indexes workGroupInfo_.compileSize_ directly; no range check is done here.
size_t getReqdWorkGroupSize(int dim) {
return workGroupInfo_.compileSize_[dim];
}
@@ -1139,11 +1136,11 @@ public:
never called in storing routines */
bool setBinary(char* theBinary, size_t theBinarySize, bool allocated=false);
//! setin elfIn_
//! setin elfIn_
bool setElfIn(unsigned char eclass);
void resetElfIn();
//! set out elf
//! set out elf
bool setElfOut(unsigned char eclass, const char* outFile);
void resetElfOut();
@@ -1232,7 +1229,7 @@ public:
// Return the encrypt code for this input binary ( "> 0" means encrypted)
int getEncryptCode() { return encryptCode_; }
// Returns TRUE if binary file is SPIR
bool isSPIR() const;
protected:
@@ -1413,9 +1410,6 @@ public:
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd) = 0;
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd) = 0;
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd) = 0;
#if 0 // exclude this until more HW DEBUG codes are submitted
virtual void submitHwDbgCommand(amd::RunHwDbgCommand& cmd) = 0;
#endif
//! Get the blit manager object
device::BlitManager& blitMgr() const { return *blitMgr_; }
@@ -1698,6 +1692,9 @@ public:
//! Initialize the Hardware Debug Manager
virtual cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage) { return CL_SUCCESS; }
//! Remove the Hardware Debug Manager
virtual void hwDebugManagerRemove() {}
protected:
//! Enable the specified extension
char* getExtensionString();
@@ -0,0 +1,127 @@
/*******************************************************************************
*
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
*
* All rights reserved. This notice is intended as a precaution against
* inadvertent publication and does not imply publication or any waiver
* of confidentiality. The year included in the foregoing notice is the
* year of creation of the work.
*
******************************************************************************/
#ifndef HWDBG_GPUDEBGGER_H_
#define HWDBG_GPUDEBGGER_H_
#include <cstddef>
#include <cstdint>
#include "hsa.h"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include "device/device.hpp"
#include "device/hwdebug.hpp"
//! Number of VGPRs subtracted from the kernel's VGPR count to obtain the
//! trap reserved VGPR index (see GpuDebugManager::getPacketAmdInfo)
static const int NumberReserveVgprs = 4;
namespace gpu {
/**
* \defgroup Services_API OCL Runtime Services API
* @{
*/
/*! \brief Dispatch packet information
*
* This structure contains the packet information for kernel dispatch.
* It is filled in by GpuDebugManager::getPacketAmdInfo() from the
* amd_kernel_code_t header of the kernel.
*
* Field order and types are left untouched: the structure is handed out
* through a void* interface, so its layout may be relied on elsewhere.
*/
struct PacketAmdInfo
{
uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 (i.e. 0xFFFFFFFF in this unsigned field) when they are not valid
uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 (i.e. 0xFFFFFFFF) when no scratch buffer
void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA
size_t sizeOfIsaBuffer_; //!< size of the ISA buffer
uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel
uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel
size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel
};
/*! \brief Cache mask for invalidation
*
* A 32-bit mask that can be accessed either as a whole (ui32All_) or as
* individual one-bit flags, one per cache. The four flags are the values
* passed to InvalidateSqCaches() by VirtualGPU::flushCuCaches().
*/
struct HwDbgGpuCacheMask
{
//! Default: no cache selected
HwDbgGpuCacheMask() :ui32All_(0) {}
//! Build the mask from a raw 32-bit value
HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
union {
struct {
uint32_t sqICache_ : 1; //!< Instruction cache
uint32_t sqKCache_ : 1; //!< Data cache
uint32_t tcL1_ : 1; //!< tcL1 cache
uint32_t tcL2_ : 1; //!< tcL2 cache
uint32_t reserved_ : 28; //!< Unused bits (pads the flags to 32 bits)
};
uint32_t ui32All_; //!< All flags as one 32-bit value
};
};
/*! \brief Address watch information
*
* Information about each watch point - address, mask, mode and event.
* An array of these entries is built by GpuDebugManager::setAddressWatch().
*/
struct HwDbgAddressWatch
{
void* watchAddress_; //!< The address of watch point
uint64_t watchMask_; //!< The mask for watch point (lower 24 bits)
cl_dbg_address_watch_mode_amd watchMode_; //!< The watch mode for this watch
DebugEvent event_; //!< Event of the watch point (not used for now)
};
/*! \brief Runtime structure used to communicate debug information
* between Ocl services and core for a kernel dispatch.
*
* The dispatch path fills in the scratch/global addresses before the
* pre-dispatch callback; GpuDebugManager::setupTrapInformation() then
* writes every field with the settings chosen by the debugger/profiler.
*/
struct DebugToolInfo
{
uint64_t scratchAddress_; //!< Scratch memory address
size_t scratchSize_; //!< Scratch memory size
uint64_t globalAddress_; //!< Global memory address
uint32_t cacheDisableMask_; //!< Cache mask, indicating caches disabled
uint32_t exceptionMask_; //!< Exception mask
uint32_t reservedCuNum_; //!< Number of reserved CUs for display,
//!< which ranges from 0 to 7 in the current implementation.
bool monitorMode_; //!< Debug or profiler mode
bool gpuSingleStepMode_; //!< SQ debug mode
amd::Memory* trapHandler_; //!< Memory object of the trap handler
amd::Memory* trapBuffer_; //!< Memory object of the trap buffer
bool sqPerfcounterEnable_; //!< whether SQ perf counters are enabled
};
/*! \brief Message used by the KFD wave control for CI
*
* Structure indicates the various information used by the wave control function.
* The bit-fields below occupy 16 bits in total (4+4+2+4+1+1); their order
* and widths must not be changed.
*/
struct HwDebugWaveAddr
{
uint32_t VMID_ : 4; //!< Virtual memory id
uint32_t wave_ : 4; //!< Wave id
uint32_t SIMD_ : 2; //!< SIMD id
uint32_t CU_ : 4; //!< Compute unit
uint32_t SH_ : 1; //!< Shader array
uint32_t SE_ : 1; //!< Shader engine
};
/*! \brief Kernel code information
*
* This structure contains the pointer of mapped kernel code for host access
* and its size (in bytes). It is the input of
* GpuDebugManager::getPacketAmdInfo().
*/
struct AqlCodeInfo
{
amd_kernel_code_t * aqlCode_; //!< pointer of AQL code to allow host access
uint32_t aqlCodeSize_; //!< size of AQL code
};
/**@}*/
} // namespace gpu
#endif // HWDBG_GPUDEBGGER_H_
@@ -0,0 +1,361 @@
/*******************************************************************************
*
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
*
* All rights reserved. This notice is intended as a precaution against
* inadvertent publication and does not imply publication or any waiver
* of confidentiality. The year included in the foregoing notice is the
* year of creation of the work.
*
******************************************************************************/
#include "gpudebugmanager.hpp"
#include "gpudevice.hpp"
#include "platform/commandqueue.hpp"
#include "device/device.hpp"
#include "device/gpu/gpumemory.hpp"
#include <iostream>
#include <sstream>
#include <fstream>
namespace gpu {
class VirtualGPU;
class Device;
class Memory;
/*
***************************************************************************
* Implementation of GPU Debug Manager class
***************************************************************************
*/
//! Creates an unregistered debug manager for \a device: no virtual GPU,
//! no message storage, no address watches and no AQL packet are attached yet.
GpuDebugManager::GpuDebugManager(amd::Device* device)
    : HwDebugManager(device)
    , vGpu_(NULL)
    , debugMessages_(0)
    , addressWatch_(NULL)
    , addressWatchSize_(0)
    , oclEventHandle_(NULL)
    , aqlPacket_(NULL)
{
    // No trap handler/buffer is known until the tool installs one
    rtTrapHandlerInfo_.trap_.trapBuffer_  = NULL;
    rtTrapHandlerInfo_.trap_.trapHandler_ = NULL;

    // Kernel execution mode: every mode bit cleared
    execMode_.ui32All = 0;

    // Default exception policy: nothing masked, waves resume, host ignores
    excpPolicy_.waveMode      = CL_DBG_WAVEMODE_BROADCAST;
    excpPolicy_.hostAction    = CL_DBG_HOST_IGNORE;
    excpPolicy_.waveAction    = CL_DBG_WAVES_RESUME;
    excpPolicy_.exceptionMask = 0x0;
}
//! Releases the address watch storage owned by the manager.
GpuDebugManager::~GpuDebugManager()
{
    // delete [] on a null pointer is a no-op, so no explicit test is needed
    delete [] addressWatch_;
}
void
GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
void* toolInfo)
{
DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
// Only if the pre-dispatch callback is set, will we update cache
// flush configuration and build the memory descriptor.
if (NULL != preDispatchCallBackFunc_) {
// Build the scratch memory descriptor
device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
info->scratchAddress_,
info->scratchSize_);
// Build the global memory descriptor
device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
info->globalAddress_);
// // for invalidate cache (BuildEndOfKernelNotifyCommands)
// aqlPacket->release_fence_scope = 2;
cl_device_id clDeviceId = as_cl(device_);
preDispatchCallBackFunc_(clDeviceId,
oclEventHandle_,
aqlPacket_,
aclBinary_,
deviceTrapInfo_,
preDispatchCallBackArgs_);
}
// Copy the various info set by the debugger/profiler to the tool info structure
setupTrapInformation(info);
}
void
GpuDebugManager::executePostDispatchCallBack()
{
if (NULL != postDispatchCallBackFunc_) {
cl_device_id clDeviceId = as_cl(device_);
postDispatchCallBackFunc_(clDeviceId,
aqlPacket_->completion_signal.handle,
postDispatchCallBackArgs_);
}
}
//! Records the debugger's message storage and context.
//!
//! \param context         Context the debugger is attached to.
//! \param messageStorage  Storage later handed to the virtual GPU by
//!                        registerDebuggerOnQueue().
//! \return CL_SUCCESS, or CL_DEBUGGER_REGISTER_FAILURE_AMD when the device
//!         settings do not enable HW debug.
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
    //! @todo: obtain the global mutex of HW debug to make sure only one debugger process exist

    const bool hwDebugEnabled = device()->settings().enableHwDebug_;
    if (!hwDebugEnabled) {
        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    const bool firstRegistration = !isRegistered();
    if (firstRegistration) {
        // The registration flag stays cleared here; it is raised by
        // registerDebuggerOnQueue() once a queue has accepted the debugger.
        isRegistered_      = false;
        dbgMsgBufferReady_ = true;
        debugMessages_     = messageStorage;
    }

    context_ = context;
    return CL_SUCCESS;
}
//! Clears the registration state; does nothing when no debugger is registered.
void
GpuDebugManager::unregisterDebugger()
{
    if (!isRegistered()) {
        return;
    }

    //! @todo: release the global mutex of HW debug

    // Drop the context and mark both the message buffer and the
    // registration as no longer valid
    context_           = NULL;
    dbgMsgBufferReady_ = false;
    isRegistered_      = false;
}
//! Registers the debugger with the given virtual device (queue).
//!
//! \param vDevice  The virtual device; it is downcast to gpu::VirtualGPU.
//! \return CL_SUCCESS when the debugger is (or already was) registered,
//!         CL_DEBUGGER_REGISTER_FAILURE_AMD when the message buffer has not
//!         been set up, \a vDevice is NULL, or the virtual GPU rejects the
//!         registration.
cl_int
GpuDebugManager::registerDebuggerOnQueue(device::VirtualDevice* vDevice)
{
    if (!isMsgBufferReady()) {
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    if (isRegistered()) {       // The debugger has already been registered,
        return CL_SUCCESS;      // nothing to be done
    }

    if (NULL == vDevice) {
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    // A base-to-derived conversion must use static_cast so that any pointer
    // adjustment between the base and derived sub-objects is applied;
    // reinterpret_cast would reuse the address unchanged.
    VirtualGPU* vGpu = static_cast<gpu::VirtualGPU*>(vDevice);

    // populate the fields in the debugMessages structure used by the GPU exception notification
    if (vGpu->RegisterHwDebugger(debugMessages_)) {
        vGpu_ = vGpu;
        isRegistered_ = true;
        return CL_SUCCESS;
    }

    return CL_DEBUGGER_REGISTER_FAILURE_AMD;
}
void
GpuDebugManager::flushCache(uint32_t mask)
{
HwDbgGpuCacheMask cacheMask(mask);
device()->xferQueue()->flushCuCaches(cacheMask);
}
//! Fills \a toolInfo with the exception policy, execution mode and trap
//! handler/buffer currently stored in the manager. The scratch/global
//! address fields and the SQ perf counter flag are reset.
void
GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
{
    // Fields that are not carried over to the dispatch
    toolInfo->sqPerfcounterEnable_ = false;
    toolInfo->globalAddress_  = 0;
    toolInfo->scratchSize_    = 0;
    toolInfo->scratchAddress_ = 0;

    // Exception policy and execution mode requested by the tool
    toolInfo->exceptionMask_     = excpPolicy_.exceptionMask;
    toolInfo->monitorMode_       = execMode_.monitorMode;
    toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
    toolInfo->reservedCuNum_     = execMode_.reservedCuNum;

    // Bit 2: L1 scalar, bit 1: L2, bit 0: L1 vector. The order is determined
    // by the definition of the register COMPUTE_DISPATCH_INITIATOR.
    toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
                                 | (execMode_.disableL2Cache  << 1)
                                 | (execMode_.disableL1Vector));

    // Trap handler and trap buffer memory objects registered for the device
    toolInfo->trapBuffer_ =
        as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapBufferLocation]));
    toolInfo->trapHandler_ =
        as_amd(reinterpret_cast<cl_mem>(deviceTrapInfo_[kDebugTrapHandlerLocation]));
}
void
GpuDebugManager::getPacketAmdInfo(
const void* aqlCodeInfo,
void* packetInfo) const
{
const AqlCodeInfo* codeInfo =
reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
PacketAmdInfo* packet =
reinterpret_cast<PacketAmdInfo*>(packetInfo);
const amd_kernel_code_t* akc = hostAqlCode;
packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
packet->numberOfVgprs_ = akc->workitem_vgpr_count;
// use mapped kernel_object_address for host accessing of ISA buffer
packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
akc->kernel_code_entry_byte_offset;
packet->scratchBufferWaveOffset_ =
akc->debug_wavefront_private_segment_offset_sgpr;
packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
// The trap_reserved_vgpr_index will be 4 less the original
// This value must be used only by the debugger
packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
}
//! Creates an OS event and attaches it to the virtual GPU as the exception
//! notification event.
//!
//! \param autoReset  The event is created with the inverse of this flag
//!                   passed to osEventCreate().
//! \return The new event, or 0 when the debugger is not registered, the
//!         event could not be created, or the virtual GPU refused it. On
//!         failure the OS event is destroyed rather than leaked.
DebugEvent
GpuDebugManager::createDebugEvent(
    const bool  autoReset)
{
    if (!isRegistered() || (NULL == vGpu_)) {
        LogError("debugmanager: Failed to create debug event - hw debug is not available");
        return 0;
    }

    // create the event object
    osEventHandle shaderEvent = osEventCreate(!autoReset);
    if (shaderEvent == 0) {
        return 0;
    }

    osEventReset(shaderEvent);    // initial state is non-signaled

    if (!vGpu_->ExceptionNotification(shaderEvent)) {
        // Nobody else knows about the event yet, so release it here
        osEventDestroy(shaderEvent);
        return 0;
    }

    isRegistered_ = true;
    return shaderEvent;
}
//! Waits on \a pEvent for at most \a timeOut (passed unchanged to
//! osEventTimedWait()).
//! \return CL_SUCCESS when the wait succeeded, CL_EVENT_TIMEOUT_AMD otherwise.
cl_int
GpuDebugManager::waitDebugEvent(
    DebugEvent  pEvent,
    uint32_t    timeOut) const
{
    return osEventTimedWait(pEvent, timeOut) ? CL_SUCCESS : CL_EVENT_TIMEOUT_AMD;
}
//! Detaches the exception notification event from the virtual GPU and
//! destroys it.
//!
//! \param pEvent  In/out: the event to destroy; set to 0 on return.
//!                A NULL pointer is ignored.
void
GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
{
    if (NULL == pEvent) {
        return;
    }

    // Stop the GPU from notifying through the event before the event goes
    // away, so that a destroyed handle is never left registered.
    if (NULL != vGpu_) {
        vGpu_->ExceptionNotification(0);
    }

    if (0 != *pEvent) {
        osEventDestroy(*pEvent);
        *pEvent = 0;
    }
}
void
GpuDebugManager::wavefrontControl(
uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
void* waveAddr) const
{
device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
}
//! Programs the address watch points.
//!
//! \param numWatchPoints  Number of entries in each of the arrays below.
//! \param watchAddress    Per watch point: a cl_mem handle (passed as void*)
//!                        whose GPU virtual address is watched.
//! \param watchMask       Per watch point: the address mask.
//! \param watchMode       Per watch point: a cl_dbg_address_watch_mode_amd
//!                        value.
//! \param event           Per watch point: the event; may be NULL, in which
//!                        case every entry gets event 0.
//!
//! The internal storage grows on demand and is reused by later calls.
void
GpuDebugManager::setAddressWatch(
    uint32_t        numWatchPoints,
    void**          watchAddress,
    uint64_t*       watchMask,
    uint64_t*       watchMode,
    DebugEvent*     event)
{
    size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);

    // previously allocated size is not big enough, allocate new memory
    if (addressWatchSize_ < requiredSize) {
        // Reset the bookkeeping before allocating: should the allocation
        // throw, the manager must not keep a dangling pointer and stale size
        // (the destructor would otherwise delete the array a second time).
        delete [] addressWatch_;
        addressWatch_ = NULL;
        addressWatchSize_ = 0;

        addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
        addressWatchSize_ = requiredSize;
    }

    // fill in the address watch structure; the storage does not exist yet
    // when the very first call asks for zero watch points
    if (NULL != addressWatch_) {
        memset(addressWatch_, 0, addressWatchSize_);
    }

    for (uint32_t i = 0; i < numWatchPoints; i++) {
        amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
        Memory* watchMemAddress = device()->getGpuMemory(watchMem);

        addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
        addressWatch_[i].watchMask_ = watchMask[i];
        addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
        addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
    }

    // setup the watch addresses
    device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
}
//! Copies \a size bytes from \a srcPtr into the memory object at \a offset.
//!
//! \param memObj  Memory object to write to.
//! \param offset  Byte offset from the start of the mapped memory.
//! \param srcPtr  Host source data.
//! \param size    Number of bytes to copy.
//!
//! The copy is skipped (with an error logged) when the object has no device
//! memory or cannot be mapped; these are checked at run time because an
//! assert would vanish in release builds and leave a NULL dereference.
//! NOTE(review): \a offset + \a size is not validated against the size of
//! the memory object - the caller is presumably expected to guarantee it.
void
GpuDebugManager::setGlobalMemory(
    amd::Memory*    memObj,
    uint32_t        offset,
    void*           srcPtr,
    uint32_t        size)
{
    gpu::Memory* globalMem = device()->getGpuMemory(memObj);
    if (NULL == globalMem) {
        LogError("debugmanager: Failed to set global memory - no device memory object");
        return;
    }

    address mappedMem = static_cast<address>(globalMem->map(NULL,0));
    if (NULL == mappedMem) {
        LogError("debugmanager: Failed to set global memory - cannot map the memory object");
        return;
    }

    void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
    memcpy(dest_ptr, srcPtr, size);

    globalMem->unmap(NULL);
}
} // namespace gpu
@@ -0,0 +1,132 @@
/*******************************************************************************
*
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
*
* All rights reserved. This notice is intended as a precaution against
* inadvertent publication and does not imply publication or any waiver
* of confidentiality. The year included in the foregoing notice is the
* year of creation of the work.
*
******************************************************************************/
#ifndef HWDBG_DEBUGMANAGER_H__
#define HWDBG_DEBUGMANAGER_H__
#include "gpuvirtual.hpp"
#include "gpudebugger.hpp"
namespace gpu {
class GpuDebugManager;
class Device;
class Memory;
/*! \brief Debug Manager Class
*
* The debug manager class is used to pass all the trap info to the
* kernel dispatch and then the kernel execution can use such trap information
* for kernel execution. This class contains the trap handler and shader event
* objects. The trap handler is setup by users and passed to the kernel dispatch.
* The shader event is to receive interrupts from the GPU and then users can
* perform various operations.
*
* This class also provides the interface for setting up the pre-dispatch
* callback functions used by the profiler and debugger. It also provides
* a way to retrieve various debug information for the kernel execution.
*
*/
class GpuDebugManager : public amd::HwDebugManager {
public:
//! Constructor of the debug manager class
GpuDebugManager(amd::Device* device);
//! Destructor of the debug manager class
~GpuDebugManager();
//! Get the single instance of the GpuDebugManager class
static GpuDebugManager* getDefaultInstance();
//! Destroy the GpuDebugManager class object
static void destroyInstances();
//! Flush cache
void flushCache(uint32_t mask);
//! Create the debug event
DebugEvent createDebugEvent(const bool autoReset);
//! Wait for the debug event
cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
//! Destroy the debug event
void destroyDebugEvent(DebugEvent* pEvent);
//! Register the debugger
cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
//! Register the debugger with KMD after command queue has been created
cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice);
//! Unregister the debugger
void unregisterDebugger();
//! Send the wavefront control cmmand
void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
void* waveAddr) const;
//! Set address watching point
void setAddressWatch(uint32_t numWatchPoints,
void** watchAddress,
uint64_t* watchMask,
uint64_t* watchMode,
DebugEvent* pEvent);
//! Get the packet information for dispatch
void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
//! Set global memory values
void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
//! Execute the post-dispatch callback function
void executePostDispatchCallBack();
//! Execute the pre-dispatch callback function
void executePreDispatchCallBack(void* aqlPacket,
void* toolInfo);
private:
//! Setup trap handler info for kernel execution
void setupTrapInformation(DebugToolInfo* toolInfo);
protected:
const VirtualGPU* vGpu() const { return vGpu_; }
private:
const gpu::Device* device() const {
return reinterpret_cast<const gpu::Device *>(device_); }
VirtualGPU* vGpu_; //!< the virtual GPU
uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
HwDbgAddressWatch* addressWatch_; //!< Address watch data
size_t addressWatchSize_; //!< Size of address watch data
//! Arguments used by the callback function
void* oclEventHandle_; //!< event handler
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
};
} // namespace gpu
#endif // HWDBG_DEBUGMANAGER_H__
@@ -38,6 +38,8 @@
#include <iostream>
#include <ctype.h>
#include "gpudebugmanager.hpp"
bool DeviceLoad()
{
bool ret = false;
@@ -890,6 +892,7 @@ Device::create(CALuint ordinal, CALuint numOfDevices)
}
}
#ifdef DEBUG
std::stringstream message;
if (settings().remoteAlloc_) {
@@ -1225,7 +1228,7 @@ Device::init()
{
CALuint numDevices = 0;
bool result = false;
bool useDeviceList = false;
bool useDeviceList = false;
requestedDevices_t requestedDevices;
const char *library = getenv("COMPILER_LIBRARY");
@@ -2662,4 +2665,27 @@ Device::SrdManager::fillResourceList(std::vector<const Resource*>& memList)
}
}
//! Creates the HW debug manager (if the device has none yet) and registers
//! the debugger with it.
//!
//! \param context         Context the debugger is attached to.
//! \param messageStorage  Debugger message storage, forwarded to
//!                        registerDebugger().
//! \return The status of registerDebugger(); on failure the manager is
//!         destroyed and the device is left without one.
cl_int
Device::hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage)
{
    // Reuse an existing manager: allocating a new one unconditionally would
    // overwrite the pointer and leak the previous object on repeated calls.
    if (NULL == hwDebugMgr_) {
        hwDebugMgr_ = new GpuDebugManager(this);
    }

    cl_int status = hwDebugMgr_->registerDebugger(context, messageStorage);

    if (CL_SUCCESS != status) {
        delete hwDebugMgr_;
        hwDebugMgr_ = NULL;
    }

    return status;
}
//! Unregisters the debugger and destroys the HW debug manager.
//! Safe to call when no manager exists (e.g. after a failed
//! hwDebugManagerInit() or a second removal).
void
Device::hwDebugManagerRemove()
{
    if (NULL == hwDebugMgr_) {
        return;
    }

    hwDebugMgr_->unregisterDebugger();
    delete hwDebugMgr_;
    hwDebugMgr_ = NULL;
}
} // namespace gpu
@@ -560,6 +560,12 @@ public:
//! Returns SRD manager object
SrdManager& srds() const { return *srdManager_; }
//! Initialize the Hardware Debug Manager
cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
//! Remove the Hardware Debug Manager
void hwDebugManagerRemove();
private:
//! Disable copy constructor
Device(const Device&);
@@ -3510,6 +3510,7 @@ HSAILKernel::HSAILKernel(std::string name,
, prog_(*prog)
, index_(0)
, code_(NULL)
, codeSize_(0)
, hwMetaData_(NULL)
{
hsa_ = true;
@@ -3924,6 +3925,11 @@ HSAILKernel::loadArguments(
mem->signalWrite(&dev());
}
memList.push_back(gpuMem);
// save the memory object pointer to allow global memory access
if (NULL != dev().hwDebugMgr()) {
dev().hwDebugMgr()->assignKernelParamMem(i, gpuMem->owner());
}
}
// If it is a local pointer
else {
@@ -862,7 +862,10 @@ public:
const void* cpuAqlCode() const { return cpuAqlCode_; }
//! Returns memory object with AQL code
const gpu::Memory* gpuAqlCode() const { return code_; }
gpu::Memory* gpuAqlCode() const { return code_; }
//! Returns size of AQL code
size_t aqlCodeSize() const { return codeSize_; }
//! Returns the size of argument buffer
size_t argsBufferSize() const
@@ -883,7 +886,7 @@ public:
amd::NDRange& lclWorkSize //!< Local work size
) const;
//! Returns AQL packet in CPU memory
//! Returns AQL packet in CPU memory
//! if the kernel arguments were successfully loaded, otherwise NULL
hsa_kernel_dispatch_packet_t* loadArguments(
VirtualGPU& gpu, //!< Running GPU context
@@ -939,6 +942,8 @@ private:
uint index_; //!< Kernel index in the program
gpu::Memory* code_; //!< Memory object with ISA code
size_t codeSize_; //!< Size of ISA code
char* hwMetaData_; //!< SI metadata
union Flags {
@@ -363,7 +363,8 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
elementSize_ = static_cast<CALuint>(memoryFormatSize(cal()->format_).size_);
cal_.type_ = memType;
if (memType == Scratch) {
cal_.type_ = Local;
// use local memory for scratch buffer unless it is using HW DEBUG
cal_.type_ = (!dev().settings().enableHwDebug_) ? Local : RemoteUSWC;
cal_.scratch_ = true;
}
@@ -463,7 +464,7 @@ Resource::create(MemoryType memType, CreateParams* params, bool heap)
else if ((gslRef_ != NULL) && (!dev().settings().use64BitPtr_)) {
// Make sure runtime didn't pick a resource with > 4GB address
if ((cal()->dimension_ == GSL_MOA_BUFFER) &&
(static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() +
(static_cast<uint64_t>(gslRef_->gslResource()->getSurfaceAddress() +
gslRef_->gslResource()->getSurfaceSize()) > (uint64_t(4) * Gi))) {
gslRef_->release();
gslRef_ = NULL;
@@ -172,13 +172,17 @@ HSAILKernel::aqlCreateHWInfo(const void* shader, size_t shaderSize)
address codeStartAddress = reinterpret_cast<address>(akc);
address codeEndAddress = reinterpret_cast<address>(hcd) + siMetaData->common.codeLenInByte;
uint64_t codeSize = codeEndAddress - codeStartAddress;
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize, gpu::ConstBuffer::VectorSize));
codeSize_ = codeEndAddress - codeStartAddress;
code_ = new gpu::Memory(dev(), amd::alignUp(codeSize_, gpu::ConstBuffer::VectorSize));
// force to use remote memory for HW DEBUG
Resource::MemoryType resMemType = (!dev().settings().enableHwDebug_) ? Resource::Local : Resource::RemoteUSWC;
// Initialize kernel ISA code
if ((code_ != NULL) && code_->create(Resource::Local)) {
if ((code_ != NULL) && code_->create(resMemType)) {
address cpuCodePtr = static_cast<address>(code_->map(NULL, Resource::WriteOnly));
// Copy only amd_kernel_code_t
memcpy(cpuCodePtr, codeStartAddress, codeSize);
memcpy(cpuCodePtr, codeStartAddress, codeSize_);
code_->unmap(NULL);
}
else {
@@ -134,6 +134,7 @@ Settings::Settings()
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
}
bool
@@ -311,7 +312,7 @@ Settings::create(
calAttr.isWorkstation || hsail_) : GPU_FORCE_64BIT_PTR;
}
else {
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|| (oclVersion_ >= OpenCL20)))) {
use64BitPtr_ = true;
}
@@ -440,6 +441,11 @@ Settings::create(
if (oclVersion_ >= OpenCL20) {
enableExtension(ClKhrSubGroups);
enableExtension(ClKhrDepthImages);
// Enable HW debug
if (GPU_ENABLE_HW_DEBUG) {
enableHwDebug_ = true;
}
}
if (apuSystem_ &&
@@ -14,6 +14,7 @@
#include "device/gpu/gputhreadtrace.hpp"
#include "device/gpu/gputimestamp.hpp"
#include "device/gpu/gpublit.hpp"
#include "device/gpu/gpudebugger.hpp"
#include "hsa.h"
#include "sc-hsa/Interface/SCHSAInterface.h"
#include <fstream>
@@ -402,6 +403,7 @@ VirtualGPU::VirtualGPU(
, schedParamIdx_(0)
, deviceQueueSize_(0)
, hsaQueueMem_(NULL)
, useHwDebug_(false)
{
memset(&cal_, 0, sizeof(CalVirtualDesc));
for (uint i = 0; i < AllEngines; ++i) {
@@ -585,6 +587,14 @@ VirtualGPU::create(
return false;
}
// Check if HW Debug is used and register the debugger if not done yet
amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
if ( dbgManager && dbgManager->isMsgBufferReady() ) {
if ( dbgManager->registerDebuggerOnQueue(this) == CL_SUCCESS ) {
useHwDebug_ = true;
}
}
return true;
}
@@ -1720,6 +1730,12 @@ VirtualGPU::submitKernelInternalHSA(
hsaKernel.prog().kernelTable()->vmAddress());
}
// setup the storage for the memory pointers of the kernel parameters
uint numParams = kernel.signature().numParameters();
if (useHwDebug_) {
dev().hwDebugMgr()->allocParamMemList(numParams);
}
// Program the kernel arguments for the GPU execution
hsa_kernel_dispatch_packet_t* aqlPkt =
hsaKernel.loadArguments(*this, kernel, sizes, parameters, nativeMem,
@@ -1745,10 +1761,25 @@ VirtualGPU::submitKernelInternalHSA(
addVmMemory(memList[i]);
}
// HW Debug for the kernel?
HwDbgKernelInfo kernelInfo;
HwDbgKernelInfo *pKernelInfo = NULL;
if (useHwDebug_) {
buildKernelInfo(hsaKernel, aqlPkt, kernelInfo);
pKernelInfo = &kernelInfo;
}
GpuEvent gpuEvent;
// Run AQL dispatch in HW
runAqlDispatch(gpuEvent, aqlPkt, vmMems(), cal_.memCount_,
scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress());
scratch, scratchOffset, hsaKernel.cpuAqlCode(), hsaQueueMem_->vmAddress(), pKernelInfo);
if (useHwDebug_) {
if (NULL != dev().hwDebugMgr()->postDispatchCallBackFunc()) {
dev().hwDebugMgr()->executePostDispatchCallBack();
}
}
if (hsaKernel.dynamicParallelism()) {
// Make sure exclusive access to the device queue
@@ -3410,4 +3441,155 @@ VirtualGPU::writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable)
virtualQueue_->writeRawData(hostQ, sizeof(AmdVQueueHeader), vqHeader_, !Wait);
}
//! Invalidates the caches selected in \a cache_mask and then flushes the
//! DMA of this queue's engine.
void
VirtualGPU::flushCuCaches(HwDbgGpuCacheMask cache_mask)
{
    //! @todo:  fix issue of no event available for the flush/invalidate cache command

    // One flag per cache: instruction, data, TC L1 and TC L2
    InvalidateSqCaches(cache_mask.sqICache_, cache_mask.sqKCache_,
                       cache_mask.tcL1_,     cache_mask.tcL2_);

    flushDMA(engineID_);
}
//! Collects the HW debug information for one kernel dispatch.
//!
//! \param hsaKernel   Kernel being dispatched.
//! \param aqlPkt      AQL dispatch packet of the kernel.
//! \param kernelInfo  Out: scratch/heap addresses, trap handler, exception
//!                    mask, cache disable mask, SQ debug mode and CU masks.
//!
//! The debug manager is told where the scratch ring and the kernel code can
//! be accessed from the host. When a pre-dispatch callback is installed it
//! is executed and its settings are copied into \a kernelInfo.
//! NOTE(review): the host pointers given to setScratchRing() and
//! setKernelCodeInfo() come from map() calls that are unmapped right away -
//! confirm that they stay valid for the memory types used with HW debug.
void
VirtualGPU::buildKernelInfo(const HSAILKernel& hsaKernel,
                            hsa_kernel_dispatch_packet_t* aqlPkt,
                            HwDbgKernelInfo& kernelInfo)
{
    amd::HwDebugManager * dbgManager = dev().hwDebugMgr();
    assert (dbgManager && "No HW Debug Manager!");

    // Initialize structure with default values

    if (hsaKernel.prog().maxScratchRegs() > 0) {
        gpu::Memory* scratchBuf = dev().scratch(hwRing())->memObjs_[0];
        kernelInfo.scratchBufAddr = scratchBuf->vmAddress();
        kernelInfo.scratchBufferSizeInBytes = scratchBuf->size();

        // Get the address of the scratch buffer and its size for CPU access
        address scratchRingAddr = static_cast<address>(scratchBuf->map(NULL, 0));
        dbgManager->setScratchRing(scratchRingAddr,scratchBuf->size());
        scratchBuf->unmap(NULL);
    }
    else {
        kernelInfo.scratchBufAddr = 0;
        kernelInfo.scratchBufferSizeInBytes = 0;
        dbgManager->setScratchRing(NULL, 0);
    }

    //! @todo:  need to verify what is wanted for the global memory
    kernelInfo.heapBufAddr = (dev().globalMem()).vmAddress();

    kernelInfo.pAqlDispatchPacket = aqlPkt;
    kernelInfo.pAqlQueuePtr = reinterpret_cast<void*>(hsaQueueMem_->vmAddress());

    // Get the address of the kernel code and its size for CPU access
    gpu::Memory* aqlCode = hsaKernel.gpuAqlCode();
    if (NULL != aqlCode) {
        address aqlCodeAddr = static_cast<address>(aqlCode->map(NULL, 0));
        dbgManager->setKernelCodeInfo(aqlCodeAddr, hsaKernel.aqlCodeSize());
        aqlCode->unmap(NULL);
    }
    else {
        dbgManager->setKernelCodeInfo(NULL, 0);
    }

    // Defaults used when no pre-dispatch callback is installed:
    // no trap handler, no exceptions, all caches enabled, no CU reserved
    kernelInfo.trapPresent = false;
    kernelInfo.trapHandler = NULL;
    kernelInfo.trapHandlerBuffer = NULL;

    kernelInfo.excpEn = 0;
    kernelInfo.cacheDisableMask = 0;
    kernelInfo.sqDebugMode = 0;

    kernelInfo.mgmtSe0Mask = 0xFFFFFFFF;
    kernelInfo.mgmtSe1Mask = 0xFFFFFFFF;

    // set kernel info for HW debug and call the callback function
    if (NULL != dbgManager->preDispatchCallBackFunc()) {
        // Value-initialize so that every field (in particular the trap
        // handler/buffer pointers tested below) has a defined value even if
        // the manager's callback implementation leaves some untouched.
        DebugToolInfo dbgSetting = DebugToolInfo();
        dbgSetting.scratchAddress_ = kernelInfo.scratchBufAddr;
        dbgSetting.scratchSize_ = kernelInfo.scratchBufferSizeInBytes;
        dbgSetting.globalAddress_ = kernelInfo.heapBufAddr;

        // Execute the pre-dispatch call back function & set the trap info
        dbgManager->executePreDispatchCallBack(reinterpret_cast<void*>(aqlPkt), &dbgSetting);

        // assign the TMA and TBA for kernel dispatch
        if (NULL != dbgSetting.trapHandler_ && NULL != dbgSetting.trapBuffer_) {
            assignTrapHandler(dbgSetting, kernelInfo);
        }

        kernelInfo.trapPresent = (kernelInfo.trapHandler) ? true : false;

        // Exception policy
        kernelInfo.excpEn = dbgSetting.exceptionMask_;
        kernelInfo.cacheDisableMask = dbgSetting.cacheDisableMask_;
        kernelInfo.sqDebugMode = dbgSetting.gpuSingleStepMode_;

        // Compute the mask for reserved CUs. These two dwords correspond to
        // two registers used for reserving CUs for display. In the current
        // implementation, the number of CUs reserved can be 0 to 7, and it
        // is set by debugger users. The reserved CUs are split between the
        // two masks: half (rounded down) go to the first, the rest to the second.
        if (dbgSetting.monitorMode_) {
            uint32_t i = dbgSetting.reservedCuNum_ / 2;
            kernelInfo.mgmtSe0Mask <<= i;
            i = dbgSetting.reservedCuNum_ - i;
            kernelInfo.mgmtSe1Mask <<= i;
        }

        // flush/invalidate the instruction, data, L1 and L2 caches
        InvalidateSqCaches();
    }
}
void
VirtualGPU::assignTrapHandler(const DebugToolInfo& dbgSetting,
HwDbgKernelInfo& kernelInfo)
{
Memory * trapHandlerMem = dev().getGpuMemory(dbgSetting.trapHandler_);
Memory * trapBufferMem = dev().getGpuMemory(dbgSetting.trapBuffer_);
addVmMemory(trapHandlerMem);
addVmMemory(trapBufferMem);
// Handle TMA corruption hw bug workaround -
// The trap handler buffer has extra 256 bytes allocated, the TMA address
// is stored in the first two DWORDs and the actual trap handler code
// is stored starting at the location of 256 bytes.
//
// - kernelInfo.trapHandler points directly to the trap handler code
// - kernelInfo.trapHandlerBuffer points directly to the trap buffer (TMA)
//
kernelInfo.trapHandler = reinterpret_cast<void *>(trapHandlerMem->vmAddress() + TbaStartOffset);
kernelInfo.trapHandlerBuffer = reinterpret_cast<void *>(trapBufferMem->vmAddress());
// Address of the trap handler code/buffer should be 256-byte aligned
uint64_t tmaAddress = reinterpret_cast<uint64_t>(kernelInfo.trapHandlerBuffer);
if ((reinterpret_cast<uint64_t>(kernelInfo.trapHandler) & 0xFF) != 0
|| (tmaAddress & 0xFF) != 0) {
assert(false && "Trap handler/buffer is not 256-byte aligned");
}
// map the trap handler buffer address for host access, and store the trap
// buffer address at the beginning of the allocated buffer
address trapHandlerAddress = static_cast<address>(trapHandlerMem->map(NULL,0));
uint32_t * tmaStorage = reinterpret_cast<uint32_t *>(trapHandlerAddress);
tmaStorage[0] = tmaAddress & 0xFFFFFFFF;
tmaStorage[1] = (tmaAddress >> 32) & 0xFFFFFFFF;
trapHandlerMem->unmap(NULL);
}
} // namespace gpu
@@ -12,6 +12,9 @@
#include "device/gpu/gpusched.hpp"
#include "device/blit.hpp"
#include "device/gpu/gpudebugger.hpp"
/*! \addtogroup GPU GPU Resource Implementation
* @{
*/
@@ -28,6 +31,7 @@ class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
class HSAILKernel;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice, public CALGSLContext
@@ -400,6 +404,8 @@ public:
State state_; //!< virtual GPU current state
CalVirtualDesc cal_; //!< CAL virtual device descriptor
void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
protected:
virtual void profileEvent(EngineType engine, bool type) const;
@@ -496,6 +502,17 @@ private:
const amd::BufferRect& dstRect //!< region of destination for copy
);
void buildKernelInfo(
const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
void assignTrapHandler(
const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
GslKernels gslKernels_; //!< GSL kernel descriptors
GslKernelDesc* activeKernelDesc_; //!< active GSL kernel descriptors
GpuEvents gpuEvents_; //!< GPU events
@@ -534,6 +551,8 @@ private:
uint deviceQueueSize_; //!< Device queue size
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
bool useHwDebug_; //!< Flag of using HW debug
};
/*@}*/} // namespace gpu
@@ -440,7 +440,7 @@ CALGSLContext::isDone(GpuEvent* event)
if (m_eventQueue[event->engineId_].isDone(event->id))
{
event->invalidate();
return true;
return true;
}
return false;
}
@@ -1269,10 +1269,10 @@ CALGSLContext::writeTimer(bool sdma, const gslMemObject mem, uint32 offset) cons
void
CALGSLContext::runAqlDispatch(GpuEvent& event, const void* aqlPacket,
const gslMemObject* mems, uint32 numMems, gslMemObject scratch, uint32 scratchOffset,
const void* cpuKernelCode, uint64 hsaQueueVA)
const void* cpuKernelCode, uint64 hsaQueueVA, const void* kernelInfo)
{
eventBegin(MainEngine);
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA);
m_cs->AqlDispatch(aqlPacket, mems, numMems, scratch, scratchOffset, cpuKernelCode, hsaQueueVA, kernelInfo);
eventEnd(MainEngine, event);
}
@@ -1299,3 +1299,30 @@ CALGSLContext::virtualQueueHandshake(GpuEvent& event, const gslMemObject mem, mc
m_cs->VirtualQueueHandshake(mem, parentState, newStateValue, parentChildCounter, signal, dedicatedQueue);
eventEnd(MainEngine, event);
}
//! Forwards the HW debugger registration request, together with
//! \a debugMessages, to the command stream and returns its result.
bool
CALGSLContext::RegisterHwDebugger(uint64 debugMessages)
{
    const bool registered = m_cs->registerHwDebugger(debugMessages);
    return registered;
}
//! Passes the debug event handle \a debugEvent to the command stream's
//! exception notification and returns its result.
bool
CALGSLContext::ExceptionNotification(osEventHandle debugEvent)
{
    const bool notified = m_cs->exceptionNotification(debugEvent);
    return notified;
}
/*! \brief Invalidates the requested SQ caches and optionally flushes the CU caches.
 *
 *  \param instInvalidate Invalidate the instruction cache
 *  \param dataInvalidate Invalidate the data cache
 *  \param tcL1           Flush the CU caches; nothing is flushed when false
 *  \param tcL2           Forwarded to flushCUCaches() when \a tcL1 is set
 */
void
CALGSLContext::InvalidateSqCaches(bool instInvalidate, bool dataInvalidate, bool tcL1, bool tcL2)
{
    // invalidating instruction/data L1 caches using Escape
    const bool sqRequested = instInvalidate || dataInvalidate;
    if (sqRequested) {
        m_cs->invalidateSqCaches(instInvalidate, dataInvalidate);
    }

    if (!tcL1) {
        return;
    }
    flushCUCaches(tcL2);
}
@@ -44,7 +44,8 @@ public:
bool runProgramGrid(GpuEvent& event, const ProgramGrid* pProgramGrid, const gslMemObject* mems, uint32 numMems);
bool runProgramVideoDecode(GpuEvent& event, gslMemObject mo, const CALprogramVideoDecode& decode);
void runAqlDispatch(GpuEvent& event, const void* aqlPacket, const gslMemObject* mems,
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode, uint64 hsaQueueVA);
uint32 numMems, gslMemObject scratch, uint32 scratchOffset, const void* cpuKernelCode,
uint64 hsaQueueVA, const void* kernelInfo);
mcaddr virtualQueueDispatcherStart();
void virtualQueueDispatcherEnd(GpuEvent& event, const gslMemObject* mems, uint32 numMems,
mcaddr signal, mcaddr loopStart, uint32 numTemplates);
@@ -140,6 +141,11 @@ public:
void writeTimer(bool sdma, const gslMemObject mem, uint32 offset) const;
void writeSurfRaw(GpuEvent& event, gslMemObject mem, size_t size, const void* data);
/// HW Debug support functions
bool RegisterHwDebugger(uint64 debugMessages);
bool ExceptionNotification(osEventHandle debugEvent);
void InvalidateSqCaches(bool instInvalidate = true, bool dataInvalidate = true, bool tcL1 = true, bool tcL2 = true);
protected:
void setScratchBuffer(gslMemObject mem, int32 engineId);
virtual void profileEvent(EngineType engine, bool type) const {}
@@ -0,0 +1,175 @@
/*******************************************************************************
*
* Copyright (c) 2014 Advanced Micro Devices, Inc. (unpublished)
*
* All rights reserved. This notice is intended as a precaution against
* inadvertent publication and does not imply publication or any waiver
* of confidentiality. The year included in the foregoing notice is the
* year of creation of the work.
*
******************************************************************************/
#include "hwdebug.hpp"
#include <iostream>
#include <sstream>
#include <fstream>
namespace amd {
class Device;
/*
***************************************************************************
* Implementation of GPU Debug Manager class
***************************************************************************
*/
/*! \brief Constructor of the debug manager class
 *
 *  Every member is brought to a well-defined empty state: no context, no
 *  callbacks, no parameter list, no mapped kernel code or scratch ring, and
 *  the debugger not registered.
 *
 *  \param device Device the debugger runs on
 */
HwDebugManager::HwDebugManager(amd::Device* device)
    : context_(NULL)
    , device_(device)
    , preDispatchCallBackFunc_(NULL)
    , postDispatchCallBackFunc_(NULL)
    , preDispatchCallBackArgs_(NULL)
    , postDispatchCallBackArgs_(NULL)
    , paramMemory_(NULL)
    , numParams_(0)
    , aclBinary_(NULL)
    , aqlCodeAddr_(NULL)
    , aqlCodeSize_(0)
    , scratchRingAddr_(NULL)
    , scratchRingSize_(0)
    , isRegistered_(false)
    , dbgMsgBufferReady_(false)
{
    memset(&debugInfo_, 0, sizeof(debugInfo_));
    memset(deviceTrapInfo_, 0, sizeof(deviceTrapInfo_));

    // The policy and execution mode can be queried before they are ever set;
    // zero them so the getters never copy out indeterminate bytes.
    memset(&excpPolicy_, 0, sizeof(excpPolicy_));
    memset(&execMode_, 0, sizeof(execMode_));

    // Only the header of the runtime trap info is reset. The VGPR backup
    // area is large and is left untouched here.
    rtTrapHandlerInfo_.trap_.trapHandler_ = NULL;
    rtTrapHandlerInfo_.trap_.trapBuffer_ = NULL;
    rtTrapHandlerInfo_.dispatchId_ = 0;
}
//! Destructor: releases the kernel parameter memory list, if one was allocated
HwDebugManager::~HwDebugManager()
{
    // delete[] on a NULL pointer is a no-op, so no explicit check is needed
    delete[] paramMemory_;
}
/*! \brief Stores the callbacks to be run before and after a kernel dispatch
 *
 *  \param preDispatchFunction  Callback kept as the pre-dispatch function
 *  \param postDispatchFunction Callback kept as the post-dispatch function
 */
void
HwDebugManager::setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFunction,
                                     cl_PostDispatchCallBackFunctionAMD postDispatchFunction)
{
    postDispatchCallBackFunc_ = postDispatchFunction;
    preDispatchCallBackFunc_  = preDispatchFunction;
}
/*! \brief Stores the argument pointers associated with the dispatch callbacks
 *
 *  \param preDispatchArgs  Argument pointer kept for the pre-dispatch callback
 *  \param postDispatchArgs Argument pointer kept for the post-dispatch callback
 */
void
HwDebugManager::setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs)
{
    postDispatchCallBackArgs_ = postDispatchArgs;
    preDispatchCallBackArgs_  = preDispatchArgs;
}
//! Get dispatch debug info
void
HwDebugManager::getDispatchDebugInfo(void* debugInfo) const
{
memcpy(debugInfo, (void*) &debugInfo_, sizeof(DispatchDebugInfo));
}
/*! \brief Records the kernel (AQL) code address and its size
 *
 *  \param aqlCodeAddr Address of the AQL code
 *  \param aqlCodeSize Size of the AQL code
 */
void
HwDebugManager::setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize)
{
    aqlCodeSize_ = aqlCodeSize;
    aqlCodeAddr_ = aqlCodeAddr;
}
/*! \brief Records the scratch ring address and its size
 *
 *  \param scratchRingAddr Address of the scratch ring
 *  \param scratchRingSize Size of the scratch ring
 */
void
HwDebugManager::setScratchRing(address scratchRingAddr, uint32_t scratchRingSize)
{
    scratchRingSize_ = scratchRingSize;
    scratchRingAddr_ = scratchRingAddr;
}
/*! \brief Reports the recorded kernel (AQL) code address and size
 *
 *  \param aqlCodeAddr [out] Receives the recorded code address as a 64-bit value
 *  \param aqlCodeSize [out] Receives the recorded code size
 */
void
HwDebugManager::mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const
{
    const uint64_t codeAddress = reinterpret_cast<uint64_t>(aqlCodeAddr_);

    *aqlCodeAddr = codeAddress;
    *aqlCodeSize = aqlCodeSize_;
}
/*! \brief Reports the recorded scratch ring address and size
 *
 *  \param scratchRingAddr [out] Receives the recorded ring address as a 64-bit value
 *  \param scratchRingSize [out] Receives the recorded ring size
 */
void
HwDebugManager::mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const
{
    const uint64_t ringAddress = reinterpret_cast<uint64_t>(scratchRingAddr_);

    *scratchRingAddr = ringAddress;
    *scratchRingSize = scratchRingSize_;
}
void
HwDebugManager::setExceptionPolicy(void* exceptionPolicy)
{
memcpy(&excpPolicy_, exceptionPolicy, sizeof(cl_dbg_exception_policy_amd));
}
void
HwDebugManager::getExceptionPolicy(void* exceptionPolicy) const
{
memcpy(exceptionPolicy, &excpPolicy_, sizeof(cl_dbg_exception_policy_amd));
}
void
HwDebugManager::setKernelExecutionMode(void* mode)
{
cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
execMode_.ui32All = execMode->ui32All;
}
void
HwDebugManager::getKernelExecutionMode(void* mode) const
{
cl_dbg_kernel_exec_mode_amd* execMode = reinterpret_cast<cl_dbg_kernel_exec_mode_amd*>(mode);
execMode->ui32All = execMode_.ui32All;
}
void
HwDebugManager::setAclBinary(void* aclBinary)
{
aclBinary_ = aclBinary;
}
/*! \brief Allocates the list that holds the memory objects of the kernel parameters
 *
 *  Any previously allocated list is released first. Every slot of the new
 *  list starts out as NULL, so getKernelParamMem() on a slot that was never
 *  assigned yields a NULL pointer instead of an indeterminate one.
 *
 *  \param numParams Number of kernel parameters (slots) to allocate
 */
void
HwDebugManager::allocParamMemList(uint32_t numParams)
{
    // Drop the old list and leave the object in a consistent empty state in
    // case the allocation below throws (delete[] on NULL is a no-op).
    delete [] paramMemory_;
    paramMemory_ = NULL;
    numParams_ = 0;

    // The trailing "()" value-initializes the array: all slots become NULL
    paramMemory_ = new amd::Memory*[numParams]();
    numParams_ = numParams;
}
/*! \brief Returns the memory object stored for a kernel parameter as a cl_mem
 *
 *  \param paramIdx Index of the kernel parameter; asserted to be below the
 *                  number of allocated parameters
 */
cl_mem
HwDebugManager::getKernelParamMem(uint32_t paramIdx) const
{
    assert((paramIdx < numParams_) && "Invalid kernel parameter index too big");

    amd::Memory* paramMem = paramMemory_[paramIdx];
    return as_cl(paramMem);
}
void
HwDebugManager::assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem)
{
assert((paramIdx < numParams_) && "Invalid kernel parameter index too big");
paramMemory_[paramIdx] = mem;
}
} // namespace amd
@@ -5,42 +5,67 @@
#ifndef HWDEBUG_H_
#define HWDEBUG_H_
#include "device.hpp"
#include "amdocl/cl_debugger_amd.h"
#define TBA_START_OFFSET 256
static const int TbaStartOffset = 256;
/**
*******************************************************************************
* @brief Debug information required by the AMD debugger
* This might have to be moved to a private header. We could provide
* these services as a seperate dll.
* @details The information is populated by the function oclGetDebugInfo
*******************************************************************************
static const int RtTrapBufferWaveSize = 64;
static const int RtTrapBufferSeNum = 4;
static const int RtTrapBufferShNum = 2;
static const int RtTrapBufferCuNum = 16;
static const int RtTrapBufferSimdNum = 4;
static const int RtTrapBufferWaveNum = 16;
static const int RtTrapBufferTotalWaveNum =
((RtTrapBufferSeNum) * \
(RtTrapBufferShNum) * \
(RtTrapBufferCuNum) * \
(RtTrapBufferSimdNum) * \
(RtTrapBufferWaveNum));
/*! \brief Debug trap handler location in the runtime trap buffer
*
* This enumeration is used to indicate the location where the debug
* trap handler and debug trap buffer are set in the device trap buffer.
*/
struct PacketAmdInfo
enum DebugTrapLocation
{
uint32_t trapReservedVgprIndex; //!< reserved VGPR index, -1 when they are not valid
uint32_t scratchBufferWaveOffset; //!< scratch buffer wave offset, -1 when no scratch buffer
void *pointerToIsaBuffer; //!< pointer to the buffer containing ISA
size_t sizeOfIsaBuffer; //!< size of the ISA buffer
uint32_t numberOfVgprs; //!< number of VGPRs used by the kernel
uint32_t numberOfSgprs; //!< number of SGPRs used by the kernel
size_t sizeOfStaticGroupMemory; //!< Static local memory used by the kernel
kDebugTrapHandlerLocation = 0, //! Debug Trap handler location, this location must be 0
kDebugTrapBufferLocation = 1, //! Debug Trap buffer location, this location must be 1
kDebugTrapLocationMax = 2
};
//! Cache mask for invalidation
struct HwDbgGpuCacheMask
/*! \brief This structure is for the debug info in each kernel dispatch.
*
* Contains the memory descriptor information of the scratch memory and the global
* memory
*/
struct DispatchDebugInfo
{
union {
struct {
uint32_t sqICache : 1; //!< Instruction cache
uint32_t sqKCache : 1; //!< Data cache
uint32_t tcL1 : 1; //!< tcL1 cache
uint32_t tcL2 : 1; //!< tcL2 cache
uint32_t reserved : 28;
};
uint32_t ui32All;
};
uint32_t scratchMemoryDescriptor_[4]; //! Scratch memory descriptor
uint32_t globalMemoryDescriptor_[4]; //! Global memory descriptor
};
/*! \brief Trap handler descriptor
*
* The trap handler descriptor contains the details of a given trap handler:
* one memory object for the handler and one for its buffer.
* NOTE(review): both members are plain pointers; ownership/lifetime of the
* memory objects is not visible here - confirm against the code that fills them.
*/
struct TrapHandlerInfo {
amd::Memory* trapHandler_; //!< Device memory for the trap handler
amd::Memory* trapBuffer_; //!< Device memory for the trap buffer
};
/*! \brief Structure of the runtime trap handler buffer, which includes the following
* information: information of the runtime trap handler and buffer, information of
* the level-2 trap handlers and buffers.
*
* The VGPR backup area holds RtTrapBufferWaveSize 32-bit entries for each of the
* RtTrapBufferTotalWaveNum waves. With the constants defined in this header
* (4 * 2 * 16 * 4 * 16 = 8192 waves, 64 entries each) that is 2 MiB, so an object
* of this type is large wherever it is embedded.
*/
struct RuntimeTrapInfo {
TrapHandlerInfo trap_; //!< Structure of the address of all trap handlers
uint32_t dispatchId_; //!< Dispatch ID that signals the shader event
uint32_t vgpr_backup_[RtTrapBufferTotalWaveNum][RtTrapBufferWaveSize];
//!< Buffer to backup the VGPR used by the runtime trap handler
};
@@ -48,10 +73,16 @@ struct HwDbgGpuCacheMask
/**
* Opaque pointer to trap event
*/
typedef uint64_t DebugEvent; //! opaque pointer to trap event
typedef uintptr_t DebugEvent;
namespace amd {
class Context;
class Device;
class HostQueue;
/*! \class HwDebugManager
*
* \brief The device interface class for the hardware debug manager
@@ -61,32 +92,73 @@ class HwDebugManager
public:
//! Constructor for the Hardware Debug Manager
HwDebugManager() : isRegistered_(false), useHwDebug_(false) {}
HwDebugManager(amd::Device* device);
//! Destructor for Hardware Debug Manager
~HwDebugManager() {};
virtual ~HwDebugManager();
//! Setup the call back function pointer
virtual void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
cl_PostDispatchCallBackFunctionAMD postDispatchFn) = 0;
void setCallBackFunctions(cl_PreDispatchCallBackFunctionAMD preDispatchFn,
cl_PostDispatchCallBackFunctionAMD postDispatchFn);
//! Setup the call back argument pointers
virtual void setCallBackArguments(void *preDispatchArgs, void *postDispatchArgs) = 0;
void setCallBackArguments(void* preDispatchArgs, void* postDispatchArgs);
//! Flush cache
virtual cl_int flushCache(uint32_t mask) = 0;
//! Get dispatch debug info
void getDispatchDebugInfo(void* debugInfo) const;
//! Set the kernel code address and its size
void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize);
//! Set the scratch ring address and its size
void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize);
//! Map the shader (AQL code) for host access
void mapKernelCode(uint64_t* aqlCodeAddr, uint32_t* aqlCodeSize) const;
//! Map the scratch ring for host access
void mapScratchRing(uint64_t* scratchRingAddr, uint32_t* scratchRingSize) const;
//! Retrieve the pre-dispatch callback function
cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc() const
{ return preDispatchCallBackFunc_; }
//! Retrieve the post-dispatch callback function
cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc() const
{ return postDispatchCallBackFunc_; }
//! Retrieve the pre-dispatch callback function arguments
void* preDispatchCallBackArgs() const { return preDispatchCallBackArgs_; }
//! Retrieve the post-dispatch callback function arguments
void* postDispatchCallBackArgs() const { return postDispatchCallBackArgs_; }
//! Set exception policy
virtual cl_int setExceptionPolicy(void *policy) = 0;
void setExceptionPolicy(void* policy);
//! Get exception policy
virtual cl_int getExceptionPolicy(void *policy) const = 0;
void getExceptionPolicy(void* policy) const;
//! Set the kernel execution mode
virtual cl_int setKernelExecutionMode(void *mode) = 0;
void setKernelExecutionMode(void* mode);
//! Get the kernel execution mode
virtual cl_int getKernelExecutionMode(void *mode) const = 0;
void getKernelExecutionMode(void* mode) const;
//! Setup the pointer to the aclBinary within the debug manager
void setAclBinary(void* aclBinary);
//! Allocate storage to keep the memory pointers of the kernel parameters
void allocParamMemList(uint32_t numParams);
//! Assign the kernel parameter memory
void assignKernelParamMem(uint32_t paramIdx, amd::Memory* mem);
//! Get kernel parameter memory object
cl_mem getKernelParamMem(uint32_t paramIdx) const;
//! Flush cache
virtual void flushCache(uint32_t mask) = 0;
//! Create the debug event
virtual DebugEvent createDebugEvent(const bool autoReset) = 0;
@@ -95,95 +167,99 @@ public:
virtual cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const = 0;
//! Destroy the debug event
virtual cl_int destroyDebugEvent(DebugEvent pEvent) = 0;
virtual void destroyDebugEvent(DebugEvent* pEvent) = 0;
//! Register the debugger
virtual cl_int registerDebugger(amd::Context *context, uintptr_t pMessageStorage) = 0;
virtual cl_int registerDebugger(amd::Context* context, uintptr_t pMessageStorage) = 0;
//! Call KMD to register the debugger
virtual cl_int registerDebuggerOnQueue(device::VirtualDevice *vDevice) = 0;
virtual cl_int registerDebuggerOnQueue(device::VirtualDevice* vDevice) = 0;
//! Unregister the debugger
virtual cl_int unregisterDebugger() = 0;
virtual void unregisterDebugger() = 0;
//! Setup the pointer to the aclBinary within the debug manager
virtual void setAclBinary(void *aclBinary) = 0;
//! Send the wavefront control command
virtual cl_int wavefrontControl(uint32_t waveAction,
virtual void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
void * waveAddr) const = 0;
void* waveAddr) const = 0;
//! Set address watching point
virtual cl_int setAddressWatch(uint32_t numWatchPoints,
void ** watchAddress,
uint64_t * watchMask,
uint64_t * watchMode,
DebugEvent * event) = 0;
virtual void setAddressWatch(uint32_t numWatchPoints,
void** watchAddress,
uint64_t* watchMask,
uint64_t* watchMode,
DebugEvent* event) = 0;
//! Get the packet information for dispatch
virtual cl_int getPacketAmdInfo(const void * aqlCodeInfo,
void * packetInfo) const = 0;
//! Get dispatch debug info
virtual cl_int getDispatchDebugInfo(void * debugInfo) const = 0;
//! Map the AQL code for host access
virtual cl_int mapKernelCode(uint64_t *aqlCode, uint32_t *aqlCodeSize) const = 0;
//! Map the scratch ring for host access
virtual cl_int mapScratchRing(uint64_t *scratchRingAddr, uint32_t *scratchRingSize) const = 0;
virtual void getPacketAmdInfo(const void* aqlCodeInfo,
void* packetInfo) const = 0;
//! Set global memory values
virtual cl_int setGlobalMemory(void * memObj,
uint32_t offset,
void * srcPtr,
uint32_t size) = 0;
virtual void setGlobalMemory(amd::Memory* memObj,
uint32_t offset,
void* srcPtr,
uint32_t size) = 0;
//! Set kernel parameter memory object list
virtual cl_int setKernelParamMemList(void ** paramMem, uint32_t numParams) = 0;
//! Execute the post-dispatch callback function
virtual void executePostDispatchCallBack() = 0;
//! Get kernel parameter memory object
virtual uint64_t getKernelParamMem(uint32_t paramIdx) const = 0;
//! Execute the pre-dispatch callback function
virtual void executePreDispatchCallBack(void* aqlPacket,
void* toolInfo) = 0;
//! Set the kernel code address and its size
virtual void setKernelCodeInfo(address aqlCodeAddr, uint32_t aqlCodeSize) = 0;
//! Return the use of HW DEBUG flag
bool isMsgBufferReady() const { return dbgMsgBufferReady_; }
//! Get the scratch ring
virtual void setScratchRing(address scratchRingAddr, uint32_t scratchRingSize) = 0;
protected:
//! Return the context
const amd::Context* context() const { return context_; }
//! Retrieve the pre-dispatch callback function
virtual cl_PreDispatchCallBackFunctionAMD getPreDispatchCallBackFunction() const = 0;
//! Retrieve the post-dispatch callback function
virtual void * getPreDispatchCallBackArguments() const = 0;
//! Retrieve the pre-dispatch callback function arguments
virtual cl_PostDispatchCallBackFunctionAMD getPostDispatchCallBackFunction() const = 0;
//! Retrieve the post-dispatch callback function arguments
virtual void * getPostDispatchCallBackArguments() const = 0;
//! Set the register flag
void setRegisterFlag(bool regFlag) { isRegistered_ = regFlag; }
//! Set the use of HW DEBUG flag
void setUseHwDebugFlag(bool flag) { useHwDebug_ = flag; }
//! Get the debug device
const amd::Device* device() const { return device_; }
//! Return the register flag
bool isRegistered() const { return isRegistered_; }
//! Return the use of HW DEBUG flag
bool useHwDebug() const { return useHwDebug_; }
//! Return the device trap handler information
const uint64_t* deviceTrapInfo() const { return deviceTrapInfo_; }
protected:
bool isRegistered_; //! flag to indicate the debugger has been registered
bool useHwDebug_; //! flag to indicate the HW DEBUG is using
const amd::Context* context_; ///< context that used to create host queue for the debugger
amd::Device* device_; ///< Device to run the debugger
cl_PreDispatchCallBackFunctionAMD preDispatchCallBackFunc_; //!< pre-dispatch callback function
cl_PostDispatchCallBackFunctionAMD postDispatchCallBackFunc_; //!< post-dispatch callback function
void* preDispatchCallBackArgs_; //!< pre-dispatch callback function arguments
void* postDispatchCallBackArgs_; //!< post-dispatch callback function arguments
DispatchDebugInfo debugInfo_; //!< Debug setting/information for kernel dispatch
uint64_t deviceTrapInfo_[kDebugTrapLocationMax]; //!< Device trap buffer, to store various trap handlers on the device
amd::Memory** paramMemory_; //!< list of memory pointers for kernel parameters
uint32_t numParams_; //!< number of kernel parameters
void* aclBinary_; //!< ACL binary
address aqlCodeAddr_; //!< The mapped AQL code to allow host access
uint32_t aqlCodeSize_; //!< The size of the AQL code info
address scratchRingAddr_; //!< The mapped address of the scratch buffer
uint32_t scratchRingSize_; //!< The size of the scratch ring
bool isRegistered_; //! flag to indicate the debugger has been registered
bool dbgMsgBufferReady_; //! flag to indicate the HW DEBUG is using
cl_dbg_exception_policy_amd excpPolicy_; //!< exception policy
cl_dbg_kernel_exec_mode_amd execMode_; //!< kernel execution mode
RuntimeTrapInfo rtTrapHandlerInfo_; //!< Runtime trap information
};
/**@}*/
/**
@@ -174,6 +174,8 @@ debug(bool, GPU_FORCE_SINGLE_FP_DENORM, false, \
"Forces reporting CL_FP_DENORM bit for single precision") \
debug(bool, OCL_FORCE_CPU_SVM, false, \
"force svm support for CPU") \
debug(bool, GPU_ENABLE_HW_DEBUG, false, \
"Enable HW DEBUG for GPU")