P4 to Git Change 1230521 by gandryey@gera-w8 on 2016/01/22 17:58:08

SWDEV-86035 - Add OCL backend for PAL
	- PAL backend build is disabled by default. "BUILD_PAL_DEVICE = yes" enables the build. You also have to update the client workspace with PAL mapping: //depot/stg/pal/... //<your_opencl_location>/runtime/device/pal/palbe/...

Affected files ...

... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#130 edit
... //depot/stg/opencl/drivers/opencl/compiler/sclibdefs.opencl#8 edit
... //depot/stg/opencl/drivers/opencl/opencldefs#166 edit
... //depot/stg/opencl/drivers/opencl/openclrules#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/Makefile#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#192 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcompiler.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsched.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltrap.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/top.hpp#23 edit


[ROCm/clr commit: c99d679c9e]
此提交包含在:
foreman
2016-01-22 18:18:55 -05:00
父節點 1a277f5c3c
當前提交 dd15a594a2
共有 46 個檔案被更改,包括 22124 行新增2 行删除
+15 -2
查看文件
@@ -15,6 +15,13 @@ extern amd::AppProfile* oclhsaCreateAppProfile();
#include "device/cpu/cpudevice.hpp"
#endif // WITH_CPU_DEVICE
#if defined(WITH_PAL_DEVICE)
//namespace pal {
extern bool PalDeviceLoad();
extern void PalDeviceUnload();
//}
#endif // WITH_PAL_DEVICE
#if defined(WITH_GPU_DEVICE)
extern bool DeviceLoad();
extern void DeviceUnload();
@@ -177,9 +184,12 @@ Device::init()
ret |= oclhsa::NullDevice::init();
}
#endif // WITH_HSA_DEVICE
#if defined(WITH_GPU_DEVICE)
#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
ret |= DeviceLoad();
#endif // WITH_GPU_DEVICE
#if defined(WITH_PAL_DEVICE)
ret |= PalDeviceLoad();
#endif // WITH_PAL_DEVICE
#if defined(WITH_CPU_DEVICE)
ret |= cpu::Device::init();
#endif // WITH_CPU_DEVICE
@@ -203,9 +213,12 @@ Device::tearDown()
oclhsaAppProfile_ = NULL;
}
#endif // WITH_HSA_DEVICE
#if defined(WITH_GPU_DEVICE)
#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
DeviceUnload();
#endif // WITH_GPU_DEVICE
#if defined(WITH_PAL_DEVICE)
PalDeviceUnload();
#endif // WITH_PAL_DEVICE
#if defined(WITH_CPU_DEVICE)
cpu::Device::tearDown();
#endif // WITH_CPU_DEVICE
+25
查看文件
@@ -0,0 +1,25 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "utils/debug.hpp"
#include "device/appprofile.hpp"
#include "device/pal/palappprofile.hpp"
namespace pal {
AppProfile::AppProfile()
: amd::AppProfile()
, enableHighPerformanceState_(true)
, reportAsOCL12Device_(false)
{
propertyDataMap_.insert(DataMap::value_type("HighPerfState",
PropertyData(DataType_Boolean, &enableHighPerformanceState_)));
propertyDataMap_.insert(DataMap::value_type("OCL12Device",
PropertyData(DataType_Boolean, &reportAsOCL12Device_)));
}
}
+30
查看文件
@@ -0,0 +1,30 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALAPPPROFILE_HPP_
#define PALAPPPROFILE_HPP_
#include <string>
#include <map>
namespace pal {
class AppProfile : public amd::AppProfile
{
public:
AppProfile();
//! return the value of enableHighPerformanceState_
bool enableHighPerformanceState() const { return enableHighPerformanceState_; }
bool reportAsOCL12Device() const { return reportAsOCL12Device_; }
private:
bool enableHighPerformanceState_;
bool reportAsOCL12Device_;
};
}
#endif // PALAPPPROFILE_HPP_
+7
查看文件
@@ -0,0 +1,7 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
namespace pal {
} // namespace pal
+48
查看文件
@@ -0,0 +1,48 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALBINARY_HPP_
#define PALBINARY_HPP_
#include "top.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palkernel.hpp"
namespace pal {
class ClBinaryHsa : public device::ClBinary
{
public:
ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3)
: device::ClBinary(dev, bifVer)
{}
//! Destructor
~ClBinaryHsa() {}
protected:
bool setElfTarget() {
uint32_t target = static_cast<uint32_t>(21);//dev().calTarget());
assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15");
uint16_t elf_target = (uint16_t)(0x7FFF & target);
return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM);
return true;
}
private:
//! Disable default copy constructor
ClBinaryHsa(const ClBinaryHsa&);
//! Disable default operator=
ClBinaryHsa& operator=(const ClBinaryHsa&);
//! Returns the HSA device for this object
const Device& dev() const { return static_cast<const Device&>(dev_); }
};
} // namespace pal
#endif // PALBINARY_HPP_
檔案差異因為檔案過大而無法顯示 載入差異
+451
查看文件
@@ -0,0 +1,451 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALBLIT_HPP_
#define PALBLIT_HPP_
#include "top.hpp"
#include "platform/command.hpp"
#include "device/pal/paldefs.hpp"
#include "device/device.hpp"
#include "device/blit.hpp"
/*! \addtogroup PAL Blit Implementation
* @{
*/
//! PAL Blit Manager Implementation
namespace pal {
class Device;
class Kernel;
class Memory;
class VirtualGPU;
//! DMA Blit Manager
class DmaBlitManager : public device::HostBlitManager
{
public:
//! Constructor
DmaBlitManager(
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
virtual ~DmaBlitManager() {}
//! Creates DmaBlitManager object
virtual bool create(amd::Device& device) { return true; }
//! Copies a buffer object to system memory
virtual bool readBuffer(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBufferRect(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destinaiton host memory
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to system memory
virtual bool readImage(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBuffer(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBufferRect(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to an image object
virtual bool writeImage(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to another buffer object
virtual bool copyBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to another buffer object
virtual bool copyBufferRect(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& srcRect, //!< Source rectangle
const amd::BufferRect& dstRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to a buffer object
virtual bool copyImageToBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies a buffer object to an image object
virtual bool copyBufferToImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to another image object
virtual bool copyImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
protected:
const static uint MaxPinnedBuffers = 4;
//! Synchronizes the blit operations if necessary
inline void synchronize() const;
//! Returns the virtual GPU object
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
//! Returns the GPU device object
const Device& dev() const { return static_cast<const Device&>(dev_); };
inline Memory& gpuMem(device::Memory& mem) const;
//! Pins host memory for GPU access
amd::Memory* pinHostMemory(
const void* hostMem, //!< Host memory pointer
size_t pinSize, //!< Host memory size
size_t& partial //!< Extra offset for memory alignment
) const;
const size_t MinSizeForPinnedTransfer;
bool completeOperation_; //!< DMA blit manager must complete operation
amd::Context* context_; //!< A dummy context
private:
//! Disable copy constructor
DmaBlitManager(const DmaBlitManager&);
//! Disable operator=
DmaBlitManager& operator=(const DmaBlitManager&);
//! Reads video memory, using a staged buffer
bool readMemoryStaged(
Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
Memory** xferBuf, //!< Staged buffer for read
size_t origin, //!< Original offset in the source memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for copy region
size_t xferSize //!< Transfer size
) const;
//! Write into video memory, using a staged buffer
bool writeMemoryStaged(
const void* srcHost, //!< Source host memory
Memory& dstMemory, //!< Destination memory object
Memory& xferBuf, //!< Staged buffer for write
size_t origin, //!< Original offset in the destination memory
size_t& offset, //!< Offset for the current copy pointer
size_t& totalSize, //!< Total size for the copy region
size_t xferSize //!< Transfer size
) const;
};
//! Kernel Blit Manager
class KernelBlitManager : public DmaBlitManager
{
public:
enum {
BlitCopyImage = 0,
BlitCopyImage1DA,
BlitCopyImageToBuffer,
BlitCopyBufferToImage,
BlitCopyBufferRect,
BlitCopyBufferRectAligned,
BlitCopyBuffer,
BlitCopyBufferAligned,
FillBuffer,
FillImage,
Scheduler,
BlitTotal
};
//! Constructor
KernelBlitManager(
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
Setup setup = Setup() //!< Specifies HW accelerated blits
);
//! Destructor
virtual ~KernelBlitManager();
//! Creates DmaBlitManager object
virtual bool create(amd::Device& device);
//! Copies a buffer object to another buffer object
virtual bool copyBufferRect(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& srcRectIn, //!< Source rectangle
const amd::BufferRect& dstRectIn, //!< Destination rectangle
const amd::Coord3D& sizeIn, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBuffer(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to system memory
virtual bool readBufferRect(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destinaiton host memory
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBuffer(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to a buffer object
virtual bool writeBufferRect(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::BufferRect& hostRect, //!< Destination rectangle
const amd::BufferRect& bufRect, //!< Source rectangle
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to an image object
virtual bool copyBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies a buffer object to an image object
virtual bool copyBufferToImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to a buffer object
virtual bool copyImageToBuffer(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to another image object
virtual bool copyImage(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies an image object to system memory
virtual bool readImage(
device::Memory& srcMemory, //!< Source memory object
void* dstHost, //!< Destination host memory
const amd::Coord3D& origin, //!< Source origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Copies system memory to an image object
virtual bool writeImage(
const void* srcHost, //!< Source host memory
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
size_t rowPitch, //!< Row pitch for host memory
size_t slicePitch, //!< Slice pitch for host memory
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills a buffer memory with a pattern data
virtual bool fillBuffer(
device::Memory& memory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
size_t patternSize, //!< Pattern size
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills an image memory with a pattern data
virtual bool fillImage(
device::Memory& dstMemory, //!< Memory object to fill with pattern
const void* pattern, //!< Pattern data
const amd::Coord3D& origin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false //!< Entire buffer will be updated
) const;
//! Fills an image memory with a pattern data
virtual bool runScheduler(
device::Memory& vqueue, //!< Memory object for virtual queue
device::Memory& params, //!< Extra arguments for the scheduler
uint paramIdx, //!< Parameter index
uint threads //!< Number of scheduling threads
) const;
private:
static const size_t MaxXferBuffers = 2;
//! Copies a buffer object to an image object
bool copyBufferToImageKernel(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Copies an image object to a buffer object
bool copyImageToBufferKernel(
device::Memory& srcMemory, //!< Source memory object
device::Memory& dstMemory, //!< Destination memory object
const amd::Coord3D& srcOrigin, //!< Source origin
const amd::Coord3D& dstOrigin, //!< Destination origin
const amd::Coord3D& size, //!< Size of the copy region
bool entire = false, //!< Entire buffer will be updated
size_t rowPitch = 0, //!< Pitch for buffer
size_t slicePitch = 0 //!< Slice for buffer
) const;
//! Creates a program for all blit operations
bool createProgram(
Device& device //!< Device object
);
//! Creates a view memory object
Memory* createView(
const Memory& parent, //!< Parent memory object
const cl_image_format format //!< The new format for a view
) const;
//! Disable copy constructor
KernelBlitManager(const KernelBlitManager&);
//! Disable operator=
KernelBlitManager& operator=(const KernelBlitManager&);
amd::Program* program_; //!< GPU program obejct
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
amd::Memory* constantBuffer_; //!< An internal CB for blits
amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
size_t xferBufferSize_; //!< Transfer buffer size
amd::Monitor* lockXferOps_; //!< Lock transfer operation
};
static const char* BlitName[KernelBlitManager::BlitTotal] = {
"copyImage",
"copyImage1DA",
"copyImageToBuffer",
"copyBufferToImage",
"copyBufferRect",
"copyBufferRectAligned",
"copyBuffer",
"copyBufferAligned",
"fillBuffer",
"fillImage",
"scheduler",
};
/*@}*/} // namespace pal
#endif /*PALBLIT_HPP_*/
+147
查看文件
@@ -0,0 +1,147 @@
//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#include <string>
#include <sstream>
#include <fstream>
#include <iostream>
#include "os/os.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palkernel.hpp"
#include "utils/options.hpp"
#include <cstdio>
//CLC_IN_PROCESS_CHANGE
extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr);
namespace pal {
bool
HSAILProgram::compileImpl(
const std::string& sourceCode,
const std::vector<const std::string*>& headers,
const char** headerIncludeNames,
amd::option::Options* options)
{
acl_error errorCode;
aclTargetInfo target;
std::string arch = "hsail";
if (dev().settings().use64BitPtr_) {
arch += "64";
}
target = aclGetTargetInfo(arch.c_str(),
dev().info().name_, &errorCode);
// end if asic info is ready
// We dump the source code for each program (param: headers)
// into their filenames (headerIncludeNames) into the TEMP
// folder specific to the OS and add the include path while
// compiling
// Find the temp folder for the OS
std::string tempFolder = amd::Os::getTempPath();
std::string tempFileName = amd::Os::getTempFileName();
// Iterate through each source code and dump it into tmp
std::fstream f;
std::vector<std::string> headerFileNames(headers.size());
std::vector<std::string> newDirs;
for (size_t i = 0; i < headers.size(); ++i) {
std::string headerPath = tempFolder;
std::string headerIncludeName(headerIncludeNames[i]);
// replace / in path with current os's file separator
if (amd::Os::fileSeparator() != '/') {
for (std::string::iterator it = headerIncludeName.begin(),
end = headerIncludeName.end(); it != end; ++it) {
if (*it == '/') *it = amd::Os::fileSeparator();
}
}
size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
if (pos != std::string::npos) {
headerPath += amd::Os::fileSeparator();
headerPath += headerIncludeName.substr(0, pos);
headerIncludeName = headerIncludeName.substr(pos+1);
}
if (!amd::Os::pathExists(headerPath)) {
bool ret = amd::Os::createPath(headerPath);
assert(ret && "failed creating path!");
newDirs.push_back(headerPath);
}
std::string headerFullName =
headerPath + amd::Os::fileSeparator() + headerIncludeName;
headerFileNames[i] = headerFullName;
f.open(headerFullName.c_str(), std::fstream::out);
// Should we allow asserts
assert(!f.fail() && "failed creating header file!");
f.write(headers[i]->c_str(), headers[i]->length());
f.close();
}
// Create Binary
binaryElf_ = aclBinaryInit(sizeof(aclBinary),
&target, &binOpts_, &errorCode);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: aclBinary init failure\n";
LogWarning("aclBinaryInit failed");
return false;
}
// Insert opencl into binary
errorCode = aclInsertSection(dev().compiler(), binaryElf_,
sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Inserting openCl Source to binary\n";
}
// Set the options for the compiler
// Set the include path for the temp folder that contains the includes
if (!headers.empty()) {
compileOptions_.append(" -I");
compileOptions_.append(tempFolder);
}
//Add only for CL2.0 and above
if (options->oVariables->CLStd[2] >= '2') {
std::stringstream opts;
opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
<< device().info().maxGlobalVariableSize_;
compileOptions_.append(opts.str());
}
#if !defined(_LP64) && defined(ATI_OS_LINUX)
if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) {
errorCode = ACL_UNSUPPORTED;
LogWarning("aclCompile failed");
return false;
}
#endif
// Compile source to IR
compileOptions_.append(hsailOptions());
errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(),
ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr);
buildLog_ += aclGetCompilerLog(dev().compiler());
if (errorCode != ACL_SUCCESS) {
LogWarning("aclCompile failed");
buildLog_ += "Error: Compiling CL to IR\n";
return false;
}
clBinary()->storeCompileOptions(compileOptions_);
// Save the binary in the interface class
size_t size = 0;
void* mem = nullptr;
aclWriteToMem(binaryElf_, &mem, &size);
setBinary(static_cast<char*>(mem), size);
// Save the binary inside the program
// The FSAILProgram will be responsible to free it during destruction
rawBinary_ = mem;
return true;
}
} // namespace pal
+89
查看文件
@@ -0,0 +1,89 @@
//
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palsettings.hpp"
namespace pal {
ConstBuffer::ConstBuffer(
VirtualGPU& gpu,
size_t size)
: Memory(const_cast<pal::Device&>(gpu.dev()), size * VectorSize)
, gpu_(gpu)
, size_(size * VectorSize)
, wrtOffset_(0)
, lastWrtSize_(0)
, wrtAddress_(nullptr)
{
}
ConstBuffer::~ConstBuffer()
{
if (wrtAddress_ != nullptr) {
unmap(&gpu_);
}
amd::AlignedMemory::deallocate(sysMemCopy_);
}
bool
ConstBuffer::create()
{
// Create sysmem copy for the constant buffer
sysMemCopy_ = reinterpret_cast<address>(amd::AlignedMemory::allocate(size_, 256));
if (sysMemCopy_ == nullptr) {
LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\
size(%d)!", size_);
return false;
}
memset(sysMemCopy_, 0, size_);
if (!Memory::create(Resource::RemoteUSWC)) {
LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
return false;
}
// Constant buffer warm-up
warmUpRenames(gpu_);
wrtAddress_ = map(&gpu_, Resource::Discard);
if (wrtAddress_ == nullptr) {
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
return false;
}
return true;
}
bool
ConstBuffer::uploadDataToHw(size_t size)
{
static const size_t HwCbAlignment = 256;
// Align copy size on the vector's boundary
size_t count = amd::alignUp(size, VectorSize);
wrtOffset_ += lastWrtSize_;
// Check if CB has enough space for copy
if ((wrtOffset_ + count) > size_) {
if (wrtAddress_ != nullptr) {
unmap(&gpu_);
}
wrtAddress_ = map(&gpu_, Resource::Discard);
wrtOffset_ = 0;
lastWrtSize_ = 0;
}
// Update memory with new CB data
memcpy((reinterpret_cast<char*>(wrtAddress_) + wrtOffset_), sysMemCopy_, count);
// Adjust the size by the HW CB buffer alignment
lastWrtSize_ = amd::alignUp(size, HwCbAlignment);
return true;
}
} // namespace pal
+70
查看文件
@@ -0,0 +1,70 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALCONSTBUF_HPP_
#define PALCONSTBUF_HPP_
#include "device/pal/palmemory.hpp"
//! \namespace pal PAL Resource Implementation
namespace pal {
//! Cconstant buffer
class ConstBuffer : public Memory
{
public:
//! Vector size of the constant buffer
static const size_t VectorSize = 16;
//! Constructor for the ConstBuffer class
ConstBuffer(
VirtualGPU& gpu, //!< Virtual GPU device object
size_t size //!< size of the constant buffer in vectors
);
//! Destructor for the ConstBuffer class
~ConstBuffer();
//! Creates the real HW constant buffer
bool create();
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
*
* \return True if the data upload was succesful
*/
bool uploadDataToHw(
size_t size //!< real data size for upload
);
//! Returns a pointer to the system memory copy for CB
address sysMemCopy() const { return sysMemCopy_; }
//! Returns CB size
size_t size() const { return size_; }
//! Returns current write offset for the constant buffer
size_t wrtOffset() const { return wrtOffset_; }
//! Returns last write size for the constant buffer
size_t lastWrtSize() const { return lastWrtSize_; }
private:
//! Disable copy constructor
ConstBuffer(const ConstBuffer&);
//! Disable operator=
ConstBuffer& operator=(const ConstBuffer&);
VirtualGPU& gpu_; //!< Virtual GPU object
address sysMemCopy_; //!< System memory copy
size_t size_; //!< Constant buffer size
size_t wrtOffset_; //!< Current write offset
size_t lastWrtSize_; //!< Last write size
void* wrtAddress_; //!< Write address in CB
};
/*@}*/} // namespace pal
#endif /*PALCONSTBUF_HPP_*/
+119
查看文件
@@ -0,0 +1,119 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/paldefs.hpp"
#include "device/pal/palcounters.hpp"
#include "device/pal/palvirtual.hpp"
namespace pal {
PalCounterReference*
PalCounterReference::Create(
VirtualGPU& gpu,
const Pal::PerfExperimentCreateInfo& createInfo)
{
Pal::Result result;
size_t palExperSize = gpu.dev().iDev()->GetPerfExperimentSize(
createInfo, &result);
if (result != Pal::Result::Success) {
return nullptr;
}
PalCounterReference* memRef = new (palExperSize) PalCounterReference(gpu);
if (memRef != nullptr) {
result = gpu.dev().iDev()->CreatePerfExperiment(createInfo,
&memRef[1], &memRef->perfExp_);
if (result != Pal::Result::Success) {
memRef->release();
return nullptr;
}
}
return memRef;
}
PalCounterReference::~PalCounterReference() {
// The counter object is always associated with a particular queue,
// so we have to lock just this queue
amd::ScopedLock lock(gpu_.execution());
if (nullptr != iPerf()) {
iPerf()->Destroy();
}
}
bool
PalCounterReference::growResultArray(uint index) {
if (results_ != nullptr) {
delete [] results_;
}
results_ = new uint64_t [index + 1];
if (results_ == nullptr) {
return false;
}
return true;
}
PerfCounter::~PerfCounter()
{
if (calRef_ == nullptr) {
return;
}
// Release the counter reference object
calRef_->release();
}
bool
PerfCounter::create(
PalCounterReference* calRef)
{
assert(&gpu() == &calRef->gpu());
calRef_ = calRef;
counter_ = calRef->iPerf();
index_ = calRef->retain() - 2;
calRef->growResultArray(index_);
// Initialize the counter
Pal::PerfCounterInfo counterInfo = {};
counterInfo.counterType = Pal::PerfCounterType::Global;
counterInfo.block = static_cast<Pal::GpuBlock>(info_.blockIndex_);
counterInfo.instance = info_.counterIndex_;
counterInfo.eventId = info_.eventIndex_;
Pal::Result result = counter_->AddCounter(counterInfo);
if (result != Pal::Result::Success) {
return false;
}
return true;
}
uint64_t
PerfCounter::getInfo(uint64_t infoType) const
{
switch (infoType) {
case CL_PERFCOUNTER_GPU_BLOCK_INDEX: {
// Return the GPU block index
return info()->blockIndex_;
}
case CL_PERFCOUNTER_GPU_COUNTER_INDEX: {
// Return the GPU counter index
return info()->counterIndex_;
}
case CL_PERFCOUNTER_GPU_EVENT_INDEX: {
// Return the GPU event index
return info()->eventIndex_;
}
case CL_PERFCOUNTER_DATA: {
Unimplemented();
//gslCounter()->GetResult(gpu().cs(), reinterpret_cast<uint64*>(calRef_->results()));
return calRef_->results()[index_];
}
default:
LogError("Wrong PerfCounter::getInfo parameter");
}
return 0;
}
} // namespace pal
+152
查看文件
@@ -0,0 +1,152 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALCOUNTERS_HPP_
#define PALCOUNTERS_HPP_
#include "top.hpp"
#include "device/device.hpp"
#include "device/pal/paldevice.hpp"
#include "palPerfExperiment.h"
namespace pal {
class VirtualGPU;
class PalCounterReference : public amd::ReferenceCountedObject
{
public:
static PalCounterReference* Create(
VirtualGPU& gpu,
const Pal::PerfExperimentCreateInfo& createInfo);
//! Default constructor
PalCounterReference(
VirtualGPU& gpu //!< Virtual GPU device object
)
: perfExp_(nullptr)
, gpu_(gpu)
, results_(nullptr) {}
//! Get PAL counter
Pal::IPerfExperiment* iPerf() const { return perfExp_; }
//! Returns the virtual GPU device
const VirtualGPU& gpu() const { return gpu_; }
//! Increases the results array for this PAL counter(container)
bool growResultArray(
uint maxIndex //!< the maximum HW counter index in the PAL counter
);
void finalize() {
iPerf()->Finalize();
Pal::GlobalCounterLayout layout = {};
layout.sampleCount = referenceCount() - 1;
iPerf()->GetGlobalCounterLayout(&layout); }
//! Returns the PAL counter results
uint64_t* results() const { return results_; }
Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object
protected:
//! Default destructor
~PalCounterReference();
private:
//! Disable copy constructor
PalCounterReference(const PalCounterReference&);
//! Disable operator=
PalCounterReference& operator=(const PalCounterReference&);
VirtualGPU& gpu_; //!< The virtual GPU device object
uint64_t* results_; //!< Counter results
};
//! Performance counter implementation on GPU
class PerfCounter : public device::PerfCounter
{
public:
//! The performance counter info
struct Info : public amd::EmbeddedObject
{
uint blockIndex_; //!< Index of the block to configure
uint counterIndex_; //!< Index of the hardware counter
uint eventIndex_; //!< Event you wish to count with the counter
};
//! The PerfCounter flags
enum Flags
{
BeginIssued = 0x00000001,
EndIssued = 0x00000002,
ResultReady = 0x00000004
};
//! Constructor for the GPU PerfCounter object
PerfCounter(
const Device& device, //!< A GPU device object
const VirtualGPU& gpu, //!< Virtual GPU device object
cl_uint blockIndex, //!< HW block index
cl_uint counterIndex, //!< Counter index within the block
cl_uint eventIndex) //!< Event index for profiling
: gpuDevice_(device)
, gpu_(gpu)
, calRef_(NULL)
, flags_(0)
, counter_(0)
, index_(0)
{
info_.blockIndex_ = blockIndex;
info_.counterIndex_ = counterIndex;
info_.eventIndex_ = eventIndex;
}
//! Destructor for the GPU PerfCounter object
virtual ~PerfCounter();
//! Creates the current object
bool create(
PalCounterReference* calRef //!< Reference counter
);
//! Returns the specific information about the counter
uint64_t getInfo(
uint64_t infoType //!< The type of returned information
) const;
//! Returns the GPU device, associated with the current object
const Device& dev() const { return gpuDevice_; }
//! Returns the virtual GPU device
const VirtualGPU& gpu() const { return gpu_; }
//! Returns the CAL performance counter descriptor
const Info* info() const { return &info_; }
//! Returns the Info structure for performance counter
Pal::IPerfExperiment* iPerf() const { return counter_; }
private:
//! Disable default copy constructor
PerfCounter(const PerfCounter&);
//! Disable default operator=
PerfCounter& operator=(const PerfCounter&);
const Device& gpuDevice_; //!< The backend device
const VirtualGPU& gpu_; //!< The virtual GPU device object
PalCounterReference* calRef_; //!< Reference counter
uint flags_; //!< The perfcounter object state
Info info_; //!< The info structure for perfcounter
Pal::IPerfExperiment* counter_; //!< GSL counter object
uint index_; //!< Counter index in the CAL container
};
} // namespace pal
#endif // PALCOUNTERS_HPP_
+121
查看文件
@@ -0,0 +1,121 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALDEBGGER_H_
#define PALDEBGGER_H_
#include <cstddef>
#include <cstdint>
#include "hsa.h"
#include "amd_hsa_kernel_code.h"
#include "device/device.hpp"
#include "device/hwdebug.hpp"
#include "acl.h"
static const int NumberReserveVgprs = 4;
namespace pal {
/**
* \defgroup Services_API OCL Runtime Services API
* @{
*/
/*! \brief Dispatch packet information
*
* This structure contains the packet information for kernel dispatch
*/
struct PacketAmdInfo
{
uint32_t trapReservedVgprIndex_; //!< reserved VGPR index, -1 when they are not valid
uint32_t scratchBufferWaveOffset_; //!< scratch buffer wave offset, -1 when no scratch buffer
void* pointerToIsaBuffer_; //!< pointer to the buffer containing ISA
size_t sizeOfIsaBuffer_; //!< size of the ISA buffer
uint32_t numberOfVgprs_; //!< number of VGPRs used by the kernel
uint32_t numberOfSgprs_; //!< number of SGPRs used by the kernel
size_t sizeOfStaticGroupMemory_; //!< Static local memory used by the kernel
};
/*! \brief Cache mask for invalidation
*/
struct HwDbgGpuCacheMask
{
HwDbgGpuCacheMask() :ui32All_(0) {}
HwDbgGpuCacheMask(uint32_t mask) :ui32All_(mask) {}
union {
struct {
uint32_t sqICache_ : 1; //!< Instruction cache
uint32_t sqKCache_ : 1; //!< Data cache
uint32_t tcL1_ : 1; //!< tcL1 cache
uint32_t tcL2_ : 1; //!< tcL2 cache
uint32_t reserved_ : 28;
};
uint32_t ui32All_;
};
};
/*! \brief Address watch information
*
* Information about each watch point - address, mask, mode and event
*/
struct HwDbgAddressWatch
{
void* watchAddress_; //! The address of watch point
uint64_t watchMask_; //! The mask for watch point (lower 24 bits)
cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch
DebugEvent event_; //! Event of the watch point (not used for now)
};
/*! \brief Runtime structure used to communicate debug information
* between Ocl services and core for a kernel dispatch.
*/
struct DebugToolInfo
{
uint64_t scratchAddress_; //! Scratch memory address
size_t scratchSize_; //! Scratch memory size
uint64_t globalAddress_; //! Global memory address
uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled
uint32_t exceptionMask_; //! Exception mask
uint32_t reservedCuNum_; //! Number of reserved CUs for display,
//! which ranges from 0 to 7 in the current implementation.
bool monitorMode_; //! Debug or profiler mode
bool gpuSingleStepMode_; //! SQ debug mode
amd::Memory* trapHandler_; //! Trap handler address
amd::Memory* trapBuffer_; //! Trap buffer address
bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled
aclBinary* aclBinary_; //! pointer of the kernel ACL binary
amd::Event* event_; //! pointer of the kernel event in the enqueue command
};
/*! \brief Message used by the KFD wave control for CI
*
* Structure indicates the various information used by the wave control function.
*/
struct HwDebugWaveAddr
{
uint32_t VMID_ : 4; //! Virtual memory id
uint32_t wave_ : 4; //! Wave id
uint32_t SIMD_ : 2; //! SIMD id
uint32_t CU_ : 4; //! Compute unit
uint32_t SH_ : 1; //! Shader array
uint32_t SE_ : 1; //! Shader engine
};
/*! \brief Kernel code information
*
* This structure contains the pointer of mapped kernel code for host access
* and its size (in bytes)
*/
struct AqlCodeInfo
{
amd_kernel_code_t * aqlCode_; //! pointer of AQL code to allow host access
uint32_t aqlCodeSize_; //! size of AQL code
};
/**@}*/
} // namespace pal
#endif // PALDEBGGER_H_
+412
查看文件
@@ -0,0 +1,412 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "platform/commandqueue.hpp"
#include "device/device.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palmemory.hpp"
#include "device/pal/paltrap.hpp"
#include "device/pal/paldebugmanager.hpp"
#include <iostream>
#include <sstream>
#include <fstream>
namespace pal {
class VirtualGPU;
class Device;
class Memory;
/*
***************************************************************************
* Implementation of GPU Debug Manager class
***************************************************************************
*/
GpuDebugManager::GpuDebugManager(amd::Device* device)
: HwDebugManager(device)
, vGpu_(nullptr)
, debugMessages_(0)
, addressWatch_(nullptr)
, addressWatchSize_(0)
, oclEventHandle_(nullptr)
{
// Initialize the exception info and the kernel execution mode
excpPolicy_.exceptionMask = 0x0;
excpPolicy_.waveAction = CL_DBG_WAVES_RESUME;
excpPolicy_.hostAction = CL_DBG_HOST_IGNORE;
excpPolicy_.waveMode = CL_DBG_WAVEMODE_BROADCAST;
execMode_.ui32All = 0;
rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr;
rtTrapHandlerInfo_.trap_.trapBuffer_ = nullptr;
aqlPacket_ = (hsa_kernel_dispatch_packet_t *) nullptr;
return;
}
GpuDebugManager::~GpuDebugManager()
{
if (nullptr != addressWatch_) {
delete [] addressWatch_;
}
}
void
GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
void* toolInfo)
{
DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
Unimplemented();
// Only if the pre-dispatch callback is set, will we update cache
// flush configuration and build the memory descriptor.
if (nullptr != preDispatchCallBackFunc_) {
/*
// Build the scratch memory descriptor
device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
info->scratchAddress_,
info->scratchSize_);
// Build the global memory descriptor
device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
info->globalAddress_);
*/
// // for invalidate cache (BuildEndOfKernelNotifyCommands)
// aqlPacket->release_fence_scope = 2;
aclBinary_ = reinterpret_cast<void*>(info->aclBinary_);
oclEventHandle_ = reinterpret_cast<void*>(as_cl(info->event_));
cl_device_id clDeviceId = as_cl(device_);
preDispatchCallBackFunc_(clDeviceId,
oclEventHandle_,
aqlPacket_,
aclBinary_,
preDispatchCallBackArgs_);
}
// setup the trap handler information only if the debugger has been registered
if (isRegistered()) {
// Copy the various info set by the debugger/profiler to the tool info structure
setupTrapInformation(info);
}
}
void
GpuDebugManager::executePostDispatchCallBack()
{
if (nullptr != postDispatchCallBackFunc_) {
cl_device_id clDeviceId = as_cl(device_);
postDispatchCallBackFunc_(clDeviceId,
aqlPacket_->completion_signal.handle,
postDispatchCallBackArgs_);
}
}
//! Map the kernel code for host access
void
GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
{
AqlCodeInfo* codeInfo = reinterpret_cast<AqlCodeInfo*>(aqlCodeInfo);
codeInfo->aqlCode_ = reinterpret_cast<amd_kernel_code_t*>(aqlCodeAddr_);
codeInfo->aqlCodeSize_ = aqlCodeSize_;
}
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
if (!device()->settings().enableHwDebug_) {
LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
return CL_DEBUGGER_REGISTER_FAILURE_AMD;
}
// first time register - set the message storage, flush queue and enable hw debug
if (!isRegistered()) {
debugMessages_ = messageStorage;
Unimplemented();
/*
if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
LogError("debugmanager: Register debugger failed");
return CL_OUT_OF_RESOURCES;
}
*/
isRegistered_ = true;
if (CL_SUCCESS != createRuntimeTrapHandler()) {
LogError("debugmanager: Create runtime trap handler failed");
return CL_OUT_OF_RESOURCES;
}
}
context_ = context;
return CL_SUCCESS;
}
void
GpuDebugManager::unregisterDebugger()
{
if (isRegistered()) {
// reset the debugger registration flag
isRegistered_ = false;
context_ = nullptr;
}
}
void
GpuDebugManager::flushCache(uint32_t mask)
{
HwDbgGpuCacheMask cacheMask(mask);
device()->xferQueue()->flushCuCaches(cacheMask);
}
void
GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
{
toolInfo->scratchAddress_ = 0;
toolInfo->scratchSize_ = 0;
toolInfo->globalAddress_ = 0;
toolInfo->sqPerfcounterEnable_ = false;
// Set up trap related info in the kernel info structure to be
// used in the kernel dispatch.
toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
toolInfo->monitorMode_ = execMode_.monitorMode;
// The order of these three bits is determined by the definition
// of the register COMPUTE_DISPATCH_INITIATOR
toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
| (execMode_.disableL2Cache << 1)
| (execMode_.disableL1Vector));
toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation];
toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation];
}
void
GpuDebugManager::getPacketAmdInfo(
const void* aqlCodeInfo,
void* packetInfo) const
{
const AqlCodeInfo* codeInfo =
reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
PacketAmdInfo* packet =
reinterpret_cast<PacketAmdInfo*>(packetInfo);
const amd_kernel_code_t* akc = hostAqlCode;
packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
packet->numberOfVgprs_ = akc->workitem_vgpr_count;
// use mapped kernel_object_address for host accessing of ISA buffer
packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
akc->kernel_code_entry_byte_offset;
packet->scratchBufferWaveOffset_ =
akc->debug_wavefront_private_segment_offset_sgpr;
packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
// The trap_reserved_vgpr_index will be 4 less the original
// This value must be used only by the debugger
packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
}
DebugEvent
GpuDebugManager::createDebugEvent(
const bool autoReset)
{
Unimplemented();
/*
// create the event object
osEventHandle shaderEvent = osEventCreate(!autoReset);
// event object has been created, set the initial state
if (shaderEvent != 0) {
osEventReset(shaderEvent); // initial state is non-signaled
if (device()->gslCtx()->exceptionNotification(shaderEvent)) {
return shaderEvent;
}
}
*/
return 0;
}
cl_int
GpuDebugManager::waitDebugEvent(
DebugEvent pEvent,
uint32_t timeOut) const
{
Unimplemented();
/*
if (osEventTimedWait(pEvent, timeOut)) {
return CL_SUCCESS;
}
else {
return CL_EVENT_TIMEOUT_AMD;
}
*/
return CL_SUCCESS;
}
void
GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
{
Unimplemented();
/*
osEventDestroy(*pEvent);
*pEvent = 0;
device()->gslCtx()->exceptionNotification(0);
*/
}
void
GpuDebugManager::wavefrontControl(
uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
void* waveAddr) const
{
Unimplemented();
//device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
}
void
GpuDebugManager::setAddressWatch(
uint32_t numWatchPoints,
void** watchAddress,
uint64_t* watchMask,
uint64_t* watchMode,
DebugEvent* event)
{
size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
// previously allocated size is not big enough, allocate new memory
if (addressWatchSize_ < requiredSize) {
if (nullptr != addressWatch_) { // free the smaller address watch storage
delete [] addressWatch_;
}
addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
addressWatchSize_ = requiredSize;
}
// fill in the address watch structure
memset(addressWatch_, 0, addressWatchSize_);
for (uint32_t i = 0; i < numWatchPoints; i++)
{
amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
Memory* watchMemAddress = device()->getGpuMemory(watchMem);
addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
addressWatch_[i].watchMask_ = watchMask[i];
addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
}
Unimplemented();
// setup the watch addresses
//device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
}
void
GpuDebugManager::setGlobalMemory(
amd::Memory* memObj,
uint32_t offset,
void* srcPtr,
uint32_t size)
{
Memory* globalMem = device()->getGpuMemory(memObj);
address mappedMem = static_cast<address>(globalMem->map(nullptr,0));
assert(mappedMem != 0);
void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
memcpy(dest_ptr, srcPtr, size);
globalMem->unmap(nullptr);
}
cl_int
GpuDebugManager::createRuntimeTrapHandler()
{
size_t codeSize = 0;
const uint32_t* rtTrapCode = nullptr;
if (device()->settings().viPlus_) {
codeSize = sizeof(RuntimeTrapCodeVi);
rtTrapCode = RuntimeTrapCodeVi;
}
else {
codeSize = sizeof(RuntimeTrapCode);
rtTrapCode = RuntimeTrapCode;
}
uint32_t numCodes = codeSize / sizeof(uint32_t);
// Handle TMA corruption hw bug workaround -
// The trap handler buffer has extra 256 bytes allocated, the TMA address
// is stored in the first two DWORDs and the actual trap handler code
// is stored starting at the location of 256 bytes (TbaStartOffset).
//
// allocate memory for the runtime trap handler (TBA) + TMA address
uint32_t allocSize = codeSize + TbaStartOffset;
Memory* rtTBA = new Memory(*device(), allocSize);
runtimeTBA_ = rtTBA;
if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) {
return CL_OUT_OF_RESOURCES;
}
address tbaAddress = reinterpret_cast<address>(rtTBA->map(nullptr));
// allocate buffer for the runtime trap handler buffer (TMA)
uint32_t tmaSize = 0x100;
Memory* rtTMA = new Memory(*device(), tmaSize);
runtimeTMA_ = rtTMA;
if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) {
return CL_OUT_OF_RESOURCES;
}
uint64_t rtTmaAddress = rtTMA->vmAddress();
if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
return CL_INVALID_VALUE;
}
// store the TMA address at the beginning of trap handler buffer
uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
tbaStorage[0] = rtTmaAddress;
// save the trap handler code
uint32_t* trapHandlerPtr = (uint32_t*)(tbaAddress + TbaStartOffset);
for (uint32_t i = 0; i < numCodes; i++) {
trapHandlerPtr[i] = rtTrapCode[i];
}
rtTBA->unmap(nullptr);
return CL_SUCCESS;
}
} // namespace pal
+117
查看文件
@@ -0,0 +1,117 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALDEBUGMANAGER_H__
#define PALDEBUGMANAGER_H__
#include "device/pal/palvirtual.hpp"
#include "device/pal/paldebugger.hpp"
namespace pal {
class GpuDebugManager;
class Device;
class Memory;
/*! \brief Debug Manager Class
*
* The debug manager class is used to pass all the trap info to the
* kernel dispatch and then the kernel execution can use such trap information
* for kernel execution. This class contains the trap handler and shader event
* objects. The trap handler is setup by users and passed to the kernel dispatch.
* The shader event is to receive interrupts from the GPU and then users can
* perform various operations.
*
* This class also provides the interface for setting up the pre-dispatch
* callback functions used by the profiler and debugger. It also provides
* a way to retrieve various debug information for the kernel execution.
*
*/
class GpuDebugManager : public amd::HwDebugManager {
public:
//! Constructor of the debug manager class
GpuDebugManager(amd::Device* device);
//! Destructor of the debug manager class
~GpuDebugManager();
//! Get the single instance of the GpuDebugManager class
static GpuDebugManager* getDefaultInstance();
//! Destroy the GpuDebugManager class object
static void destroyInstances();
//! Flush cache
void flushCache(uint32_t mask);
//! Create the debug event
DebugEvent createDebugEvent(const bool autoReset);
//! Wait for the debug event
cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
//! Destroy the debug event
void destroyDebugEvent(DebugEvent* pEvent);
//! Register the debugger
cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
//! Unregister the debugger
void unregisterDebugger();
//! Send the wavefront control cmmand
void wavefrontControl(uint32_t waveAction,
uint32_t waveMode,
uint32_t trapId,
void* waveAddr) const;
//! Set address watching point
void setAddressWatch(uint32_t numWatchPoints,
void** watchAddress,
uint64_t* watchMask,
uint64_t* watchMode,
DebugEvent* pEvent);
//! Map the kernel code for host access
void mapKernelCode(void* aqlCodeInfo) const;
//! Get the packet information for dispatch
void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
//! Set global memory values
void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
//! Execute the post-dispatch callback function
void executePostDispatchCallBack();
//! Execute the pre-dispatch callback function
void executePreDispatchCallBack(void* aqlPacket,
void* toolInfo);
protected:
const VirtualGPU* vGpu() const { return vGpu_; }
private:
//! Setup trap handler info for kernel execution
void setupTrapInformation(DebugToolInfo* toolInfo);
//! Create runtime trap handler
cl_int createRuntimeTrapHandler();
const pal::Device* device() const {
return reinterpret_cast<const pal::Device *>(device_); }
VirtualGPU* vGpu_; //!< the virtual GPU
uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
HwDbgAddressWatch* addressWatch_; //!< Address watch data
size_t addressWatchSize_; //!< Size of address watch data
//! Arguments used by the callback function
void* oclEventHandle_; //!< event handler
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
};
} // namespace pal
#endif // PALDEBUGMANAGER_H__
+584
查看文件
@@ -0,0 +1,584 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALDEFS_HPP_
#define PALDEFS_HPP_
#include "top.hpp"
#include "pal.h"
#include "palGpuMemory.h"
#include "palImage.h"
#include "palFormatInfo.h"
//
/// Memory Object Type
//
enum PalGpuMemoryType {
PAL_DEPTH_BUFFER = 0, ///< Depth Buffer
PAL_BUFFER, ///< Pure buffer
PAL_TEXTURE_3D, ///< 3D texture
PAL_TEXTURE_2D, ///< 2D texture
PAL_TEXTURE_1D, ///< 1D texture
PAL_TEXTURE_1D_ARRAY, ///< 1D Array texture
PAL_TEXTURE_2D_ARRAY, ///< 2D Array texture
PAL_TEXTURE_BUFFER, ///< "buffer" texture inside VBO
};
struct HwDbgKernelInfo
{
uint64_t scratchBufAddr; ///< Handle of GPU local memory for kernel private scratch space
size_t scratchBufferSizeInBytes; ///< size of memory pointed to by pScratchBuffer,
uint64_t heapBufAddr; ///< Address of the global heap base
const void* pAqlDispatchPacket; ///< Pointer to the dipatch packet
const void* pAqlQueuePtr; ///< pointer to the AQL Queue
void* trapHandler; ///< address of the trap handler (TBA)
void* trapHandlerBuffer; ///< address of the trap handler buffer (TMA)
uint32_t excpEn; ///< excecption mask
bool trapPresent; ///< trap present flag
bool sqDebugMode; ///< debug mode flag (GPU single step mode)
uint32_t mgmtSe0Mask; ///< mask for SE0 (reserving CU for display)
uint32_t mgmtSe1Mask; ///< mask for SE1 (reserving CU for display)
uint32_t cacheDisableMask; ///< cache disable mask
};
//! Engine types
enum EngineType
{
MainEngine = 0,
SdmaEngine,
AllEngines
};
struct GpuEvent
{
static const unsigned int InvalidID = ((1<<30) - 1);
EngineType engineId_; ///< type of the id
unsigned int id; ///< actual event id
//! GPU event default constructor
GpuEvent(): engineId_(MainEngine), id(InvalidID) {}
//! Returns true if the current event is valid
bool isValid() const { return (id != InvalidID) ? true : false; }
//! Set invalid event id
void invalidate() { id = InvalidID; }
};
/*! \addtogroup PAL
* @{
*/
//! PAL Device Implementation
namespace pal {
//! Maximum number of the supported global atomic counters
const static uint MaxAtomicCounters = 8;
//! Maximum number of the supported samplers
const static uint MaxSamplers = 16;
//! Maximum number of supported read images
const static uint MaxReadImage = 128;
//! Maximum number of supported write images
const static uint MaxWriteImage = 8;
//! Maximum number of supported read/write images for OCL20
const static uint MaxReadWriteImage = 64;
//! Maximum number of supported constant arguments
const static uint MaxConstArguments = 8;
//! Maximum number of supported kernel UAV arguments
const static uint MaxUavArguments = 1024;
//! Maximum number of pixels for a 1D image created from a buffer
const static size_t MaxImageBufferSize = 65536;
//! Maximum number of pixels for a 1D image created from a buffer
const static size_t MaxImageArraySize = 2048;
//! Maximum number of supported constant buffers
const static uint MaxConstBuffers = MaxConstArguments + 8;
//! Maximum number of constant buffers for arguments
const static uint MaxConstBuffersArguments = 2;
//! Alignment restriciton for the pinned memory
const static size_t PinnedMemoryAlignment = 4 * Ki;
//! HSA path specific defines for images
const static uint HsaImageObjectSize = 48;
const static uint HsaImageObjectAlignment = 16;
const static uint HsaSamplerObjectSize = 32;
const static uint HsaSamplerObjectAlignment = 16;
//! HSA path specific defines for images
const static uint DeviceQueueMaskSize = 32;
struct AMDDeviceInfo {
const char* targetName_; //!< Target name
const char* machineTarget_; //!< Machine target
uint simdPerCU_; //!< Number of SIMDs per CU
uint simdWidth_; //!< Number of workitems processed per SIMD
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
uint memChannelBankWidth_; //!< Memory channel bank width
uint localMemSizePerCU_; //!< Local memory size per CU
uint localMemBanks_; //!< Number of banks of local memory
uint gfxipVersion_; //!< The core engine GFXIP version
};
static const AMDDeviceInfo DeviceInfo[] = {
/* Unknown */ { "", "unknown", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Tahiti */ { "", "tahiti", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Pitcairn */ { "", "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Capeverde */ { "", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Oland */ { "", "oland", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Hainan */ { "", "hainan", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Bonaire */ { "Bonaire", "bonaire", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Hawaii */ { "Hawaii", "hawaii", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Kalindi */ { "Kalindi", "kalindi", 4, 16, 1, 256, 64 * Ki, 32, 702 },
/* Spectre */ { "Spectre", "spectre", 4, 16, 1, 256, 64 * Ki, 32, 701 },
/* Carrizo */ { "Carrizo" , "carrizo", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Stoney */ { "Stoney", "stoney", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Iceland */ { "Iceland", "iceland", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Tonga */ { "Tonga", "tonga", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Fiji */ { "Fiji", "fiji", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Ellesmere */ { "Horse", "horse", 4, 16, 1, 256, 64 * Ki, 32, 800 },
/* Buffin */ { "Goose", "goose", 4, 16, 1, 256, 64 * Ki, 32, 800 },
};
static const char* Gfx700 = "AMD:AMDGPU:7:0:0";
static const char* Gfx701 = "AMD:AMDGPU:7:0:1";
static const char* Gfx800 = "AMD:AMDGPU:8:0:0";
static const char* Gfx801 = "AMD:AMDGPU:8:0:1";
static const char* Gfx804 = "AMD:AMDGPU:8:0:4";
static const char* Gfx810 = "AMD:AMDGPU:8:1:0";
static const char* Gfx900 = "AMD:AMDGPU:9:0:0";
static const char* Gfx901 = "AMD:AMDGPU:9:0:1";
// Supported OpenCL versions
enum OclVersion {
OpenCL10,
OpenCL11,
OpenCL12,
OpenCL20
};
struct MemoryFormat {
cl_image_format clFormat_; //!< CL image format
Pal::Format palFormat_; //!< PAL image format
Pal::ChannelMapping palChannel_;//!< PAL channel mapping
};
static const MemoryFormat
MemoryFormatMap[] = {
// R
{ { CL_R, CL_UNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_UNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_SNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_SNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_SIGNED_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_SIGNED_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_SIGNED_INT32 },
{ Pal::ChFmt::R32, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_UNSIGNED_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_UNSIGNED_INT32 },
{ Pal::ChFmt::R32, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_HALF_FLOAT },
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_R, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
// A
{ { CL_A, CL_UNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_UNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_SNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_SNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_SIGNED_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_SIGNED_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_SIGNED_INT32},
{ Pal::ChFmt::R32, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_UNSIGNED_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_UNSIGNED_INT32},
{ Pal::ChFmt::R32 , Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_HALF_FLOAT },
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
{ { CL_A, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
// RG
{ { CL_RG, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_UNORM_INT16 },
{ Pal::ChFmt::R16G16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_SNORM_INT8 },
{ Pal::ChFmt::R8G8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_SNORM_INT16 },
{ Pal::ChFmt::R16G16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_SIGNED_INT8 },
{ Pal::ChFmt::R8G8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_SIGNED_INT16 },
{ Pal::ChFmt::R16G16, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_SIGNED_INT32},
{ Pal::ChFmt::R32G32, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::R8G8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_UNSIGNED_INT16 },
{ Pal::ChFmt::R16G16, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_UNSIGNED_INT32},
{ Pal::ChFmt::R32G32, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_HALF_FLOAT },
{ Pal::ChFmt::R16G16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
{ { CL_RG, CL_FLOAT },
{ Pal::ChFmt::R32G32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
/*
// RA
{ { CL_RA, CL_UNORM_INT8 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } },
{ { CL_RA, CL_UNORM_INT16 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } },
{ { CL_RA, CL_SNORM_INT8 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } },
{ { CL_RA, CL_SNORM_INT16 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } },
{ { CL_RA, CL_SIGNED_INT8 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } },
{ { CL_RA, CL_SIGNED_INT16 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } },
{ { CL_RA, CL_SIGNED_INT32},
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } },
{ { CL_RA, CL_UNSIGNED_INT8 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } },
{ { CL_RA, CL_UNSIGNED_INT16 },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } },
{ { CL_RA, CL_UNSIGNED_INT32},
{ GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } },
{ { CL_RA, CL_HALF_FLOAT },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } },
{ { CL_RA, CL_FLOAT },
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } },
*/
// RGB
{ { CL_RGB, CL_UNORM_INT_101010 },
{ Pal::ChFmt::R10G10B10A2, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_RGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
// RGBA
{ { CL_RGBA, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_UNORM_INT16 },
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_SNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_SNORM_INT16 },
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_SIGNED_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_SIGNED_INT16 },
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_SIGNED_INT32 },
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_UNSIGNED_INT16 },
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_UNSIGNED_INT32},
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_HALF_FLOAT },
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_RGBA, CL_FLOAT },
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
// ARGB
{ { CL_ARGB, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
{ { CL_ARGB, CL_SNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
{ { CL_ARGB, CL_SIGNED_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
{ { CL_ARGB, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
// BGRA
{ { CL_BGRA, CL_UNORM_INT8 },
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
{ { CL_BGRA, CL_SNORM_INT8 },
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
{ { CL_BGRA, CL_SIGNED_INT8 },
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Sint },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
{ { CL_BGRA, CL_UNSIGNED_INT8 },
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
// LUMINANCE
{ { CL_LUMINANCE, CL_SNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_LUMINANCE, CL_SNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_LUMINANCE, CL_UNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_LUMINANCE, CL_UNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_LUMINANCE, CL_HALF_FLOAT },
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
{ { CL_LUMINANCE, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
// INTENSITY
{ { CL_INTENSITY, CL_SNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_INTENSITY, CL_SNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_INTENSITY, CL_UNORM_INT8 },
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_INTENSITY, CL_UNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_INTENSITY, CL_HALF_FLOAT },
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_INTENSITY, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
// sRBGA
{ { CL_sRGBA, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
{ { CL_sRGBA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
// sRBG
{ { CL_sRGB, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
{ { CL_sRGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
// sRBGx
{ { CL_sRGBx, CL_UNORM_INT8 },
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
{ { CL_sRGBx, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
// sBGRA
{ { CL_sBGRA, CL_UNORM_INT8 },
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Srgb },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
{ { CL_sBGRA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
// DEPTH
{ { CL_DEPTH, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_DEPTH, CL_UNSIGNED_INT32 }, // This is used only by blit kernel
{ Pal::ChFmt::R32, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_DEPTH, CL_UNORM_INT16 },
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_DEPTH, CL_UNSIGNED_INT16 }, // This is used only by blit kernel
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_DEPTH_STENCIL, CL_UNORM_INT24 },
{ Pal::ChFmt::R32, Pal::NumFmt::Unorm },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
{ { CL_DEPTH_STENCIL, CL_FLOAT },
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }
};
} // namespace pal
#endif // PALDEFS_HPP_
檔案差異因為檔案過大而無法顯示 載入差異
+598
查看文件
@@ -0,0 +1,598 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALDEVICE_HPP_
#define PALDEVICE_HPP_
#include "top.hpp"
#include "device/device.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/perfctr.hpp"
#include "platform/threadtrace.hpp"
#include "platform/memory.hpp"
#include "utils/concurrent.hpp"
#include "thread/thread.hpp"
#include "thread/monitor.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palmemory.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/palsettings.hpp"
#include "device/pal/palappprofile.hpp"
#include "acl.h"
#include "memory"
/*! \addtogroup PAL
* @{
*/
//! PAL Device Implementation
namespace pal {
//! A nil device object
class NullDevice : public amd::Device
{
protected:
static aclCompiler* compiler_;
public:
aclCompiler* compiler() const { return compiler_; }
public:
static bool init(void);
//! Construct a new identifier
NullDevice();
//! Creates an offline device with the specified target
bool create(
Pal::GfxIpLevel ipLevel //!< GPU ip level
);
virtual cl_int createSubDevices(
device::CreateSubDevicesInfo& create_info,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) {
return CL_INVALID_VALUE;
}
//! Instantiate a new virtual device
virtual device::VirtualDevice* createVirtualDevice(
amd::CommandQueue* queue = NULL
) { return NULL; }
//! Compile the given source code.
virtual device::Program* createProgram(amd::option::Options* options = NULL);
//! Just returns NULL for the dummy device
virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }
//! Sampler object allocation
virtual bool createSampler(
const amd::Sampler& owner, //!< abstraction layer sampler object
device::Sampler** sampler //!< device sampler object
) const
{
ShouldNotReachHere();
return true;
}
//! Just returns NULL for the dummy device
virtual device::Memory* createView(
amd::Memory& owner, //!< Owner memory object
const device::Memory& parent //!< Parent device memory object for the view
) const { return NULL; }
//! Reallocates the provided buffer object
virtual bool reallocMemory(amd::Memory& owner) const { return true; }
//! Acquire external graphics API object in the host thread
//! Needed for OpenGL objects on CPU device
virtual bool bindExternalDevice(
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
virtual bool unbindExternalDevice(
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
//! Releases non-blocking map target memory
virtual void freeMapTarget(amd::Memory& mem, void* target) {}
Pal::GfxIpLevel ipLevel() const { return ipLevel_; }
const AMDDeviceInfo* hwInfo() const { return hwInfo_; }
//! Empty implementation on Null device
virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }
//! Get GPU device settings
const pal::Settings& settings() const
{ return reinterpret_cast<pal::Settings&>(*settings_); }
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; }
virtual void svmFree(void* ptr) const {return;}
protected:
Pal::GfxIpLevel ipLevel_; //!< Device IP level
const AMDDeviceInfo* hwInfo_; //!< Device HW info structure
//! Fills OpenCL device info structure
void fillDeviceInfo(
const Pal::DeviceProperties& palProp,//!< PAL device properties
const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount],
size_t maxTextureSize, //!< Maximum texture size supported in HW
uint numComputeRings //!< Number of compute rings
);
};
//! Forward declarations
class Command;
class Device;
class GpuCommand;
class Heap;
class HeapBlock;
class Program;
class Kernel;
class Memory;
class Resource;
class VirtualDevice;
class PrintfDbg;
class ThreadTrace;
#ifndef CL_FILTER_NONE
#define CL_FILTER_NONE 0x1142
#endif
class Sampler : public device::Sampler
{
public:
//! Constructor
Sampler(const Device& dev): dev_(dev) {}
//! Default destructor for the device memory object
virtual ~Sampler();
//! Creates a device sampler from the OCL sampler state
bool create(
uint32_t oclSamplerState //!< OCL sampler state
);
//! Creates a device sampler from the OCL sampler state
bool create(
const amd::Sampler& owner //!< AMD sampler object
);
const void* hwState() const { return hwState_; }
private:
//! Disable default copy constructor
Sampler& operator=(const Sampler&);
//! Disable operator=
Sampler(const Sampler&);
const Device& dev_; //!< Device object associated with the sampler
address hwState_; //!< GPU HW state (\todo legacy path)
};
//! A GPU device ordinal (physical GPU device)
class Device : public NullDevice
{
public:
//! Locks any access to the virtual GPUs
class ScopedLockVgpus : public amd::StackObject {
public:
//! Default constructor
ScopedLockVgpus(const Device& dev);
//! Destructor
~ScopedLockVgpus();
private:
const Device& dev_; //! Device object
};
//! Transfer buffers
class XferBuffers : public amd::HeapObject
{
public:
static const size_t MaxXferBufListSize = 8;
//! Default constructor
XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
: type_(type)
, bufSize_(bufSize)
, acquiredCnt_(0)
, gpuDevice_(device)
{}
//! Default destructor
~XferBuffers();
//! Creates the xfer buffers object
bool create();
//! Acquires an instance of the transfer buffers
Memory& acquire();
//! Releases transfer buffer
void release(
VirtualGPU& gpu, //!< Virual GPU object used with the buffer
Memory& buffer //!< Transfer buffer for release
);
//! Returns the buffer's size for transfer
size_t bufSize() const { return bufSize_; }
private:
//! Disable copy constructor
XferBuffers(const XferBuffers&);
//! Disable assignment operator
XferBuffers& operator=(const XferBuffers&);
//! Get device object
const Device& dev() const { return gpuDevice_; }
Resource::MemoryType type_; //!< The buffer's type
size_t bufSize_; //!< Staged buffer size
std::list<Memory*> freeBuffers_; //!< The list of free buffers
amd::Atomic<uint> acquiredCnt_; //!< The total number of acquired buffers
amd::Monitor lock_; //!< Stgaed buffer acquire/release lock
const Device& gpuDevice_; //!< GPU device object
};
//! Virtual address cache entry
struct VACacheEntry : public amd::HeapObject
{
void* startAddress_; //!< Start virtual address
void* endAddress_; //!< End virtual address
Memory* memory_; //!< GPU memory, associated with the range
//! Constructor
VACacheEntry(
void* startAddress, //!< Start virtual address
void* endAddress, //!< End virtual address
Memory* memory //!< GPU memory object
): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
private:
//! Disable default constructor
VACacheEntry();
};
struct ScratchBuffer : public amd::HeapObject
{
uint regNum_; //!< The number of used scratch registers
Memory* memObj_; //!< Memory objects for scratch buffers
uint offset_; //!< Offset from the global scratch store
uint size_; //!< Scratch buffer size on this queue
//! Default constructor
ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0) {}
//! Default constructor
~ScratchBuffer();
//! Destroys memory objects
void destroyMemory();
};
class SrdManager : public amd::HeapObject {
public:
SrdManager(const Device& dev, uint srdSize, uint bufSize)
: dev_(dev)
, numFlags_(bufSize / (srdSize * MaskBits))
, srdSize_(srdSize)
, bufSize_(bufSize) {}
~SrdManager();
//! Allocates a new SRD slot for a resource
uint64_t allocSrdSlot(address* cpuAddr);
//! Frees a SRD slot
void freeSrdSlot(uint64_t addr);
// Fills the memory list for VidMM KMD
void fillResourceList(std::vector<const Memory*>& memList);
private:
//! Disable copy constructor
SrdManager(const SrdManager&);
//! Disable assignment operator
SrdManager& operator=(const SrdManager&);
struct Chunk {
Memory* buf_;
uint* flags_;
Chunk(): buf_(NULL), flags_(NULL) {}
};
static const uint MaskBits = 32;
const Device& dev_; //!< GPU device for the chunk manager
amd::Monitor ml_; //!< Global lock for the SRD manager
std::vector<Chunk> pool_; //!< Pool of SRD buffers
uint numFlags_; //!< Total number of flags in array
uint srdSize_; //!< SRD size
uint bufSize_; //!< Buffer size that holds SRDs
};
//! Initialise the whole GPU device subsystem
static bool init();
//! Shutdown the whole GPU device subsystem
static void tearDown();
//! Construct a new physical GPU device
Device();
//! Initialise a device (i.e. all parts of the constructor that could
//! potentially fail)
bool create(
Pal::IDevice* device //!< PAL device interface object
);
//! Destructor for the physical GPU device
virtual ~Device();
//! Instantiate a new virtual device
device::VirtualDevice* createVirtualDevice(
amd::CommandQueue* queue = NULL
);
//! Memory allocation
virtual device::Memory* createMemory(
amd::Memory& owner //!< abstraction layer memory object
) const;
//! Sampler object allocation
virtual bool createSampler(
const amd::Sampler& owner, //!< abstraction layer sampler object
device::Sampler** sampler //!< device sampler object
) const;
//! Reallocates the provided buffer object
virtual bool reallocMemory(
amd::Memory& owner //!< Buffer for reallocation
) const;
//! Allocates a view object from the device memory
virtual device::Memory* createView(
amd::Memory& owner, //!< Owner memory object
const device::Memory& parent //!< Parent device memory object for the view
) const;
//! Create the device program.
virtual device::Program* createProgram(amd::option::Options* options = NULL);
//! Attempt to bind with external graphics API's device/context
virtual bool bindExternalDevice(
intptr_t type,
void* pDevice,
void* pContext,
bool validateOnly);
//! Attempt to unbind with external graphics API's device/context
virtual bool unbindExternalDevice(
intptr_t type,
void* pDevice,
void* pContext,
bool validateOnly);
//! Validates kernel before execution
virtual bool validateKernel(
const amd::Kernel& kernel, //!< AMD kernel object
const device::VirtualDevice* vdev
);
//! Retrieves information about free memory on a GPU device
virtual bool globalFreeMemory(size_t* freeMemory) const;
//! Returns a GPU memory object from AMD memory object
pal::Memory* getGpuMemory(
amd::Memory* mem //!< Pointer to AMD memory object
) const;
amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }
//! Returns the lock object for the virtual gpus list
amd::Monitor* vgpusAccess() const { return vgpusAccess_; }
//! Returns the monitor object for PAL
amd::Monitor& lockPAL() const { return *lockPAL_; }
//! Returns the number of virtual GPUs allocated on this device
uint numOfVgpus() const { return numOfVgpus_; }
uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
typedef std::vector<VirtualGPU*> VirtualGPUs;
//! Returns the list of all virtual GPUs running on this device
const VirtualGPUs vgpus() const { return vgpus_; }
VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected)
//! Scratch buffer allocation
pal::Memory* createScratchBuffer(
size_t size //!< Size of buffer
) const;
//! Returns transfer buffer object
XferBuffers& xferWrite() const { return *xferWrite_; }
//! Returns transfer buffer object
XferBuffers& xferRead() const { return *xferRead_; }
//! Adds GPU memory to the VA cache list
void addVACache(Memory* memory) const;
//! Removes GPU memory from the VA cache list
void removeVACache(const Memory* memory) const;
//! Finds GPU memory from virtual address
Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
//! Finds an appropriate map target
amd::Memory* findMapTarget(size_t size) const;
//! Adds a map target to the cache
bool addMapTarget(amd::Memory* memory) const;
//! Returns resource cache object
ResourceCache& resourceCache() const { return *resourceCache_; }
//! Returns the number of available compute rings
uint numComputeEngines() const { return numComputeEngines_; }
//! Returns the number of available DMA engines
uint numDMAEngines() const { return numDmaEngines_; }
//! Returns engines object
const device::BlitManager& xferMgr() const;
VirtualGPU* xferQueue() const { return xferQueue_; }
//! Retrieves the internal format from the OCL format
Pal::Format getPalFormat(
const amd::Image::Format& format, //! OCL image format
Pal::ChannelMapping* channel
) const;
const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
//! Returns the global scratch buffer
Memory* globalScratchBuf() const { return globalScratchBuf_; };
//! Destroys scratch buffer memory
void destroyScratchBuffers();
//! Initialize heap resources if uninitialized
bool initializeHeapResources();
//! Set GSL sampler to the specified state
void fillHwSampler(
uint32_t state, //!< Sampler's OpenCL state
void* hwState, //!< Sampler's HW state
uint32_t hwStateSize, //!< Size of sampler's HW state
uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter
float minLod = 0.f, //!< Min level of detail
float maxLod = CL_MAXFLOAT //!< Max level of detail
) const;
//! host memory alloc
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
//! SVM allocation
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
cl_svm_mem_flags flags, void* svmPtr) const;
//! Free host SVM memory
void hostFree(void* ptr, size_t size) const;
//! SVM free
virtual void svmFree(void* ptr) const;
//! Returns SRD manger object
SrdManager& srds() const { return *srdManager_; }
//! Initial the Hardware Debug Manager
cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
//! Returns PAL device properties
const Pal::DeviceProperties& properties() const { return properties_; }
//! Returns PAL device interface
Pal::IDevice* iDev() const { return device_; }
//! Return private device context for internal allocations
amd::Context& context() const { return *context_; }
//! Update free memory for OCL extension
void updateFreeMemory(
Pal::GpuHeap heap, //!< PAL GPU heap for update
Pal::gpusize size, //!< Size of alocated/destroyed memory
bool free //!< TRUE if runtime frees memory
);
//! Interop for GL device
bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
bool resGLAssociate(void* GLContext, uint name, uint type,
void** handle, void** mbResHandle, size_t* offset) const;
bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const;
bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const;
bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;
private:
//! Disable copy constructor
Device(const Device&);
//! Disable assignment
Device& operator=(const Device&);
//! Sends the stall command to all queues
bool stallQueues();
//! Buffer allocation
pal::Memory* createBuffer(
amd::Memory& owner, //!< Abstraction layer memory object
bool directAccess //!< Use direct host memory access
) const;
//! Image allocation
pal::Memory* createImage(
amd::Memory& owner, //!< Abstraction layer memory object
bool directAccess //!< Use direct host memory access
) const;
//! Allocates/reallocates the scratch buffer, according to the usage
bool allocScratch(
uint regNum, //!< Number of the scratch registers
const VirtualGPU* vgpu //!< Virtual GPU for the allocation
);
//! Interop for D3D devices
bool associateD3D11Device(
void* d3d11Device //!< void* is of type ID3D11Device*
);
bool associateD3D10Device(
void* d3d10Device //!< void* is of type ID3D10Device*
);
bool associateD3D9Device(
void* d3d9Device //!< void* is of type IDirect3DDevice9*
);
//! Interop for GL device
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
amd::Context* context_; //!< A dummy context for internal allocations
amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device
amd::Monitor* lockForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation
amd::Monitor* lockPAL_; //!< Lock to serialise PAL access
amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access
amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation
amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
XferBuffers* xferRead_; //!< Transfer buffers read
XferBuffers* xferWrite_; //!< Transfer buffers write
amd::Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access
std::list<VACacheEntry*>* vaCacheList_; //!< VA cache list
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
ResourceCache* resourceCache_; //!< Resource cache
uint numComputeEngines_; //!< The number of available compute engines
uint numDmaEngines_; //!< The number of available compute engines
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
VirtualGPU* xferQueue_; //!< Transfer queue
std::vector<ScratchBuffer*> scratch_; //!< Scratch buffers for kernels
Memory* globalScratchBuf_; //!< Global scratch buffer
SrdManager* srdManager_; //!< SRD manager object
static AppProfile appProfile_; //!< application profile
mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem
Pal::DeviceProperties properties_; //!< PAL device properties
Pal::IDevice* device_; //!< PAL device object
std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
};
/*@}*/} // namespace pal
#endif /*PALDEVICE_HPP_*/
+143
查看文件
@@ -0,0 +1,143 @@
#include "paldevice.hpp"
#if defined(ATI_OS_LINUX)
namespace pal {
bool
Device::associateD3D10Device(void* d3d10Device)
{
return false;
}
} // pal
#else // !ATI_OS_WIN
#include <D3D10_1.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
static bool
queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask)
{
HMODULE hDLL = nullptr;
IAmdDxExt* pExt = nullptr;
IAmdDxExtCLInterop* pCLExt = nullptr;
PFNAmdDxExtCreate AmdDxExtCreate;
HRESULT hr = S_OK;
// Get a handle to the DXX DLL with extension API support
#if defined _WIN64
static const CHAR dxxModuleName[13] = "atidxx64.dll";
#else
static const CHAR dxxModuleName[13] = "atidxx32.dll";
#endif
hDLL = GetModuleHandle(dxxModuleName);
if (hDLL == nullptr) {
hr = E_FAIL;
}
// Get the exported AmdDxExtCreate() function pointer
if (SUCCEEDED(hr)) {
AmdDxExtCreate = reinterpret_cast<PFNAmdDxExtCreate>(
GetProcAddress(hDLL, "AmdDxExtCreate"));
if (AmdDxExtCreate == nullptr) {
hr = E_FAIL;
}
}
// Create the extension object
if (SUCCEEDED(hr)) {
hr = AmdDxExtCreate(pd3d10Device, &pExt);
}
// Get the extension version information
if (SUCCEEDED(hr)) {
AmdDxExtVersion extVersion;
hr = pExt->GetVersion(&extVersion);
if (extVersion.majorVersion == 0)
{
hr = E_FAIL;
}
}
// Get the OpenCL Interop interface
if (SUCCEEDED(hr)) {
pCLExt = static_cast<IAmdDxExtCLInterop*>(
pExt->GetExtInterface(AmdDxExtCLInteropID));
if (pCLExt != nullptr) {
// Get the GPU mask using the CL Interop extension.
pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask);
}
else {
hr = E_FAIL;
}
}
if (pCLExt != nullptr) {
pCLExt->Release();
}
if (pExt != nullptr) {
pExt->Release();
}
return (SUCCEEDED(hr));
}
bool
Device::associateD3D10Device(void* d3d10Device)
{
ID3D10Device* pd3d10Device = static_cast<ID3D10Device*>(d3d10Device);
IDXGIDevice* pDXGIDevice;
pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
IDXGIAdapter* pDXGIAdapter;
pDXGIDevice->GetAdapter(&pDXGIAdapter);
DXGI_ADAPTER_DESC adapterDesc;
pDXGIAdapter->GetDesc(&adapterDesc);
// match the adapter
bool canInteroperate =
(properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
(properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
UINT chainBitMask = 1 << properties().gpuIndex;
// match the chain ID
if (canInteroperate) {
UINT d3d10DeviceGPUMask = 0;
if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) {
canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0;
}
else {
// special handling for Intel iGPU + AMD dGPU in LDA mode
// (only occurs on a PX platform) where
// the D3D10Device object is created on the Intel iGPU and
// passed to AMD dGPU (secondary) to interoperate.
if (chainBitMask > 1) {
canInteroperate = false;
}
}
}
pDXGIDevice->Release();
pDXGIAdapter->Release();
return canInteroperate;
}
} // pal
#endif // !ATI_OS_WIN
+142
查看文件
@@ -0,0 +1,142 @@
#include "paldevice.hpp"
#if defined(ATI_OS_LINUX)
namespace pal {
bool
Device::associateD3D11Device(void* d3d11Device)
{
return false;
}
}
#else // !ATI_OS_LINUX
#include <D3D11.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
static bool
queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask)
{
HMODULE hDLL = nullptr;
IAmdDxExt* pExt = nullptr;
IAmdDxExtCLInterop* pCLExt = nullptr;
PFNAmdDxExtCreate11 AmdDxExtCreate11;
HRESULT hr = S_OK;
// Get a handle to the DXX DLL with extension API support
#if defined _WIN64
static const CHAR dxxModuleName[13] = "atidxx64.dll";
#else
static const CHAR dxxModuleName[13] = "atidxx32.dll";
#endif
hDLL = GetModuleHandle(dxxModuleName);
if (hDLL == nullptr) {
hr = E_FAIL;
}
// Get the exported AmdDxExtCreate() function pointer
if (SUCCEEDED(hr)) {
AmdDxExtCreate11 = reinterpret_cast<PFNAmdDxExtCreate11>(
GetProcAddress(hDLL, "AmdDxExtCreate11"));
if (AmdDxExtCreate11 == nullptr) {
hr = E_FAIL;
}
}
// Create the extension object
if (SUCCEEDED(hr)) {
hr = AmdDxExtCreate11(pd3d11Device, &pExt);
}
// Get the extension version information
if (SUCCEEDED(hr)) {
AmdDxExtVersion extVersion;
hr = pExt->GetVersion(&extVersion);
if (extVersion.majorVersion == 0) {
hr = E_FAIL;
}
}
// Get the OpenCL Interop interface
if (SUCCEEDED(hr)) {
pCLExt = static_cast<IAmdDxExtCLInterop*>(
pExt->GetExtInterface(AmdDxExtCLInteropID));
if (pCLExt != nullptr) {
// Get the GPU mask using the CL Interop extension.
pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask);
}
else {
hr = E_FAIL;
}
}
if (pCLExt != nullptr) {
pCLExt->Release();
}
if (pExt != nullptr) {
pExt->Release();
}
return (SUCCEEDED(hr));
}
bool
Device::associateD3D11Device(void* d3d11Device)
{
ID3D11Device* pd3d11Device = static_cast<ID3D11Device*>(d3d11Device);
IDXGIDevice* pDXGIDevice;
pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
IDXGIAdapter* pDXGIAdapter;
pDXGIDevice->GetAdapter(&pDXGIAdapter);
DXGI_ADAPTER_DESC adapterDesc;
pDXGIAdapter->GetDesc(&adapterDesc);
// match the adapter
bool canInteroperate =
(properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
(properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
UINT chainBitMask = 1 << properties().gpuIndex;
// match the chain ID
if (canInteroperate) {
UINT d3d11DeviceGPUMask = 0;
if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) {
canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0;
}
else {
// special handling for Intel iGPU + AMD dGPU in LDA mode
// (only occurs on a PX platform) where
// the D3D11Device object is created on the Intel iGPU and
// passed to AMD dGPU (secondary) to interoperate.
if (chainBitMask > 1) {
canInteroperate = false;
}
}
}
pDXGIDevice->Release();
pDXGIAdapter->Release();
return canInteroperate;
}
} // pal
#endif // !ATI_OS_LINUX
+53
查看文件
@@ -0,0 +1,53 @@
#include "paldevice.hpp"
#if defined(ATI_OS_LINUX)
namespace pal {
bool
Device::associateD3D9Device(void* d3dDevice)
{
return false;
}
}
#else // !ATI_OS_LINUX
#include <d3d9.h>
#include <dxgi.h>
/**************************************************************************************************************
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
**************************************************************************************************************/
#include "DxxOpenCLInteropExt.h"
namespace pal {
bool
Device::associateD3D9Device(void* d3d9Device)
{
D3DCAPS9 pCaps;
IDirect3D9* p3d9dev;
LUID d3d9deviceLuid = {0, 0};
IDirect3DDevice9* pd3d9Device = static_cast<IDirect3DDevice9*>(d3d9Device);
// Get D3D9 Device caps
pd3d9Device->GetDeviceCaps(&pCaps);
// Get 3D9 Device
pd3d9Device->GetDirect3D(&p3d9dev);
IDirect3D9Ex* p3d9devEx = static_cast<IDirect3D9Ex*>(p3d9dev);
p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid);
p3d9dev->Release();
// match the adapter
bool canInteroperate =
(properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
return canInteroperate;
}
} // pal
#endif // !ATI_OS_WIN
+306
查看文件
@@ -0,0 +1,306 @@
#include "platform/context.hpp"
#include "device/device.hpp"
#include "platform/runtime.hpp"
#include "platform/agent.hpp"
#ifdef _WIN32
#include <d3d10_1.h>
#include "CL/cl_d3d10.h"
#include "CL/cl_d3d11.h"
#endif // _WIN32
#include <GL/gl.h>
#include <GL/glext.h>
#include "CL/cl_gl.h"
#include "paldevice.hpp"
//#include "cwddeci.h"
#include <GL/gl.h>
#include "GL/glATIInternal.h"
#ifdef ATI_OS_LINUX
#include <stdlib.h>
#include <dlfcn.h>
#include "GL/glx.h"
#include "GL/glxext.h"
#include "GL/glXATIPrivate.h"
#else
#include "GL/wglATIPrivate.h"
#endif
#ifdef ATI_OS_LINUX
typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName);
static PFNGlxGetProcAddress pfnGlxGetProcAddress=NULL;
static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = NULL;
static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = NULL;
static PFNGLXRESOURCEATTACHAMD glXResourceAttachAMD = NULL;
static PFNGLXRESOURCEDETACHAMD glxResourceAcquireAMD = NULL;
static PFNGLXRESOURCEDETACHAMD glxResourceReleaseAMD = NULL;
static PFNGLXRESOURCEDETACHAMD glXResourceDetachAMD = NULL;
static PFNGLXGETCONTEXTMVPUINFOAMD glXGetContextMVPUInfoAMD = NULL;
#else
static PFNWGLBEGINCLINTEROPAMD wglBeginCLInteropAMD = NULL;
static PFNWGLENDCLINTEROPAMD wglEndCLInteropAMD = NULL;
static PFNWGLRESOURCEATTACHAMD wglResourceAttachAMD = NULL;
static PFNWGLRESOURCEDETACHAMD wglResourceAcquireAMD = NULL;
static PFNWGLRESOURCEDETACHAMD wglResourceReleaseAMD = NULL;
static PFNWGLRESOURCEDETACHAMD wglResourceDetachAMD = NULL;
static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = NULL;
#endif
namespace pal {
bool
Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const
{
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext)GLplatformContext;
void * pModule = dlopen("libGL.so.1",RTLD_NOW);
if(NULL == pModule) {
return false;
}
pfnGlxGetProcAddress = (PFNGlxGetProcAddress) dlsym(pModule,"glXGetProcAddress");
if (NULL == pfnGlxGetProcAddress) {
return false;
}
if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
!glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) {
glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD");
glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD");
glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD");
glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD");
glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD");
glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceDetachAMD");
glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD");
}
if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
!glXResourceDetachAMD
#ifndef BRAHMA
|| !glXGetContextMVPUInfoAMD
#endif
) {
return false;
}
#else
if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
!wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
HGLRC fakeRC = NULL;
if (!wglGetCurrentContext()) {
fakeRC = wglCreateContext((HDC)GLdeviceContext);
wglMakeCurrent((HDC)GLdeviceContext, fakeRC);
}
wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD");
wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD");
wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD");
wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD");
wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD");
wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD");
wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD");
if (fakeRC) {
wglMakeCurrent(NULL, NULL);
wglDeleteContext(fakeRC);
}
}
if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
!wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
return false;
}
#endif
return true;
}
bool
Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
{
bool canInteroperate = false;
#ifdef ATI_OS_WIN
LUID glAdapterLuid = {0, 0};
UINT glChainBitMask = 0;
HGLRC hRC = (HGLRC)GLplatformContext;
//get GL context's LUID and chainBitMask from UGL
if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
// match the adapter
canInteroperate =
(properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
((1 << properties().gpuIndex) == glChainBitMask);
}
#else
#ifdef BRAHMA
canInteroperate = true;
#else
GLuint glDeviceId = 0 ;
GLuint glChainMask = 0 ;
GLXContext ctx = (GLXContext)GLplatformContext;
if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
// we allow intoperability only with GL context reside on a single GPU
canInteroperate =
(properties().deviceId == glDeviceId) &&
((1 << properties().gpuIndex) == glChainBitMask);
}
}
#endif
#endif
return canInteroperate;
}
bool
Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const
{
//initialize pointers to the gl extension that supports interoperability
if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) ||
!glCanInterop(GLplatformContext, GLdeviceContext)) {
return false;
}
int flags = 0;
/*
if (m_adp->pAsicInfo->svmFineGrainSystem)
{
flags = GL_INTEROP_SVM;
}
*/
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext)GLplatformContext;
return (glXBeginCLInteropAMD(ctx, 0)) ? true : false;
#else
HGLRC hRC = (HGLRC)GLplatformContext;
return (wglBeginCLInteropAMD(hRC, flags)) ? true : false;
#endif
}
bool
Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const
{
int flags = 0;
/*
if (m_adp->pAsicInfo->svmFineGrainSystem)
{
flags = GL_INTEROP_SVM;
}
*/
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext)GLplatformContext;
return (glXEndCLInteropAMD(ctx, 0)) ? true : false;
#else
HGLRC hRC = (HGLRC)GLplatformContext;
return (wglEndCLInteropAMD(hRC, flags)) ? true : false;
#endif
}
bool
Device::resGLAssociate(
void* GLContext,
uint name,
uint type,
void** handle,
void** mbResHandle,
size_t* offset) const
{
amd::ScopedLock lk(lockPAL());
GLResource hRes = {};
GLResourceData hData = {};
bool status = false;
hRes.type = type;
hRes.name = name;
hData.version = GL_RESOURCE_DATA_VERSION;
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext)GLContext;
if (glXResourceAttachAMD(ctx, &hRes, &hData)) {
attribs.dynamicSharedBufferID = hData->sharedBufferID;
status = true;
}
#else
HGLRC hRC = (HGLRC)GLContext;
if (wglResourceAttachAMD(hRC, &hRes, &hData)) {
status = true;
}
#endif
if (!status) {
return false;
}
*handle = reinterpret_cast<void*>(hData.handle);
*mbResHandle = reinterpret_cast<void*>(hData.mbResHandle);
*offset = static_cast<size_t>(hData.offset);
return status;
}
bool
Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const
{
amd::ScopedLock lk(lockPAL());
GLResource hRes = {};
hRes.mbResHandle = (GLuintp)mbResHandle;
hRes.type = type;
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext) GLplatformContext;
return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false;
#else
HGLRC hRC = wglGetCurrentContext();
//! @todo A temporary workaround for MT issue in conformance fence_sync
if (0 == hRC) {
return true;
}
return (wglResourceAcquireAMD(hRC, &hRes)) ? true : false;
#endif
}
bool
Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const
{
amd::ScopedLock lk(lockPAL());
GLResource hRes = {};
hRes.mbResHandle = (GLuintp)mbResHandle;
hRes.type = type;
#ifdef ATI_OS_LINUX
//TODO : make sure the application GL context is current. if not no
// point calling into the GL RT.
GLXContext ctx = (GLXContext) GLplatformContext;
return (glxResourceReleaseAMD(ctx, &hRes)) ? true : false;
#else
// Make the call into the GL driver only if the application GL context is current
HGLRC hRC = wglGetCurrentContext();
//! @todo A temporary workaround for MT issue in conformance fence_sync
if (0 == hRC) {
return true;
}
return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false;
#endif
}
bool
Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const
{
amd::ScopedLock lk(lockPAL());
GLResource hRes = {};
hRes.mbResHandle = (GLuintp)mbResHandle;
hRes.type = type;
#ifdef ATI_OS_LINUX
GLXContext ctx = (GLXContext)GLplatformContext;
return (glXResourceDetachAMD(ctx, &hRes)) ? true : false;
#else
HGLRC hRC = (HGLRC)GLplatformContext;
return (wglResourceDetachAMD(hRC, &hRes)) ? true : false;
#endif
}
} // pal
檔案差異因為檔案過大而無法顯示 載入差異
+263
查看文件
@@ -0,0 +1,263 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef GPUKERNEL_HPP_
#define GPUKERNEL_HPP_
#include "device/device.hpp"
#include "utils/macros.hpp"
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "platform/kernel.hpp"
#include "platform/sampler.hpp"
#include "device/pal/paldevice.hpp"
#include "device/pal/palvirtual.hpp"
#include "amd_hsa_kernel_code.h"
#include "device/pal/palprintf.hpp"
#include "device/pal/palwavelimiter.hpp"
#include "hsa.h"
namespace amd {
namespace hsa {
namespace loader {
class Symbol;
} // loader
} // hsa
} // amd
//! \namespace pal PAL Device Implementation
namespace pal {
class VirtualGPU;
class Device;
class NullDevice;
class HSAILProgram;
struct HWSHADER_Helper
{
template <typename S, typename T>
static T Get(S base, T offset) {
return reinterpret_cast<T>(reinterpret_cast<intptr_t>(base)
+ reinterpret_cast<size_t>(offset));
}
};
#define HWSHADER_Get(shader, field) \
HWSHADER_Helper::Get((shader), (shader)->field)
template <typename D, typename S>
static void CalcPtr(D& dst, const S src, size_t structSize, size_t size) {
dst = reinterpret_cast<D>(reinterpret_cast<const intptr_t>(src)
+ structSize * size);
}
/*! \addtogroup pal PAL Device Implementation
* @{
*/
enum HSAIL_ADDRESS_QUALIFIER{
HSAIL_ADDRESS_ERROR = 0,
HSAIL_ADDRESS_GLOBAL,
HSAIL_ADDRESS_LOCAL,
HSAIL_MAX_ADDRESS_QUALIFIERS
} ;
enum HSAIL_ARG_TYPE{
HSAIL_ARGTYPE_ERROR = 0,
HSAIL_ARGTYPE_POINTER,
HSAIL_ARGTYPE_VALUE,
HSAIL_ARGTYPE_IMAGE,
HSAIL_ARGTYPE_SAMPLER,
HSAIL_ARGTYPE_QUEUE,
HSAIL_ARGMAX_ARG_TYPES
};
enum HSAIL_DATA_TYPE{
HSAIL_DATATYPE_ERROR = 0,
HSAIL_DATATYPE_B1,
HSAIL_DATATYPE_B8,
HSAIL_DATATYPE_B16,
HSAIL_DATATYPE_B32,
HSAIL_DATATYPE_B64,
HSAIL_DATATYPE_S8,
HSAIL_DATATYPE_S16,
HSAIL_DATATYPE_S32,
HSAIL_DATATYPE_S64,
HSAIL_DATATYPE_U8,
HSAIL_DATATYPE_U16,
HSAIL_DATATYPE_U32,
HSAIL_DATATYPE_U64,
HSAIL_DATATYPE_F16,
HSAIL_DATATYPE_F32,
HSAIL_DATATYPE_F64,
HSAIL_DATATYPE_STRUCT,
HSAIL_DATATYPE_OPAQUE,
HSAIL_DATATYPE_MAX_TYPES
};
enum HSAIL_ACCESS_TYPE {
HSAIL_ACCESS_TYPE_NONE = 0,
HSAIL_ACCESS_TYPE_RO,
HSAIL_ACCESS_TYPE_WO,
HSAIL_ACCESS_TYPE_RW
};
class HSAILKernel : public device::Kernel
{
public:
struct Argument
{
std::string name_; //!< Argument's name
std::string typeName_; //!< Argument's type name
uint size_; //!< Size in bytes
uint offset_; //!< Argument's offset
uint alignment_; //!< Argument's alignment
HSAIL_ARG_TYPE type_; //!< Type of the argument
HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
HSAIL_DATA_TYPE dataType_; //!< The type of data
uint numElem_; //!< Number of elements
HSAIL_ACCESS_TYPE access_; //!< Access type for the argument
};
// Max number of possible extra (hidden) kernel arguments
static const uint MaxExtraArgumentsNum = 6;
HSAILKernel(std::string name,
HSAILProgram* prog,
std::string compileOptions,
uint extraArgsNum);
virtual ~HSAILKernel();
//! Initializes the metadata required for this kernel,
//! finalizes the kernel if needed
bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);
//! Returns true if memory is valid for execution
virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
//! Returns a pointer to the hsail argument
const Argument* argument(size_t i) const { return arguments_[i]; }
//! Returns the number of hsail arguments
size_t numArguments() const { return arguments_.size(); }
//! Returns GPU device object, associated with this kernel
const Device& dev() const;
//! Returns HSA program associated with this kernel
const HSAILProgram& prog() const;
//! Returns LDS size used in this kernel
uint32_t ldsSize() const
{ return cpuAqlCode_->workgroup_group_segment_byte_size; }
//! Returns pointer on CPU to AQL code info
const void* cpuAqlCode() const { return cpuAqlCode_; }
//! Returns memory object with AQL code
pal::Memory* gpuAqlCode() const { return code_; }
//! Returns size of AQL code
size_t aqlCodeSize() const { return codeSize_; }
//! Returns the size of argument buffer
size_t argsBufferSize() const
{ return cpuAqlCode_->kernarg_segment_byte_size; }
//! Returns spill reg size per workitem
int spillSegSize() const
{ return cpuAqlCode_->workitem_private_segment_byte_size; }
//! Returns TRUE if kernel uses dynamic parallelism
bool dynamicParallelism() const
{ return (flags_.dynamicParallelism_) ? true : false; }
//! Returns TRUE if kernel is internal kernel
bool isInternalKernel() const
{ return (flags_.internalKernel_) ? true : false; }
//! Finds local workgroup size
void findLocalWorkSize(
size_t workDim, //!< Work dimension
const amd::NDRange& gblWorkSize,//!< Global work size
amd::NDRange& lclWorkSize //!< Local work size
) const;
//! Returns AQL packet in CPU memory
//! if the kerenl arguments were successfully loaded, otherwise NULL
hsa_kernel_dispatch_packet_t* loadArguments(
VirtualGPU& gpu, //!< Running GPU context
const amd::Kernel& kernel, //!< AMD kernel object
const amd::NDRangeContainer& sizes, //!< NDrange container
const_address parameters, //!< Application arguments for the kernel
bool nativeMem, //!< Native memory objectes are passed
uint64_t vmDefQueue, //!< GPU VM default queue pointer
uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object
std::vector<const Memory*>& memList //!< Memory list for GSL/VidMM handles
) const;
//! Returns pritnf info array
const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
//! Returns the kernel index in the program
uint index() const { return index_; }
//! Returns kernel's extra argument count
uint extraArgumentsNum() const { return extraArgumentsNum_; }
private:
//! Disable copy constructor
HSAILKernel(const HSAILKernel&);
//! Disable operator=
HSAILKernel& operator=(const HSAILKernel&);
//! Creates AQL kernel HW info
bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);
//! Initializes arguments_ and the abstraction layer kernel parameters
void initArgList(
const aclArgData* aclArg //!< List of ACL arguments
);
//! Initializes Hsail Argument metadata and info
void initHsailArgs(
const aclArgData* aclArg //!< List of ACL arguments
);
//! Initializes Hsail Printf metadata and info
void initPrintf(
const aclPrintfFmt* aclPrintf //!< List of ACL printfs
);
std::vector<Argument*> arguments_; //!< Vector list of HSAIL Arguments
std::string compileOptions_; //!< compile used for finalizing this kernel
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
const NullDevice& dev_; //!< GPU device object
const HSAILProgram& prog_; //!< Reference to the parent program
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
uint index_; //!< Kernel index in the program
pal::Memory* code_; //!< Memory object with ISA code
size_t codeSize_; //!< Size of ISA code
char* hwMetaData_; //!< SI metadata
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
union Flags {
struct {
uint imageEna_: 1; //!< Kernel uses images
uint imageWriteEna_: 1; //!< Kernel uses image writes
uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled
uint internalKernel_: 1; //!< True: internal kernel
};
uint value_;
Flags(): value_(0) {}
} flags_;
};
/*@}*/} // namespace pal
#endif /*PALKERNEL_HPP_*/
檔案差異因為檔案過大而無法顯示 載入差異
+275
查看文件
@@ -0,0 +1,275 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALMEMORY_HPP_
#define PALMEMORY_HPP_
#include "top.hpp"
#include "thread/atomic.hpp"
#include "device/pal/palresource.hpp"
#include <map>
/*! \addtogroup GPU
* @{
*/
namespace device {
class Memory;
}
//! PAL Device Implementation
namespace pal {
class Device;
class Heap;
class Resource;
class Memory;
class VirtualGPU;
//! GPU memory object.
// Wrapper that can contain a heap block or an interop buffer/image.
class Memory: public device::Memory, public Resource
{
public:
enum InteropType {
InteropNone = 0, //!< None interop memory
InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy
InteropDirectAccess = 2 //!< Uses direct access to the interop surface
};
//! Constructor (with owner)
Memory(
const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t size //!< Memory size for allocation
);
//! Constructor (nonfat version for local scratch mem use without heap block)
Memory(
const Device& gpuDev, //!< GPU device object
size_t size //!< Memory size for allocation
);
//! Constructor memory for images (without global heap allocation)
Memory(
const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t width, //!< Allocated memory width
size_t height, //!< Allocated memory height
size_t depth, //!< Allocated memory depth
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
);
//! Constructor memory for images (without global heap allocation)
Memory(
const Device& gpuDev, //!< GPU device object
size_t size, //!< Memory object size
size_t width, //!< Allocated memory width
size_t height, //!< Allocated memory height
size_t depth, //!< Allocated memory depth
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
);
//! Default destructor
~Memory();
//! Creates the interop memory
bool createInterop(
InteropType type //!< The interop type
);
//! Overloads the resource create method
virtual bool create(
Resource::MemoryType memType, //!< Memory type
Resource::CreateParams* params = NULL //!< Prameters for create
);
//! Allocate memory for API-level maps
virtual void* allocMapTarget(
const amd::Coord3D& origin, //!< The map location in memory
const amd::Coord3D& region, //!< The map region in memory
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
//! Pins system memory associated with this memory object
virtual bool pinSystemMemory(
void* hostPtr, //!< System memory address
size_t size //!< Size of allocated system memory
);
//! Releases indirect map surface
virtual void releaseIndirectMap() { decIndMapCount(); }
//! Map the device memory to CPU visible
virtual void* cpuMap(
device::VirtualDevice& vDev,//!< Virtual device for map operaiton
uint flags = 0, //!< flags for the map operation
// Optimization for multilayer map/unmap
uint startLayer = 0, //!< Start layer for multilayer map
uint numLayers = 0, //!< End layer for multilayer map
size_t* rowPitch = NULL, //!< Row pitch for the device memory
size_t* slicePitch = NULL //!< Slice pitch for the device memory
);
//! Unmap the device memory
virtual void cpuUnmap(
device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
);
//! Updates device memory from the owner's host allocation
void syncCacheFromHost(
VirtualGPU& gpu, //!< Virtual GPU device object
//! Synchronization flags
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
);
//! Updates the owner's host allocation from device memory
virtual void syncHostFromCache(
//! Synchronization flags
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
);
//! Creates a view from current resource
virtual Memory* createBufferView(
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
);
//! Allocates host memory for synchronization with MGPU context
void mgpuCacheWriteBack();
//! Transfers objects data to the destination object
bool moveTo(Memory& dst);
//! Accessors for indirect map memory object
Memory* mapMemory() const;
//! Returns the interop memory for this memory object
Memory* interop() const { return interopMemory_; }
//! Gets interop type for this memory object
InteropType interopType() const { return interopType_; }
//! Sets interop type for this memory object
void setInteropType(InteropType type) { interopType_ = type; }
//! Set the owner
void setOwner(amd::Memory* owner) { owner_ = owner; }
// Decompress GL depth-stencil/MSAA resources for CL access
// Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash.
virtual bool processGLResource(GLResourceOP operation);
//! Returns the interop resource for this memory object
const Memory* parent() const { return parent_; }
//! Returns TRUE if direct map is acceaptable. The method detects
//! forced USWC memory on APU and will cause a switch to
//! indirect map for allocations with a possibility of host read
bool isDirectMap()
{
return (isCacheable() || !isHostMemDirectAccess() ||
(owner()->getMemFlags() &
(CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY)));
}
protected:
//! Decrement map count
void decIndMapCount();
//! Initialize the object members
void init();
private:
//! Disable copy constructor
Memory(const Memory&);
//! Disable operator=
Memory& operator=(const Memory&);
InteropType interopType_; //!< Interop type
Memory* interopMemory_; //!< interop memory
Memory* pinnedMemory_; //!< Memory used as pinned system memory
const Memory* parent_; //!< Parent memory object
};
class Buffer: public pal::Memory
{
public:
//! Buffer constructor
Buffer(
const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t size //!< Buffer size
)
: pal::Memory(gpuDev, owner, size)
{}
//! Creates a view from current resource
virtual Memory* createBufferView(
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
) const;
private:
//! Disable copy constructor
Buffer(const Buffer&);
//! Disable operator=
Buffer& operator=(const Buffer&);
};
class Image: public pal::Memory
{
public:
//! Image constructor
Image(
const Device& gpuDev, //!< GPU device object
amd::Memory& owner, //!< Abstraction layer memory object
size_t width, //!< Allocated memory width
size_t height, //!< Allocated memory height
size_t depth, //!< Allocated memory depth
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
: pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels)
{}
//! Image constructor
Image(
const Device& gpuDev, //!< GPU device object
size_t size, //!< Memory size
size_t width, //!< Allocated memory width
size_t height, //!< Allocated memory height
size_t depth, //!< Allocated memory depth
cl_image_format format, //!< Memory format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels //!< The number of mip levels
)
: pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels)
{}
//! Allocate memory for API-level maps
virtual void* allocMapTarget(
const amd::Coord3D& origin, //!< The map location in memory
const amd::Coord3D& region, //!< The map region in memory
uint mapFlags, //!< Map flags
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
size_t* slicePitch = NULL //!< Slice for the mapped memory
);
private:
//! Disable copy constructor
Image(const Image&);
//! Disable operator=
Image& operator=(const Image&);
};
} // namespace pal
#endif // PALMEMORY_HPP_
+714
查看文件
@@ -0,0 +1,714 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/palmemory.hpp"
#include "device/pal/palkernel.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palprintf.hpp"
#include <cstdio>
#include <algorithm>
#include <math.h>
namespace pal {
PrintfDbg::PrintfDbg(Device& device, FILE* file)
: dbgBuffer_(nullptr)
, dbgFile_(file)
, gpuDevice_(device)
, wiDbgSize_(0)
, initCntValue_(device, 4)
{
}
PrintfDbg::~PrintfDbg()
{
delete dbgBuffer_;
}
bool
PrintfDbg::create()
{
// Create a resource for the init count value
if (initCntValue_.create(Resource::Remote)) {
uint32_t* value = reinterpret_cast<uint32_t*>(initCntValue_.map(nullptr));
// The counter starts from 1
if (nullptr != value) {
*value = 1;
}
else {
return false;
}
initCntValue_.unmap(nullptr);
return true;
}
return false;
}
bool
PrintfDbg::init(
VirtualGPU& gpu,
bool printfEnabled,
const amd::NDRange& size)
{
// Set up debug output buffer (if printf active)
if (printfEnabled) {
if (!allocate()) {
return false;
}
// Make sure that the size isn't bigger than the reported max
if (size.product() <= dev().settings().maxWorkGroupSize_) {
size_t wiDbgSizeTmp;
// Calculate the debug buffer size per workitem
wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(),
dev().xferRead().bufSize());
// Make sure the size is DWORD aligned
wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t));
// If the new size is different, then clear the initial values
if (wiDbgSize_ != wiDbgSizeTmp) {
wiDbgSize_ = wiDbgSizeTmp;
if (!clearWorkitems(gpu, 0, size.product())) {
wiDbgSize_ = 0;
return false;
}
}
}
}
return true;
}
bool
PrintfDbg::output(
VirtualGPU& gpu,
bool printfEnabled,
const amd::NDRange& size,
const std::vector<PrintfInfo>& printfInfo)
{
// Are we expected to generate debug output?
if (printfEnabled && !printfInfo.empty()) {
uint32_t* workitemData;
size_t i, j, k, z;
bool realloc = false;
// Wait for kernel execution
gpu.waitAllEngines();
size_t zdim = 1;
size_t ydim = 1;
size_t xdim = 1;
switch (size.dimensions()) {
case 3:
zdim = size[2];
// Fall through ...
case 2:
ydim = size[1];
// Fall through ...
case 1:
xdim = size[0];
// Fall through ...
default:
break;
}
for (k = 0; k < zdim; ++k) {
for (j = 0; j < ydim; ++j) {
for (i = 0; i < xdim; ++i) {
size_t idx = (xdim * (ydim * k + j) + i);
workitemData = mapWorkitem(gpu, idx, &realloc);
if (nullptr != workitemData) {
uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element)
// Walk through each PrintfDbg entry
for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) {
if (printfInfo.size() < workitemData[z]) {
LogError("The format string wasn't reported");
return false;
}
// Get the PrintfDbg info
const PrintfInfo& info = printfInfo[workitemData[z++]];
// There's something in this buffer
outputDbgBuffer(info, workitemData, z);
}
}
unmapWorkitem(gpu, workitemData);
}
}
}
// Reallocate debug buffer if necessary
if (!allocate(realloc)) {
return false;
}
}
return true;
}
bool
PrintfDbg::allocate(bool realloc)
{
if (nullptr == dbgBuffer_) {
dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_);
}
else if (realloc) {
LogWarning("Debug buffer reallocation!");
// Double the buffer size if it's not big enough
size_t size = dbgBuffer_->size();
delete dbgBuffer_;
dbgBuffer_ = dev().createScratchBuffer(size << 1);
}
return (nullptr != dbgBuffer_) ? true : false;
}
bool
PrintfDbg::checkFloat(const std::string& fmt) const
{
switch (fmt[fmt.size() - 1]) {
case 'e':
case 'E':
case 'f':
case 'g':
case 'G':
case 'a':
return true;
break;
default:
break;
}
return false;
}
bool
PrintfDbg::checkString(const std::string& fmt) const
{
if (fmt[fmt.size() - 1] == 's')
return true;
return false;
}
int
PrintfDbg::checkVectorSpecifier(
const std::string& fmt,
size_t startPos,
size_t& curPos) const
{
int vectorSize = 0;
size_t pos = curPos;
size_t size = curPos - startPos;
if (size >= 3) {
size = 0;
//no modifiers
if (fmt[curPos - 3] == 'v') {
size = 2;
}
//the modifiers are "h" or "l"
else if (fmt[curPos - 4] == 'v') {
size = 3;
}
//the modifier is "hh"
else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) {
size = 4;
}
if (size > 0) {
curPos = size;
pos -= curPos;
// Get vector size
vectorSize = fmt[pos++] - '0';
// PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors
switch (vectorSize) {
case 1:
if ((fmt[pos++] - '0') == 6) {
vectorSize = 16;
}
else {
vectorSize = 0;
}
break;
case 2:
case 3:
case 4:
case 8:
break;
default:
vectorSize = 0;
break;
}
}
}
return vectorSize;
}
static const size_t ConstStr = 0xffffffff;
static const char Separator[] = ",\0";
size_t
PrintfDbg::outputArgument(
const std::string& fmt,
bool printFloat,
size_t size,
const uint32_t* argument) const
{
// Serialize the output to the screen
amd::ScopedLock k(dev().lockAsyncOps());
size_t copiedBytes = size;
// Print the string argument, using standard PrintfDbg()
if (checkString(fmt.c_str())) {
//copiedBytes should be as number of printed chars
copiedBytes = 0;
//(null) should be printed
if (*argument == 0) {
amd::Os::printf(fmt.data(),0);
//copiedBytes = strlen("(null)")
copiedBytes = 6;
}
else {
const unsigned char* argumentStr = reinterpret_cast<const unsigned char*>(argument);
amd::Os::printf(fmt.data(),argumentStr);
//copiedBytes = strlen(argumentStr)
while (argumentStr[copiedBytes++] != 0);
}
}
// Print the argument(except for string ), using standard PrintfDbg()
else {
bool hlModifier = (strstr(fmt.c_str(),"hl") != nullptr);
std::string hlFmt;
if (hlModifier) {
hlFmt = fmt;
hlFmt.erase(hlFmt.find_first_of("hl"),2);
}
switch (size) {
case 0: {
const char* str = reinterpret_cast<const char*>(argument);
amd::Os::printf(fmt.data(), str);
// Find the string length
while (str[copiedBytes++] != 0);
}
break;
case 1:
amd::Os::printf(fmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
break;
case 2:
case 4:
if (printFloat) {
static const char* fSpecifiers = "eEfgGa";
std::string fmtF = fmt;
size_t posS = fmtF.find_first_of("%");
size_t posE = fmtF.find_first_of(fSpecifiers);
if (posS != std::string::npos &&posE != std::string::npos) {
fmtF.replace(posS+1,posE-posS,"s");
}
float fArg = *(reinterpret_cast<const float*>(argument));
float fSign = copysign(1.0,fArg);
if (isinf(fArg)&&!isnan(fArg)) {
if(fSign < 0) {
amd::Os::printf(fmtF.data(),"-infinity");
}
else {
amd::Os::printf(fmtF.data(),"infinity");
}
}
else if (isnan(fArg)) {
if(fSign < 0) {
amd::Os::printf(fmtF.data(),"-nan");
}
else {
amd::Os::printf(fmtF.data(),"nan");
}
}
else if (hlModifier) {
amd::Os::printf(hlFmt.data(),fArg);
}
else {
amd::Os::printf(fmt.data(),fArg);
}
}
else {
bool hhModifier = (strstr(fmt.c_str(),"hh") != nullptr);
if (hhModifier) {
//current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==>
//argument should be explicitly converted to unsigned char (uchar) before printing and
//fmt should be updated not to contain "hh" modifier
std::string hhFmt = fmt;
hhFmt.erase(hhFmt.find_first_of("h"),2);
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
}
else if (hlModifier) {
amd::Os::printf(hlFmt.data(), *argument);
}
else {
amd::Os::printf(fmt.data(), *argument);
}
}
break;
case 8:
if (printFloat) {
if (hlModifier) {
amd::Os::printf(hlFmt.data(), *(reinterpret_cast<const double*>(argument)));
}
else {
amd::Os::printf(fmt.data(), *(reinterpret_cast<const double*>(argument)));
}
}
else {
std::string out = fmt;
// Use 'll' for 64 bit printf
out.insert((out.size() - 1), 1, 'l');
amd::Os::printf(out.data(), *(reinterpret_cast<const uint64_t*>(argument)));
}
break;
case ConstStr: {
const char* str = reinterpret_cast<const char*>(argument);
amd::Os::printf(fmt.data(), str);
}
break;
default:
amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes",
static_cast<int>(size));
return 0;
}
}
fflush(stdout);
return copiedBytes;
}
void
PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const
{
static const char* specifiers = "cdieEfgGaosuxXp";
static const char* modifiers = "hl";
static const char* special = "%n";
static const std::string sepStr = "%s";
const uint32_t* s = workitemData;
size_t pos = 0;
// Find the format string
std::string str = info.fmtString_;
std::string fmt;
size_t posStart, posEnd;
// Print all arguments
// Note: the following code walks through all arguments, provided by the kernel and
// finds the corresponding specifier in the format string.
// Then it splits the original string into substrings with a single specifier and
// uses standard PrintfDbg() to print each argument
for (uint j = 0; j < info.arguments_.size(); ++j) {
do {
posStart = str.find_first_of("%", pos);
if (posStart != std::string::npos) {
posStart++;
// Erase all spaces after %
while (str[posStart] == ' ') {
str.erase(posStart, 1);
}
size_t tmp = str.find_first_of(special, posStart);
size_t tmp2 = str.find_first_of(specifiers, posStart);
// Special cases. Special symbol is located before any specifier
if (tmp < tmp2) {
posEnd = posStart + 1;
fmt = str.substr(pos, posEnd - pos);
fmt.erase(posStart - pos - 1, 1);
pos = posStart = posEnd;
outputArgument(sepStr, false, ConstStr,
reinterpret_cast<const uint32_t*>(fmt.data()));
continue;
}
break;
}
else if (pos < str.length()) {
outputArgument(sepStr, false, ConstStr,reinterpret_cast<const uint32_t*>((str.substr(pos)).data()));
}
}
while (posStart != std::string::npos);
if (posStart != std::string::npos) {
bool printFloat = false;
int vectorSize = 0;
size_t length;
size_t idPos = 0;
// Search for PrintfDbg specifier in the format string.
// It will be a split point for the output
posEnd = str.find_first_of(specifiers, posStart);
if (posEnd == std::string::npos) {
pos = posStart = posEnd;
break;
}
posEnd++;
size_t curPos = posEnd;
vectorSize = checkVectorSpecifier(str, posStart, curPos);
// Get substring from the last position to the current specifier
fmt = str.substr(pos, posEnd - pos);
// Readjust the string pointer if PrintfDbg outputs a vector
if (vectorSize != 0) {
size_t posVecSpec = fmt.length()-(curPos + 1);
size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1);
size_t posMod = str.find_first_of(modifiers,posStart);
if(posMod < posEnd){
fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec);
}
else{
fmt = fmt.erase(posVecSpec, curPos);
}
idPos = posStart - pos - 1;
}
pos = posStart = posEnd;
// Find out if the argument is a float
printFloat = checkFloat(fmt);
// Is it a scalar value?
if (vectorSize == 0) {
length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]);
if (0 == length) {
return;
}
i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t);
}
else {
// 3-component vector's size is defined as 4 * size of each scalar component
size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize);
size_t k = i * sizeof(uint32_t);
std::string elementStr = fmt.substr(idPos, fmt.size());
// Print first element with full string
if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) {
return;
}
// Print other elemnts with separator if available
for (int e = 1; e < vectorSize; ++e) {
const char* t = reinterpret_cast<const char*>(s);
// Output the vector separator
outputArgument(sepStr, false, ConstStr,
reinterpret_cast<const uint32_t*>(Separator));
// Output the next element
outputArgument(elementStr, printFloat, elemSize,
reinterpret_cast<const uint32_t*>(&t[k + e * elemSize]));
}
i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t)))
/ sizeof(uint32_t);
}
}
}
if (pos != std::string::npos) {
fmt = str.substr(pos, str.size() - pos);
outputArgument(sepStr, false, ConstStr,
reinterpret_cast<const uint32_t*>(fmt.data()));
}
}
bool
PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const
{
// Go through all locations for every thread and copy 1
for (uint i = idxStart; i < idxStart + number; ++i) {
amd::Coord3D dst(i * wiDbgSize(), 0, 0);
amd::Coord3D size(sizeof(uint32_t), 0, 0);
// Copy 1 into the corresponding location in the debug buffer
if (!initCntValue_.partialMemCopyTo(
gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) {
return false;
}
}
return true;
}
uint32_t*
PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc)
{
uint32_t wiSize = 0;
amd::Coord3D src(idx * wiDbgSize(), 0, 0);
xferBufRead_ = &(dev().xferRead().acquire());
// Copy workitem size from the corresponding location in the debug buffer
if (!dbgBuffer_->partialMemCopyTo(gpu,
src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0),
*xferBufRead_)) {
return nullptr;
}
// Get memory pointer to the satged buffer
uint32_t* workitem = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
if (nullptr == workitem) {
return nullptr;
}
// Copy size value
wiSize = *workitem;
xferBufRead_->unmap(&gpu);
// Check if the cuurent workitem almost reached the size limit
if ((wiDbgSize() - static_cast<size_t>(wiSize)) < 3) {
*realloc = true;
}
// If the current workitem had any output then get the data
if ((wiSize > 1) && (wiSize <= wiDbgSize())) {
amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0);
// Copy the current workitem output data to the staged buffer
if (!dbgBuffer_->partialMemCopyTo(
gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) ||
// Clear the write pointer back to index 1 for the current workitem
!clearWorkitems(gpu, idx, 1)) {
LogError("Reading the workitem data failed!");
return nullptr;
}
// Get a pointer to the workitem data
uint32_t* workitem = reinterpret_cast<uint32_t*>
(xferBufRead_->map(&gpu));
return workitem;
}
return nullptr;
}
void
PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const
{
if (nullptr != workitemData) {
xferBufRead_->unmap(&gpu);
}
dev().xferRead().release(gpu, *xferBufRead_);
}
bool
PrintfDbgHSA::init(
VirtualGPU& gpu,
bool printfEnabled)
{
// Set up debug output buffer (if printf active)
if (printfEnabled) {
if (!allocate()) {
return false;
}
// The first two DWORDs in the printf buffer are as follows:
// First DWORD = Offset to where next information is to
// be written, initialized to 0
// Second DWORD = Number of bytes available for printf data
// = buffer size 2*sizeof(uint32_t)
const uint8_t initSize = 2*sizeof(uint32_t);
uint8_t sysMem[initSize];
memset(sysMem, 0, initSize);
uint32_t dbgBufferSize = dbgBuffer_->size() - initSize;
memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize));
// Copy offset and number of bytes available for printf data
// into the corresponding location in the debug buffer
dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
}
return true;
}
bool
PrintfDbgHSA::output(
VirtualGPU& gpu,
bool printfEnabled,
const std::vector<PrintfInfo>& printfInfo)
{
if (printfEnabled) {
uint32_t offsetSize = 0;
xferBufRead_ = &(dev().xferRead().acquire());
// Copy offset from the first DWORD in the debug buffer
if (!dbgBuffer_->partialMemCopyTo(gpu,
amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0),
amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) {
return false;
}
// Get memory pointer to the satged buffer
uint32_t* dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
if (nullptr == dbgBufferPtr) {
return false;
}
offsetSize = *dbgBufferPtr;
xferBufRead_->unmap(&gpu);
if (offsetSize == 0) {
LogError("\n The printf buffer is empty!");
return false;
}
size_t bufSize = dev().xferRead().bufSize();
size_t copySize = offsetSize;
while (copySize != 0) {
// Copy the buffer data (i.e., the printfID followed by the
//argument data for each printf call in th kernel) to the staged buffer
if (!dbgBuffer_->partialMemCopyTo(gpu,
amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0),
amd::Coord3D(0, 0, 0),
std::min(copySize, bufSize), *xferBufRead_)) {
return false;
}
// Get a pointer to the buffer data
dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
if (nullptr == dbgBufferPtr) {
return false;
}
std::vector<uint>::const_iterator ita;
uint sb = 0;
uint sbt = 0;
// parse the debug buffer
while (sbt < copySize) {
assert(((*dbgBufferPtr) < printfInfo.size()) &&
"Cound't find the reported PrintfID!");
const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
sb += sizeof(uint32_t);
for (ita = info.arguments_.begin();
ita != info.arguments_.end(); ++ita){
sb += *ita;
}
if (sbt + sb > bufSize) {
break; // Need new portion of data in staging buffer
}
size_t idx = 1;
// There's something in the debug buffer
outputDbgBuffer(info, dbgBufferPtr, idx);
sbt += sb;
dbgBufferPtr += sb/sizeof(uint32_t);
sb = 0;
}
copySize -= sbt;
xferBufRead_->unmap(&gpu);
}
dev().xferRead().release(gpu, *xferBufRead_);
}
return true;
}
} // namespace pal
+192
查看文件
@@ -0,0 +1,192 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALPRINTFDBG_HPP_
#define PALPRINTFDBG_HPP_
#include "device/pal/palmemory.hpp"
/*! \addtogroup GPU GPU Device Implementation
* @{
*/
#ifndef isinf
#ifdef _MSC_VER
#define isinf(X) (!_finite(X) && !_isnan(X))
#endif //_MSC_VER
#endif //isinf
#ifndef isnan
#ifdef _MSC_VER
#define isnan(X) (_isnan(X))
#endif //_MSC_VER
#endif //isnan
#ifndef copysign
#ifdef _MSC_VER
#define copysign(X,Y) (_copysign(X,Y))
#endif //_MSC_VER
#endif //copysign
//! GPU Device Implementation
namespace pal {
//! Printf info structure
struct PrintfInfo
{
std::string fmtString_; //!< formated string for printf
std::vector<uint> arguments_; //!< passed arguments to the printf() call
};
class Kernel;
class VirtualGPU;
class Memory;
class PrintfDbg : public amd::HeapObject
{
public:
//! Debug buffer size per workitem
static const uint WorkitemDebugSize = 4096;
//! Default constructor
PrintfDbg(
Device& device,
FILE* file = NULL
);
//! Destructor
~PrintfDbg();
//! Creates the PrintfDbg object
bool create();
//! Initializes the debug buffer before kernel's execution
bool init(
VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size //!< Kernel's workload
);
//! Prints the kernel's debug informaiton from the buffer
bool output(
VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const amd::NDRange& size, //!< Kernel's workload
const std::vector<PrintfInfo>& printfInfo //!< printf info
);
//! Debug buffer size per workitem
size_t wiDbgSize() const { return wiDbgSize_; }
//! Returns debug buffer object
Memory* dbgBuffer() const { return dbgBuffer_; }
protected:
Memory* dbgBuffer_; //!< Buffer to hold debug output
FILE* dbgFile_; //!< Debug file
Device& gpuDevice_; //!< GPU device object
Memory* xferBufRead_; //!< Transfer buffer for the dump read
//! Gets GPU device object
Device& dev() const { return gpuDevice_; }
//! Allocates the debug buffer
bool allocate(
bool realloc = false //!< If TRUE then reallocate the debug memory
);
//! Returns TRUE if a float value has to be printed
bool checkFloat(
const std::string& fmt //!< Format string
) const;
//! Returns TRUE if a string value has to be printed
bool checkString(
const std::string& fmt //!< Format string
) const;
//! Finds the specifier in the format string
int checkVectorSpecifier(
const std::string& fmt, //!< Format string
size_t startPos, //!< Start position for processing
size_t& curPos //!< End position for processing
) const;
//! Outputs an argument
size_t outputArgument(
const std::string& fmt, //!< Format strint
bool printFloat, //!< Argument is a float value
size_t size, //!< Argument's size
const uint32_t* argument //!< Argument's location
) const;
//! Displays the PrintfDbg
void outputDbgBuffer(
const PrintfInfo& info, //!< printf info
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
size_t& i //!< index to the data in the buffer
) const;
private:
//! Disable copy constructor
PrintfDbg(const PrintfDbg&);
//! Disable assignment
PrintfDbg& operator=(const PrintfDbg&);
//! Returns the pointer to the workitem data block
bool clearWorkitems(
VirtualGPU& gpu, //!< Virtual GPU object
size_t idxStart, //!< Workitem global index start
size_t number //!< Number of workitems to clear
) const;
//! Returns the pointer to the workitem data block
uint32_t* mapWorkitem(
VirtualGPU& gpu, //!< Virtual GPU object
size_t idx, //!< Workitem global index
bool* realloc //!< Returns TRUE if workitem reached the buffer limit
);
//! Unamp the staged buffer
void unmapWorkitem(
VirtualGPU& gpu, //!< Virtual GPU object
const uint32_t* workitemData //!< The PrintfDbg dump buffer
) const;
size_t wiDbgSize_; //!< Workitem debug size
Memory initCntValue_; //!< Initialized count value
};
class PrintfDbgHSA : public PrintfDbg
{
public:
//! Default constructor
PrintfDbgHSA(
Device& device,
FILE* file = NULL
): PrintfDbg(device, file) { }
//! Initializes the debug buffer before kernel's execution
bool init(
VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled //!< checks for printf
);
//! Prints the kernel's debug informaiton from the buffer
bool output(
VirtualGPU& gpu, //!< Virtual GPU object
bool printfEnabled, //!< checks for printf
const std::vector<PrintfInfo>& printfInfo //!< printf info
);
private:
//! Disable copy constructor
PrintfDbgHSA(const PrintfDbgHSA&);
//! Disable assignment
PrintfDbgHSA& operator=(const PrintfDbgHSA&);
};
/*@}*/} // namespace pal
#endif /*PALPRINTFDBG_HPP_*/
+925
查看文件
@@ -0,0 +1,925 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "os/os.hpp"
#include "utils/flags.hpp"
#include "include/aclTypes.h"
#include "utils/amdilUtils.hpp"
#include "utils/bif_section_labels.hpp"
#include "device/pal/palprogram.hpp"
#include "device/pal/palblit.hpp"
#include "macrodata.h"
#include "MDParser/AMDILMDInterface.h"
#include <fstream>
#include <sstream>
#include <cstdio>
#include <algorithm>
#include "utils/options.hpp"
#include "hsa.h"
#include "hsa_ext_image.h"
#include "amd_hsa_loader.hpp"
namespace pal {
HSAILProgram::HSAILProgram(Device& device)
: Program(device)
, llvmBinary_()
, binaryElf_(nullptr)
, rawBinary_(nullptr)
, kernels_(nullptr)
, maxScratchRegs_(0)
, isNull_(false)
, executable_(nullptr)
, loaderContext_(this)
{
memset(&binOpts_, 0, sizeof(binOpts_));
binOpts_.struct_size = sizeof(binOpts_);
binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
binOpts_.bitness = ELFDATA2LSB;
binOpts_.alloc = &::malloc;
binOpts_.dealloc = &::free;
loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
}
HSAILProgram::HSAILProgram(NullDevice& device)
: Program(device)
, llvmBinary_()
, binaryElf_(nullptr)
, rawBinary_(nullptr)
, kernels_(nullptr)
, maxScratchRegs_(0)
, isNull_(true)
, executable_(nullptr)
, loaderContext_(this)
{
memset(&binOpts_, 0, sizeof(binOpts_));
binOpts_.struct_size = sizeof(binOpts_);
binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
binOpts_.bitness = ELFDATA2LSB;
binOpts_.alloc = &::malloc;
binOpts_.dealloc = &::free;
loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
}
HSAILProgram::~HSAILProgram()
{
// Destroy internal static samplers
for (auto& it : staticSamplers_) {
delete it;
}
if (rawBinary_ != nullptr) {
free(rawBinary_);
}
acl_error error;
// Free the elf binary
if (binaryElf_ != nullptr) {
error = aclBinaryFini(binaryElf_);
if (error != ACL_SUCCESS) {
LogWarning( "Error while destroying the acl binary \n" );
}
}
releaseClBinary();
if (executable_ != nullptr) {
loader_->DestroyExecutable(executable_);
}
delete kernels_;
amd::hsa::loader::Loader::Destroy(loader_);
}
bool
HSAILProgram::initBuild(amd::option::Options *options)
{
if (!device::Program::initBuild(options)) {
return false;
}
const char* devName = dev().hwInfo()->machineTarget_;
options->setPerBuildInfo(
(devName && (devName[0] != '\0')) ? devName : "gpu",
clBinary()->getEncryptCode(), true);
// Elf Binary setup
std::string outFileName;
// true means fsail required
clBinary()->init(options, true);
if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
outFileName = options->getDumpFileName(".bin");
}
if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64),
(outFileName.size() > 0) ? outFileName.c_str() : nullptr)) {
LogError("Setup elf out for gpu failed");
return false;
}
return true;
}
bool
HSAILProgram::finiBuild(bool isBuildGood)
{
clBinary()->resetElfOut();
clBinary()->resetElfIn();
if (!isBuildGood) {
// Prevent the encrypted binary form leaking out
clBinary()->setBinary(nullptr, 0);
}
return device::Program::finiBuild(isBuildGood);
}
bool
HSAILProgram::linkImpl(
const std::vector<device::Program *> &inputPrograms,
amd::option::Options *options,
bool createLibrary)
{
std::vector<device::Program *>::const_iterator it
= inputPrograms.begin();
std::vector<device::Program *>::const_iterator itEnd
= inputPrograms.end();
acl_error errorCode;
// For each program we need to extract the LLVMIR and create
// aclBinary for each
std::vector<aclBinary *> binaries_to_link;
for (size_t i = 0; it != itEnd; ++it, ++i) {
HSAILProgram *program = (HSAILProgram *)*it;
// Check if the program was created with clCreateProgramWIthBinary
binary_t binary = program->binary();
if ((binary.first != nullptr) && (binary.second > 0)) {
// Binary already exists -- we can also check if there is no
// opencl source code
// Need to check if LLVMIR exists in the binary
// If LLVMIR does not exist then is it valid
// We need to pull out all the compiled kernels
// We cannot do this at present because we need at least
// Hsail text to pull the kernels oout
void *mem = const_cast<void *>(binary.first);
binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
if (errorCode != ACL_SUCCESS) {
LogWarning("Error while linking : Could not read from raw binary");
return false;
}
}
// At this stage each HSAILProgram contains a valid binary_elf
// Check if LLVMIR is in the binary
// @TODO - Memory leak , cannot free this buffer
// need to fix this.. File EPR on compiler library
size_t llvmirSize = 0;
const void *llvmirText = aclExtractSection(dev().compiler(),
binaryElf_, &llvmirSize, aclLLVMIR, &errorCode);
if (errorCode != ACL_SUCCESS) {
bool spirv = false;
size_t boolSize = sizeof(bool);
errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize);
if (errorCode != ACL_SUCCESS) {
spirv = false;
}
if (spirv) {
errorCode = aclCompile(dev().compiler(), binaryElf_,
options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY,
ACL_TYPE_LLVMIR_BINARY, nullptr);
buildLog_ += aclGetCompilerLog(dev().compiler());
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error while linking: Could not load SPIR-V" ;
return false;
}
} else {
buildLog_ +="Error while linking : \
Invalid binary (Missing LLVMIR section)" ;
return false;
}
}
// Create a new aclBinary for each LLVMIR and save it in a list
aclBIFVersion ver = aclBinaryVersion(binaryElf_);
aclBinary *bin = aclCreateFromBinary(binaryElf_, ver);
binaries_to_link.push_back(bin);
}
errorCode = aclLink(dev().compiler(),
binaries_to_link[0], binaries_to_link.size() - 1,
binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL,
ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);
if (errorCode != ACL_SUCCESS) {
buildLog_ += aclGetCompilerLog(dev().compiler());
buildLog_ +="Error while linking : aclLink failed" ;
return false;
}
// Store the newly linked aclBinary for this program.
binaryElf_ = binaries_to_link[0];
// Free all the other aclBinaries
for (size_t i = 1; i < binaries_to_link.size(); i++) {
aclBinaryFini(binaries_to_link[i]);
}
if (createLibrary) {
size_t size = 0;
void *mem = NULL;
aclWriteToMem(binaryElf_, &mem, &size);
setBinary(static_cast<char*>(mem), size);
buildLog_ += aclGetCompilerLog(dev().compiler());
setType(TYPE_LIBRARY);
return true;
}
// Now call linkImpl with the new options
return linkImpl(options);
}
aclType
HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck)
{
acl_error errorCode;
size_t secSize = 0;
completeStages.clear();
aclType from = ACL_TYPE_DEFAULT;
needOptionsCheck = true;
size_t boolSize = sizeof(bool);
//! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
// Checking llvmir in .llvmir section
bool containsSpirv = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
RT_CONTAINS_SPIRV, nullptr, &containsSpirv, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsSpirv = false;
}
if (containsSpirv) {
completeStages.push_back(from);
from = ACL_TYPE_SPIRV_BINARY;
}
bool containsSpirText = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_SPIR, nullptr, &containsSpirText, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsSpirText = false;
}
if (containsSpirText) {
completeStages.push_back(from);
from = ACL_TYPE_SPIR_BINARY;
}
bool containsLlvmirText = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LLVMIR, nullptr, &containsLlvmirText, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsLlvmirText = false;
}
// Checking compile & link options in .comment section
bool containsOpts = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_OPTIONS, nullptr, &containsOpts, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsOpts = false;
}
if (containsLlvmirText && containsOpts) {
completeStages.push_back(from);
from = ACL_TYPE_LLVMIR_BINARY;
}
// Checking HSAIL in .cg section
bool containsHsailText = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_HSAIL, nullptr, &containsHsailText, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsHsailText = false;
}
// Checking BRIG sections
bool containsBrig = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_BRIG, nullptr, &containsBrig, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsBrig = false;
}
if (containsBrig) {
completeStages.push_back(from);
from = ACL_TYPE_HSAIL_BINARY;
} else if (containsHsailText) {
completeStages.push_back(from);
from = ACL_TYPE_HSAIL_TEXT;
}
// Checking Loader Map symbol from CG section
bool containsLoaderMap = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_LOADER_MAP, nullptr, &containsLoaderMap, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsLoaderMap = false;
}
if (containsLoaderMap) {
completeStages.push_back(from);
from = ACL_TYPE_CG;
}
// Checking ISA in .text section
bool containsShaderIsa = true;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_CONTAINS_ISA, nullptr, &containsShaderIsa, &boolSize);
if (errorCode != ACL_SUCCESS) {
containsShaderIsa = false;
}
if (containsShaderIsa) {
completeStages.push_back(from);
from = ACL_TYPE_ISA;
}
std::string sCurOptions = compileOptions_ + linkOptions_;
amd::option::Options curOptions;
if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
buildLog_ += curOptions.optionsLog();
LogError("Parsing compile options failed.");
return ACL_TYPE_DEFAULT;
}
switch (from) {
// compile from HSAIL text, no matter prev. stages and options
case ACL_TYPE_HSAIL_TEXT:
needOptionsCheck = false;
break;
case ACL_TYPE_HSAIL_BINARY:
// do not check options, if LLVMIR is absent or might be absent or options are absent
if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
needOptionsCheck = false;
}
break;
case ACL_TYPE_CG:
case ACL_TYPE_ISA:
// do not check options, if LLVMIR is absent or might be absent or options are absent
if (!curOptions.oVariables->BinLLVMIR || !containsLlvmirText || !containsOpts) {
needOptionsCheck = false;
}
// do not check options, if BRIG is absent or might be absent or LoaderMap is absent
if (!curOptions.oVariables->BinCG || !containsBrig || !containsLoaderMap) {
needOptionsCheck = false;
}
break;
// recompilation might be needed
case ACL_TYPE_LLVMIR_BINARY:
case ACL_TYPE_DEFAULT:
default:
break;
}
return from;
}
aclType
HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
aclType continueCompileFrom = ACL_TYPE_DEFAULT;
binary_t binary = this->binary();
// If the binary already exists
if ((binary.first != nullptr) && (binary.second > 0)) {
void *mem = const_cast<void *>(binary.first);
acl_error errorCode;
binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Reading the binary from memory failed.\n";
return continueCompileFrom;
}
// Calculate the next stage to compile from, based on sections in binaryElf_;
// No any validity checks here
std::vector<aclType> completeStages;
bool needOptionsCheck = true;
continueCompileFrom = getCompilationStagesFromBinary(completeStages, needOptionsCheck);
// Saving binary in the interface class,
// which also load compile & link options from binary
setBinary(static_cast<char*>(mem), binary.second);
if (!options || !needOptionsCheck) {
return continueCompileFrom;
}
bool recompile = false;
//! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
switch (continueCompileFrom) {
case ACL_TYPE_HSAIL_BINARY:
case ACL_TYPE_CG:
case ACL_TYPE_ISA: {
// Compare options loaded from binary with current ones, recompile if differ;
// If compile options are absent in binary, do not compare and recompile
if (compileOptions_.empty())
break;
const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
assert(symbol && "symbol not found");
std::string symName = std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
size_t symSize = 0;
const void *opts = aclExtractSymbol(dev().compiler(),
binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
if (errorCode != ACL_SUCCESS) {
recompile = true;
break;
}
std::string sBinOptions = std::string((char*)opts, symSize);
std::string sCurOptions = compileOptions_ + linkOptions_;
amd::option::Options curOptions, binOptions;
if (!amd::option::parseAllOptions(sBinOptions, binOptions)) {
buildLog_ += binOptions.optionsLog();
LogError("Parsing compile options from binary failed.");
return ACL_TYPE_DEFAULT;
}
if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
buildLog_ += curOptions.optionsLog();
LogError("Parsing compile options failed.");
return ACL_TYPE_DEFAULT;
}
if (!curOptions.equals(binOptions)) {
recompile = true;
}
break;
}
default:
break;
}
if (recompile) {
while (!completeStages.empty()) {
continueCompileFrom = completeStages.back();
if (continueCompileFrom == ACL_TYPE_SPIRV_BINARY ||
continueCompileFrom == ACL_TYPE_LLVMIR_BINARY ||
continueCompileFrom == ACL_TYPE_SPIR_BINARY ||
continueCompileFrom == ACL_TYPE_DEFAULT) {
break;
}
completeStages.pop_back();
}
}
}
return continueCompileFrom;
}
inline static std::vector<std::string>
splitSpaceSeparatedString(char *str)
{
std::string s(str);
std::stringstream ss(s);
std::istream_iterator<std::string> beg(ss), end;
std::vector<std::string> vec(beg, end);
return vec;
}
bool
HSAILProgram::linkImpl(amd::option::Options* options)
{
acl_error errorCode;
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
bool finalize = true;
bool hsaLoad = true;
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
if (!binaryElf_) {
continueCompileFrom = getNextCompilationStageFromBinary(options);
}
switch (continueCompileFrom) {
case ACL_TYPE_SPIRV_BINARY:
case ACL_TYPE_SPIR_BINARY:
// Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
// 1. if the program is not created with binary;
// 2. if the program is created with binary and contains only .llvmir & .comment
// 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
// but the binary's compile & link options differ from current ones (recompilation);
case ACL_TYPE_LLVMIR_BINARY:
// Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
// 1. if the program is created with binary and contains only brig sections
case ACL_TYPE_HSAIL_BINARY:
// Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
// 1. if the program is created with binary and contains only hsail text
case ACL_TYPE_HSAIL_TEXT: {
std::string curOptions = options->origOptionStr + hsailOptions();
errorCode = aclCompile(dev().compiler(), binaryElf_,
curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr);
buildLog_ += aclGetCompilerLog(dev().compiler());
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: BRIG code generation failed.\n";
return false;
}
break;
}
case ACL_TYPE_CG:
break;
case ACL_TYPE_ISA:
finalize = false;
break;
default:
buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n";
return false;
}
if (finalize) {
std::string fin_options(options->origOptionStr + hsailOptions());
// Append an option so that we can selectively enable a SCOption on CZ
// whenever IOMMUv2 is enabled.
if (dev().settings().svmFineGrainSystem_) {
fin_options.append(" -sc-xnack-iommu");
}
errorCode = aclCompile(dev().compiler(), binaryElf_,
fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr);
buildLog_ += aclGetCompilerLog(dev().compiler());
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: BRIG finalization to ISA failed.\n";
return false;
}
}
// ACL_TYPE_CG stage is not performed for offline compilation
hsa_agent_t agent;
agent.handle = 1;
if (!isNull() && hsaLoad) {
executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
if (executable_ == nullptr) {
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
return false;
}
size_t size = 0;
hsa_code_object_t code_object;
code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode));
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
return false;
}
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
if (status != HSA_STATUS_SUCCESS) {
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
return false;
}
}
size_t kernelNamesSize = 0;
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
return false;
}
if (!isNull() && kernelNamesSize > 0) {
char* kernelNames = new char[kernelNamesSize];
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
delete kernelNames;
return false;
}
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
delete kernelNames;
std::vector<std::string>::iterator it = vKernels.begin();
bool dynamicParallelism = false;
aclMetadata md;
md.numHiddenKernelArgs = 0;
size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
for (it; it != vKernels.end(); ++it) {
std::string kernelName(*it);
std::string openclKernelName = device::Kernel::openclMangledName(kernelName);
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
if (errorCode != ACL_SUCCESS) {
buildLog_ += "Error: Querying of kernel '" + openclKernelName +
"' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n";
return false;
}
HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(),
md.numHiddenKernelArgs);
kernels()[kernelName] = aKernel;
amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0);
if (!sym) {
buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
"' from AMD HSA Code Object failed. Kernel initialization failed.\n";
return false;
}
if (!aKernel->init(sym, false)) {
buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
return false;
}
buildLog_ += aKernel->buildLog();
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
dynamicParallelism |= aKernel->dynamicParallelism();
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
maxScratchRegs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
}
// Allocate kernel table for device enqueuing
if (!isNull() && dynamicParallelism && !allocKernelTable()) {
return false;
}
}
// Save the binary in the interface class
size_t size = 0;
void *mem = nullptr;
aclWriteToMem(binaryElf_, &mem, &size);
setBinary(static_cast<char*>(mem), size);
buildLog_ += aclGetCompilerLog(dev().compiler());
setType(TYPE_EXECUTABLE);
return true;
}
bool
HSAILProgram::createBinary(amd::option::Options *options)
{
return true;
}
bool
HSAILProgram::initClBinary()
{
if (clBinary_ == nullptr) {
clBinary_ = new ClBinaryHsa(static_cast<const Device &>(device()));
if (clBinary_ == nullptr) {
return false;
}
}
return true;
}
void
HSAILProgram::releaseClBinary()
{
if (clBinary_ != nullptr) {
delete clBinary_;
clBinary_ = nullptr;
}
}
std::string
HSAILProgram::hsailOptions()
{
std::string hsailOptions;
// Set options for the standard device specific options
// All our devices support these options now
if (dev().settings().reportFMAF_) {
hsailOptions.append(" -DFP_FAST_FMAF=1");
}
if (dev().settings().reportFMA_) {
hsailOptions.append(" -DFP_FAST_FMA=1");
}
if (!dev().settings().singleFpDenorm_) {
hsailOptions.append(" -cl-denorms-are-zero");
}
// Check if the host is 64 bit or 32 bit
LP64_ONLY(hsailOptions.append(" -m64"));
// Append each extension supported by the device
std::string token;
std::istringstream iss("");
iss.str(device().info().extensions_);
while (getline(iss, token, ' ')) {
if (!token.empty()) {
hsailOptions.append(" -D");
hsailOptions.append(token);
hsailOptions.append("=1");
}
}
return hsailOptions;
}
bool
HSAILProgram::allocKernelTable()
{
uint size = kernels().size() * sizeof(size_t);
kernels_ = new pal::Memory(dev(), size);
// Initialize kernel table
if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
delete kernels_;
return false;
}
else {
size_t* table = reinterpret_cast<size_t*>(
kernels_->map(nullptr, pal::Resource::WriteOnly));
for (auto& it : kernels()) {
HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
table[kernel->index()] = static_cast<size_t>(
kernel->gpuAqlCode()->vmAddress());
}
kernels_->unmap(nullptr);
}
return true;
}
void
HSAILProgram::fillResListWithKernels(
std::vector<const Memory*>& memList) const
{
for (auto& it : kernels()) {
memList.push_back(
static_cast<HSAILKernel*>(it.second)->gpuAqlCode());
}
}
const aclTargetInfo &
HSAILProgram::info(const char * str) {
acl_error err;
std::string arch = "hsail";
if (dev().settings().use64BitPtr_) {
arch = "hsail64";
}
info_ = aclGetTargetInfo(arch.c_str(), ( str && str[0] == '\0' ?
dev().hwInfo()->targetName_ : str ), &err);
if (err != ACL_SUCCESS) {
LogWarning("aclGetTargetInfo failed");
}
return info_;
}
hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
hsa_isa_t isa = {0};
if (!strcmp(Gfx700, name)) { isa.handle = gfx700; return isa; }
if (!strcmp(Gfx701, name)) { isa.handle = gfx701; return isa; }
if (!strcmp(Gfx800, name)) { isa.handle = gfx800; return isa; }
if (!strcmp(Gfx801, name)) { isa.handle = gfx801; return isa; }
if (!strcmp(Gfx804, name)) { isa.handle = gfx804; return isa; }
if (!strcmp(Gfx810, name)) { isa.handle = gfx810; return isa; }
if (!strcmp(Gfx900, name)) { isa.handle = gfx900; return isa; }
if (!strcmp(Gfx901, name)) { isa.handle = gfx901; return isa; }
return isa;
}
bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
switch (program_->dev().hwInfo()->gfxipVersion_) {
default:
LogError("Unsupported gfxip version");
return false;
case gfx700:
case gfx701:
case gfx702:
// gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
return isa.handle == gfx700 || isa.handle == gfx701;
case gfx800:
switch (program_->dev().properties().revision) {
case Pal::AsicRevision::Iceland:
case Pal::AsicRevision::Tonga:
return isa.handle == gfx800;
case Pal::AsicRevision::Carrizo:
return isa.handle == gfx801;
case Pal::AsicRevision::Fiji:
case Pal::AsicRevision::Ellesmere:
case Pal::AsicRevision::Baffin:
// gfx800 ISA has only sgrps limited and can be loaded.
// gfx801 ISA has XNACK limitations and can be loaded.
return isa.handle == gfx800 || isa.handle == gfx801 || isa.handle == gfx804;
case Pal::AsicRevision::Stoney:
return isa.handle == gfx810;
default:
assert(0);
return false;
}
case gfx900:
switch (program_->dev().properties().revision) {
case 0:
/* case Pal::AsicRevision::Greenland:
return isa.handle == gfx900 || isa.handle == gfx901;*/
default:
assert(0);
return false;
}
}
}
void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, size_t size, size_t align, bool zero) {
assert(size);
assert(align);
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
return AgentGlobalAlloc(agent, size, align, zero);
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
return KernelCodeAlloc(agent, size, align, zero);
default:
assert(false); return 0;
}
}
bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
case AMDGPU_HSA_SEGMENT_READONLY_AGENT:
return AgentGlobalCopy(dst, offset, src, size);
case AMDGPU_HSA_SEGMENT_CODE_AGENT:
return KernelCodeCopy(dst, offset, src, size);
default:
assert(false); return false;
}
}
void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t size) {
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
case AMDGPU_HSA_SEGMENT_READONLY_AGENT: AgentGlobalFree(seg, size); break;
case AMDGPU_HSA_SEGMENT_CODE_AGENT: KernelCodeFree(seg, size); break;
default:
assert(false); return;
}
}
void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t offset) {
assert(seg);
switch (segment) {
case AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM:
case AMDGPU_HSA_SEGMENT_GLOBAL_AGENT:
case AMDGPU_HSA_SEGMENT_READONLY_AGENT: {
pal::Memory *gpuMem = reinterpret_cast<pal::Memory*>(seg);
return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
}
case AMDGPU_HSA_SEGMENT_CODE_AGENT: return (char*) seg + offset;
default:
assert(false); return nullptr;
}
}
hsa_status_t ORCAHSALoaderContext::SamplerCreate(
hsa_agent_t agent,
const hsa_ext_sampler_descriptor_t *sampler_descriptor,
hsa_ext_sampler_t *sampler_handle) {
if (!agent.handle) {
return HSA_STATUS_ERROR_INVALID_AGENT;
}
if (!sampler_descriptor || !sampler_handle) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
uint32_t state = 0;
switch (sampler_descriptor->coordinate_mode) {
case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED: state = amd::Sampler::StateNormalizedCoordsFalse; break;
case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED: state = amd::Sampler::StateNormalizedCoordsTrue; break;
default:
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
switch (sampler_descriptor->filter_mode) {
case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST: state |= amd::Sampler::StateFilterNearest; break;
case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR: state |= amd::Sampler::StateFilterLinear; break;
default:
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
switch (sampler_descriptor->address_mode) {
case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE: state |= amd::Sampler::StateAddressClampToEdge; break;
case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER: state |= amd::Sampler::StateAddressClamp; break;
case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT: state |= amd::Sampler::StateAddressRepeat; break;
case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT: state |= amd::Sampler::StateAddressMirroredRepeat; break;
case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED: state |= amd::Sampler::StateAddressNone; break;
default:
assert(false);
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
assert(!program_->dev().settings().hsailDirectSRD_);
pal::Sampler* sampler = new pal::Sampler(program_->dev());
if (!sampler || !sampler->create(state)) {
delete sampler;
return HSA_STATUS_ERROR;
}
program_->addSampler(sampler);
sampler_handle->handle = sampler->hwSrd();
return HSA_STATUS_SUCCESS;
}
hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
if (!agent.handle) {
return HSA_STATUS_ERROR_INVALID_AGENT;
}
if (!sampler_handle.handle) {
return HSA_STATUS_ERROR_INVALID_ARGUMENT;
}
return HSA_STATUS_SUCCESS;
}
void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
void* ptr = amd::Os::alignedMalloc(size, align);
if (zero) {
memset(ptr, 0, size);
}
return ptr;
}
bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
if (!dst || !src || dst == src) {
return false;
}
if (0 == size) {
return true;
}
amd::Os::fastMemcpy((char*)dst + offset, src, size);
return true;
}
void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
assert(size);
assert(align);
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
if (!mem || !mem->create(pal::Resource::Local)) {
delete mem;
return nullptr;
}
assert(program_->dev().xferQueue());
if (zero) {
char pattern = 0;
program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
}
program_->addGlobalStore(mem);
program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);
return mem;
}
bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
if (!dst || !src || dst == src) {
return false;
}
if (0 == size) {
return true;
}
assert(program_->dev().xferQueue());
pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
return true;
}
} // namespace pal
+292
查看文件
@@ -0,0 +1,292 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALPROGRAM_HPP_
#define PALPROGRAM_HPP_
#include "device/pal/palkernel.hpp"
#include "device/pal/palbinary.hpp"
#include "amd_hsa_loader.hpp"
namespace amd {
namespace option {
class Options;
} // option
namespace hsa {
namespace loader {
class Loader;
class Executable;
class Context;
} // loader
} // hsa
} // amd
//! \namespace pal PAL Device Implementation
namespace pal {
/*! \addtogroup pal PAL Device Implementation
* @{
*/
using namespace amd::hsa::loader;
class HSAILProgram;
class ClBinaryHsa;
class ORCAHSALoaderContext final: public Context {
public:
ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
virtual ~ORCAHSALoaderContext() {}
hsa_isa_t IsaFromName(const char *name) override;
bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, size_t size, size_t align, bool zero) override;
bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* dst, size_t offset,
const void* src, size_t size) override;
void SegmentFree(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t size = 0) override;
void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t offset) override;
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
hsa_agent_t agent, void* seg, size_t size) override { return false; }
bool ImageExtensionSupported() override { return false; }
hsa_status_t ImageCreate(
hsa_agent_t agent,
hsa_access_permission_t image_permission,
const hsa_ext_image_descriptor_t *image_descriptor,
const void *image_data,
hsa_ext_image_t *image_handle) override {
// not supported
assert(false);
return HSA_STATUS_ERROR;
}
hsa_status_t ImageDestroy(
hsa_agent_t agent, hsa_ext_image_t image_handle) override {
// not supported
assert(false);
return HSA_STATUS_ERROR;
}
hsa_status_t SamplerCreate(
hsa_agent_t agent,
const hsa_ext_sampler_descriptor_t *sampler_descriptor,
hsa_ext_sampler_t *sampler_handle) override;
//! All samplers are owned by HSAILProgram and are deleted in its destructor.
hsa_status_t SamplerDestroy(
hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
private:
void* AgentGlobalAlloc(
hsa_agent_t agent, size_t size, size_t align, bool zero) {
return GpuMemAlloc(size, align, zero);
}
bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
return GpuMemCopy(dst, offset, src, size);
}
void AgentGlobalFree(void *ptr, size_t size) {
GpuMemFree(ptr, size);
}
void* KernelCodeAlloc(
hsa_agent_t agent, size_t size, size_t align, bool zero) {
return CpuMemAlloc(size, align, zero);
}
bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
return CpuMemCopy(dst, offset, src, size);
}
void KernelCodeFree(void *ptr, size_t size) {
CpuMemFree(ptr, size);
}
void* CpuMemAlloc(size_t size, size_t align, bool zero);
bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
void CpuMemFree(void *ptr, size_t size) {
amd::Os::alignedFree(ptr);
}
void* GpuMemAlloc(size_t size, size_t align, bool zero);
bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
void GpuMemFree(void *ptr, size_t size = 0) {
delete reinterpret_cast<pal::Memory*>(ptr);
}
ORCAHSALoaderContext(const ORCAHSALoaderContext &c);
ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
enum gfx_handle {
gfx700 = 700,
gfx701 = 701,
gfx702 = 702,
gfx800 = 800,
gfx801 = 801,
gfx804 = 804,
gfx810 = 810,
gfx900 = 900,
gfx901 = 901
};
pal::HSAILProgram* program_;
};
//! \class HSAIL program
class HSAILProgram : public device::Program
{
friend class ClBinary;
public:
//! Default constructor
HSAILProgram(Device& device);
HSAILProgram(NullDevice& device);
//! Default destructor
~HSAILProgram();
//! Returns the aclBinary associated with the progrm
aclBinary* binaryElf() const {
return static_cast<aclBinary*>(binaryElf_); }
void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
const std::vector<Memory*>& globalStores() const { return globalStores_; }
//! Return a typecasted GPU device
pal::Device& dev()
{ return const_cast<pal::Device&>(
static_cast<const pal::Device&>(device())); }
//! Returns GPU kernel table
const Memory* kernelTable() const { return kernels_; }
//! Adds all kernels to the mem handle lists
void fillResListWithKernels(std::vector<const Memory*>& memList) const;
//! Returns the maximum number of scratch regs used in the program
uint maxScratchRegs() const { return maxScratchRegs_; }
//! Add internal static sampler
void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); }
//! Returns TRUE if the program just compiled
bool isNull() const { return isNull_; }
protected:
//! pre-compile setup for GPU
virtual bool initBuild(amd::option::Options* options);
//! post-compile setup for GPU
virtual bool finiBuild(bool isBuildGood);
/*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
*
* \return True if we successefully compiled a GPU program
*/
virtual bool compileImpl(
const std::string& sourceCode, //!< the program's source code
const std::vector<const std::string*>& headers,
const char** headerIncludeNames,
amd::option::Options* options //!< compile options's object
);
/* \brief Returns the next stage to compile from, based on sections in binary,
* also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
* sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
*/
aclType getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck);
/* \brief Returns the next stage to compile from, based on sections and options in binary
*/
aclType getNextCompilationStageFromBinary(amd::option::Options* options);
/*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen)
*
* \return The build error code
*/
int compileBinaryToFSAIL(
amd::option::Options* options //!< options for compilation
);
virtual bool linkImpl(amd::option::Options* options);
//! Link the device programs.
virtual bool linkImpl (const std::vector<device::Program*>& inputPrograms,
amd::option::Options* options,
bool createLibrary);
virtual bool createBinary(amd::option::Options* options);
//! Initialize Binary
virtual bool initClBinary();
//! Release the Binary
virtual void releaseClBinary();
virtual const aclTargetInfo & info(const char * str = "");
virtual bool isElf(const char* bin) const {
return amd::isElfMagic(bin);
//return false;
}
//! Returns the binary
// This should ensure that the binary is updated with all the kernels
// ClBinary& clBinary() { return binary_; }
ClBinaryHsa* clBinary() {
return static_cast<ClBinaryHsa*>(device::Program::clBinary());
}
const ClBinaryHsa* clBinary() const {
return static_cast<const ClBinaryHsa*>(device::Program::clBinary());
}
private:
//! Disable default copy constructor
HSAILProgram(const HSAILProgram&);
//! Disable operator=
HSAILProgram& operator=(const HSAILProgram&);
//! Returns all the options to be appended while passing to the
//compiler library
std::string hsailOptions();
//! Allocate kernel table
bool allocKernelTable();
std::string openCLSource_; //!< Original OpenCL source
std::string HSAILProgram_; //!< FSAIL program after compilation
std::string llvmBinary_; //!< LLVM IR binary code
aclBinary* binaryElf_; //!< Binary for the new compiler library
void* rawBinary_; //!< Pointer to the raw binary
aclBinaryOptions binOpts_; //!< Binary options to create aclBinary
std::vector<Memory*> globalStores_; //!< Global memory for the program
Memory* kernels_; //!< Table with kernel object pointers
uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
std::list<Sampler*> staticSamplers_; //!< List od internal static samplers
bool isNull_; //!< Null program no memory allocations
amd::hsa::loader::Loader* loader_; //!< Loader object
amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader
ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader
};
/*@}*/} // namespace pal
#endif /*PALPROGRAM_HPP_*/
檔案差異因為檔案過大而無法顯示 載入差異
+508
查看文件
@@ -0,0 +1,508 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALRESOURCE_HPP_
#define PALRESOURCE_HPP_
#include "platform/command.hpp"
#include "platform/program.hpp"
#include "device/pal/paldefs.hpp"
//! \namespace pal PAL Resource Implementation
namespace pal {
class Device;
class VirtualGPU;
/*! \addtogroup PAL PAL Resource Implementation
* @{
*/
class GpuMemoryReference : public amd::ReferenceCountedObject
{
public:
static GpuMemoryReference* Create(
const Device& dev,
const Pal::GpuMemoryCreateInfo& createInfo);
static GpuMemoryReference* Create(
const Device& dev,
const void* sysMem,
size_t memSize);
static GpuMemoryReference* Create(
const Device& dev,
const Pal::ExternalResourceOpenInfo& openInfo);
static GpuMemoryReference* Create(
const Device& dev,
const Pal::ExternalImageOpenInfo& openInfo,
Pal::ImageCreateInfo* imgCreateInfo,
Pal::IImage** image);
//! Default constructor
GpuMemoryReference();
//! Get PAL memory object
Pal::IGpuMemory* iMem() const { return gpuMem_; }
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
void* cpuAddress_; //!< CPU address of this memory
protected:
//! Default destructor
~GpuMemoryReference();
private:
//! Disable copy constructor
GpuMemoryReference(const GpuMemoryReference&);
//! Disable operator=
GpuMemoryReference& operator=(const GpuMemoryReference&);
};
//! GPU resource
class Resource : public amd::HeapObject
{
public:
enum InteropType {
InteropTypeless = 0,
InteropVertexBuffer,
InteropIndexBuffer,
InteropRenderBuffer,
InteropTexture,
InteropTextureViewLevel,
InteropTextureViewCube,
InteropSurface
};
struct CreateParams : public amd::StackObject {
amd::Memory* owner_; //!< Resource's owner
VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues
CreateParams(): owner_(NULL), gpu_(NULL) {}
};
struct PinnedParams : public CreateParams {
const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning
size_t size_; //!< System memory size
};
struct ViewParams : public CreateParams {
size_t offset_; //!< Alias resource offset
size_t size_; //!< Alias resource size
const Resource* resource_; //!< Parent resource for the view creation
const void* memory_;
};
struct ImageViewParams : public CreateParams {
size_t level_; //!< Image mip level for a new view
size_t layer_; //!< Image layer for a new view
const Resource* resource_; //!< Parent resource for the view creation
const void* memory_;
};
struct ImageBufferParams : public CreateParams {
const Resource* resource_; //!< Parent resource for the image creation
const void* memory_;
};
struct OGLInteropParams : public CreateParams {
InteropType type_; //!< OGL resource type
uint handle_; //!< OGL resource handle
uint mipLevel_; //!< Texture mip level
uint layer_; //!< Texture layer
void* glPlatformContext_;
void* glDeviceContext_;
uint flags_;
};
#ifdef _WIN32
struct D3DInteropParams : public CreateParams {
InteropType type_; //!< D3D resource type
void* iDirect3D_; //!< D3D resource interface object
void* handle_; //!< D3D resource handle
uint mipLevel_; //!< Texture mip level
int layer_; //!< Texture layer
uint misc; //!< miscellaneous cases
};
#endif // _WIN32
//! Resource memory
enum MemoryType
{
Empty = 0x0, //!< resource is empty
Local, //!< resource in local memory
Persistent, //!< resource in persistent memory
Remote, //!< resource in nonlocal memory
RemoteUSWC, //!< resource in nonlocal memory
Pinned, //!< resource in pinned system memory
View, //!< resource is an alias
OGLInterop, //!< resource is an OGL memory object
D3D10Interop, //!< resource is a D3D10 memory object
D3D11Interop, //!< resource is a D3D11 memory object
ImageView, //!< resource is a view to some image
ImageBuffer, //!< resource is an image view of a buffer
BusAddressable, //!< resource is a bus addressable memory
ExternalPhysical, //!< resource is an external physical memory
D3D9Interop, //!< resource is a D3D9 memory object
Scratch, //!< resource is scratch memory
Shader, //!< resource is a shader
};
//! Resource map flags
enum MapFlags
{
Discard = 0x00000001, //!< discard lock
NoOverwrite = 0x00000002, //!< lock with no overwrite
ReadOnly = 0x00000004, //!< lock for read only operation
WriteOnly = 0x00000008, //!< lock for write only operation
NoWait = 0x00000010, //!< lock with no wait
};
//! Resource descriptor
struct Descriptor : public amd::HeapObject
{
MemoryType type_; //!< Memory type
size_t width_; //!< Resource width
size_t height_; //!< Resource height
size_t depth_; //!< Resource depth
uint mipLevels_; //!< Number of mip levels
uint flags_; //!< Resource flags, used in creation
size_t pitch_; //!< Resource pitch, valid if locked
size_t slice_; //!< Resource slice, valid if locked
cl_image_format format_; //!< CL image format
cl_mem_object_type topology_;//!< CL mem object type
union {
struct {
uint dimSize_ : 2; //!< Dimension size
uint cardMemory_ : 1; //!< GSL resource is in video memory
uint imageArray_ : 1; //!< GSL resource is an array of images
uint buffer_ : 1; //!< GSL resource is a buffer
uint tiled_ : 1; //!< GSL resource is tiled
uint SVMRes_ : 1; //!< SVM flag to the cal resource
uint scratch_ : 1; //!< Scratch buffer
uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf
};
uint state_;
};
};
//! Constructor of 1D Resource object
Resource(
const Device& gpuDev, //!< GPU device object
size_t size //!< Resource size
);
//! Constructor of Image Resource object
Resource(
const Device& gpuDev, //!< GPU device object
size_t width, //!< resource width
size_t height, //!< resource height
size_t depth, //!< resource depth
cl_image_format format, //!< resource format
cl_mem_object_type imageType, //!< CL image type
uint mipLevels = 1 //!< Number of mip levels
);
//! Destructor of the resource
virtual ~Resource();
/*! \brief Creates a CAL object, associated with the resource
*
* \return True if we succesfully created a CAL resource
*/
virtual bool create(
MemoryType memType, //!< memory type
CreateParams* params = 0 //!< special parameters for resource allocation
);
/*! \brief Copies a subregion of memory from one resource to another
*
* This is a general copy from anything to anything (as long as it fits).
* All positions and sizes are given in bytes. Note, however, that only
* a subset of this general interface is currently implemented.
*
* \return true if successful
*/
bool partialMemCopyTo(
VirtualGPU& gpu, //!< Virtual GPU device object
const amd::Coord3D& srcOrigin, //!< Origin of the source region
const amd::Coord3D& dstOrigin, //!< Origin of the destination region
const amd::Coord3D& size, //!< Size of the region to copy
Resource& dstResource, //!< Destination resource
bool enableRectCopy = false, //!< Rectangular DMA support
bool flushDMA = false, //!< Flush DMA if requested
uint bytesPerElement = 1 //!< Bytes Per Element
) const;
/*! \brief Copies size/4 DWORD of memory to a surface
*
* This is a raw copy to any surface using a CP packet.
* Size needs to be atleast a DWORD or multiple
*
*/
void writeRawData(
VirtualGPU& gpu, //!< Virtual GPU device object
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
const void* data, //!< Data to be copied
bool waitForEvent //!< Wait for event complete
) const;
//! Returns the offset in GPU memory for aliases
size_t offset() const { return offset_; }
//! Returns the pinned memory offset
uint64_t pinOffset() const { return pinOffset_; }
//! Returns the GPU device that owns this resource
const Device& dev() const { return gpuDevice_; }
//! Returns the descriptor for resource
const Descriptor& desc() const { return desc_; }
//! Returns the PAL memory object
Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
//! Returns global memory offset
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
//! Returns global memory offset
uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
//! Returns global memory offset
bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
//! Checks if persistent memory can have a direct map
bool isPersistentDirectMap() const;
/*! \brief Locks the resource and returns a physical pointer
*
* \note This operation stalls HW pipeline!
*
* \return Pointer to the physical memory
*/
void* map(
VirtualGPU* gpu, //!< Virtual GPU device object
uint flags = 0, //!< flags for the map operation
// Optimization for multilayer map/unmap
uint startLayer = 0, //!< Start layer for multilayer map
uint numLayers = 0 //!< End layer for multilayer map
);
//! Unlocks the resource if it was locked
void unmap(
VirtualGPU* gpu //!< Virtual GPU device object
);
//! Marks the resource as busy
void setBusy(
VirtualGPU& gpu, //!< Virtual GPU device object
GpuEvent calEvent //!< CAL event
) const;
//! Wait for the resource
void wait(
VirtualGPU& gpu, //!< Virtual GPU device object
bool waitOnBusyEngine = false//!< Wait only if engine has changed
) const;
//! Performs host write to the resource GPU memory
bool hostWrite(
VirtualGPU* gpu, //!< Virtual GPU device object
const void* hostPtr, //!< Host pointer to the SRC data
const amd::Coord3D& origin, //!< Offsets for the update
const amd::Coord3D& size, //!< The number of bytes to write
uint flags = 0, //!< Map flags
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
);
//! Performs host read from the resource GPU memory
bool hostRead(
VirtualGPU* gpu, //!< Virtual GPU device object
void* hostPtr, //!< Host pointer to the DST data
const amd::Coord3D& origin, //!< Offsets for the update
const amd::Coord3D& size, //!< The number of bytes to write
size_t rowPitch = 0, //!< Raw data row pitch
size_t slicePitch = 0 //!< Raw data slice pitch
);
//! Warms up the rename list for this resource
void warmUpRenames(VirtualGPU& gpu);
//! Gets the resource element size
uint elementSize() const { return elementSize_; }
//! Get the mapped address of this resource
address data() const { return reinterpret_cast<address>(address_); }
//! Frees all allocated CAL memories and resources,
//! associated with this objects. And also destroys all rename structures
//! Note: doesn't destroy the object itself
void free();
//! Return memory type
MemoryType memoryType() const { return desc().type_; }
//! Retunrs true if memory type matches specified
bool isMemoryType(MemoryType memType) const;
//! Returns TRUE if resource was allocated as cacheable
bool isCacheable() const
{ return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; }
bool gslGLAcquire() ;
bool gslGLRelease() ;
//! Returns HW state for the resource (used for images only)
const void* hwState() const { return hwState_; }
//! Returns CPU HW SRD for the resource (used for images only)
uint64_t hwSrd() const { return hwSrd_; }
uint numComponents() const {
return Pal::Formats::NumComponents(image_->GetImageCreateInfo().format.chFmt); }
protected:
uint elementSize_; //!< Size of a single element in bytes
private:
//! Disable copy constructor
Resource(const Resource&);
//! Disable operator=
Resource& operator=(const Resource&);
typedef std::vector<GpuMemoryReference*> RenameList;
//! Rename current resource
bool rename(
VirtualGPU& gpu, //!< Virtual GPU device object
bool force = false //!< Force renaming
);
//! Sets the rename as active
void setActiveRename(
VirtualGPU& gpu, //!< Virtual GPU device object
GpuMemoryReference* rename //!< new active rename
);
//! Gets the active rename
bool getActiveRename(
VirtualGPU& gpu, //!< Virtual GPU device object
GpuMemoryReference** rename //!< Saved active rename
);
/*! \brief Locks the resource with layers and returns a physical pointer
*
* \return Pointer to the physical memory
*/
void* mapLayers(
VirtualGPU* gpu, //!< Virtual GPU device object
uint flags = 0 //!< flags for the map operation
);
//! Unlocks the resource with layers if it was locked
void unmapLayers(
VirtualGPU* gpu //!< Virtual GPU device object
);
//! Calls GSL to map a resource
void* gpuMemoryMap(
size_t* pitch, //!< Pitch value for the image
uint flags, //!< Map flags
Pal::IGpuMemory* resource //!< GSL memory object
) const;
//! Uses GSL to unmap a resource
void gpuMemoryUnmap(
Pal::IGpuMemory* resource //!< GSL memory object
) const;
//! Fress all GSL resources associated with OCL resource
void gslFree() const;
//! Converts Resource memory type to the PAL heaps
void memTypeToHeap(
Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info
);
const Device& gpuDevice_; //!< GPU device
Descriptor desc_; //!< Descriptor for this resource
amd::Atomic<int> mapCount_; //!< Total number of maps
void* address_; //!< Physical address of this resource
size_t offset_; //!< Resource offset
size_t curRename_; //!< Current active rename in the list
RenameList renames_; //!< Rename resource list
GpuMemoryReference* memRef_; //!< GSL resource reference
const Resource* viewOwner_; //!< GPU resource, which owns this view
uint64_t pinOffset_; //!< Pinned memory offset
void* glInteropMbRes_;//!< Mb Res handle
uint32_t glType_; //!< GL interop type
void* glPlatformContext_;
void* glDeviceContext_;
// Optimization for multilayer map/unmap
uint startLayer_; //!< Start layer for map/unmapLayer
uint numLayers_; //!< Number of layers for map/unmapLayer
uint mapFlags_; //!< Map flags for map/umapLayer
//! @note: This field is necessary for the thread safe release only
VirtualGPU* gpu_; //!< Resource will be used only on this queue
Pal::IImage* image_; //!< PAL image object
uint32_t* hwState_; //!< HW state for image object
uint64_t hwSrd_; //!< GPU pointer to HW SRD
};
class ResourceCache : public amd::HeapObject
{
public:
//! Default constructor
ResourceCache(size_t cacheSizeLimit)
: lockCacheOps_("PAL resource cache", true)
, cacheSize_(0)
, cacheSizeLimit_(cacheSizeLimit)
{}
//! Default destructor
~ResourceCache();
//! Adds a CAL resource to the cache
bool addGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
GpuMemoryReference* ref //!< Resource reference
);
//! Finds a CAL resource from the cache
GpuMemoryReference* findGpuMemory(
Resource::Descriptor* desc, //!< Resource descriptor - cache key
Pal::gpusize size,
Pal::gpusize alignment
);
//! Destroys cache
bool free(size_t minCacheEntries = 0);
private:
//! Disable copy constructor
ResourceCache(const ResourceCache&);
//! Disable operator=
ResourceCache& operator=(const ResourceCache&);
//! Removes one last entry from the cache
void removeLast();
amd::Monitor lockCacheOps_; //!< Lock to serialise cache access
size_t cacheSize_; //!< Current cache size in bytes
size_t cacheSizeLimit_; //!< Cache size limit in bytes
//! CAL resource cache
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
};
/*@}*/} // namespace pal
#endif /*PALRESOURCE_HPP_*/
+78
查看文件
@@ -0,0 +1,78 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALSCHED_HPP_
#define PALSCHED_HPP_
#include "hsa.h"
namespace pal {
//! AmdAqlWrap slot state
enum AqlWrapState {
AQL_WRAP_FREE = 0,
AQL_WRAP_RESERVED,
AQL_WRAP_READY,
AQL_WRAP_MARKER,
AQL_WRAP_BUSY,
AQL_WRAP_DONE
};
struct AmdVQueueHeader {
uint32_t aql_slot_num; //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
uint32_t event_slot_num; //!< [LRO] The number of kernel events in the events buffer
uint64_t event_slot_mask; //!< [LRO] A pointer to the allocation bitmask array for the events
uint64_t event_slots; //!< [LRO] Pointer to a buffer for the events.
// Array of event_slot_num entries of AmdEvent
uint64_t aql_slot_mask; //!< [LRO/SRO]A pointer to the allocation bitmask for aql_warp slots
uint32_t command_counter; //!< [LRW] The global counter for the submitted commands into the queue
uint32_t wait_size; //!< [LRO] The wait list size (in clk_event_t)
uint32_t arg_size; //!< [LRO] The size of argument buffer (in bytes)
uint32_t mask_groups; //!< Processed mask groups by one thread
uint64_t kernel_table; //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
uint32_t reserved[2]; //!< For the future usage
};
struct AmdAqlWrap {
uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY,
// MARKER, BUSY and DONE. The block could be returned back to a free state.
uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
uint32_t command_id; //!< [LWO/SRO] The unique command ID
uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels.
// It’s incremented on the
// start and decremented on the finish. The parent kernel can be considered as
// done when the value is 0 and the state is DONE
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
uint32_t reserved[5]; //!< For the future usage
hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet 64 bytes AQL packet
};
struct AmdEvent {
uint32_t state; //!< [LRO/SRW] Event state: START, END, COMPLETE
uint32_t counter; //!< [LRW] Event retain/release counter. 0 means the event is free
uint64_t timer[3]; //!< [LRO/SWO] Timer values for profiling for each state
uint64_t captureInfo; //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
};
struct SchedulerParam {
uint32_t signal; //!< Signal to stop the child queue(address must be 16 bytes aligned)
uint32_t eng_clk; //!< Engine clock in Mhz
uint64_t hw_queue; //!< Address to HW queue
uint64_t hsa_queue; //!< Address to HSA dummy queue
uint32_t useATC; //!< GPU access to shader program by ATC.
uint32_t scratchSize; //!< Scratch buffer size
uint64_t scratch; //!< GPU address to the scratch buffer
uint32_t numMaxWaves; //!< The max number of possible waves
uint32_t releaseHostCP; //!< Releases CP on the host queue
uint64_t parentAQL; //!< Host parent AmdAqlWrap packet
uint32_t dedicatedQueue; //!< Scheduler uses a dedicated queue
uint32_t scratchOffset; //!< Scratch buffer offset
uint32_t reserved[2]; //!< Reserved
};
} // namespace pal
#endif
+23
查看文件
@@ -0,0 +1,23 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
namespace pal {
#define SCHEDULER_KERNEL(...) #__VA_ARGS__
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
extern void __amd_scheduler(__global void *, __global void *, uint);
\n
__kernel void
scheduler(
__global void * queue,
__global void * params,
uint paramIdx)
{
__amd_scheduler(queue, params, paramIdx);
}
\n
);
} // namespace pal
+433
查看文件
@@ -0,0 +1,433 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "top.hpp"
#include "os/os.hpp"
#include "device/device.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/palsettings.hpp"
#include <algorithm>
namespace pal {
/*! \brief information for adjusting maximum workload time
*
* This structure contains the time and OS minor version for max workload time
* adjustment for Windows 7 or 8.
*/
struct ModifyMaxWorkload
{
uint32_t time; //!< max work load time (10x ms)
uint32_t minorVersion; //!< OS minor version
};
Settings::Settings()
{
// Initialize the GPU device default settings
oclVersion_ = OpenCL12;
debugFlags_ = 0;
singleHeap_ = false;
syncObject_ = GPU_USE_SYNC_OBJECTS;
remoteAlloc_ = REMOTE_ALLOC;
stagedXferRead_ = true;
stagedXferWrite_ = true;
stagedXferSize_ = GPU_STAGING_BUFFER_SIZE * Ki;
// We will enable staged read/write if we use local memory
disablePersistent_ = false;
// By Default persistent writes will be disabled.
stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT;
maxRenames_ = 4;
maxRenameSize_ = 4 * Mi;
imageSupport_ = false;
hwLDSSize_ = 0;
// Set this to true when we drop the flag
doublePrecision_ = ::CL_KHR_FP64;
// Fill workgroup info size
// @todo: revisit the 256 limitation on workgroup size
maxWorkGroupSize_ = 256;
hostMemDirectAccess_ = HostMemDisable;
libSelector_ = amd::LibraryUndefined;
// Enable workload split by default (for 24 bit arithmetic or timeout)
workloadSplitSize_ = 1 << GPU_WORKLOAD_SPLIT;
// By default use host blit
blitEngine_ = BlitEngineHost;
const static size_t MaxPinnedXferSize = 32;
pinnedXferSize_ = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
pinnedMinXferSize_ = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);
// Disable FP_FAST_FMA defines by default
reportFMAF_ = false;
reportFMA_ = false;
// GPU device by default
apuSystem_ = false;
// Disable 64 bit pointers support by default
use64BitPtr_ = false;
// Max alloc size is 16GB
maxAllocSize_ = 16 * static_cast<uint64_t>(Gi);
// Disable memory dependency tracking by default
numMemDependencies_ = 0;
// By default cache isn't present
cacheLineSize_ = 0;
cacheSize_ = 0;
// Initialize transfer buffer size to 1MB by default
xferBufSize_ = 1024 * Ki;
// Use image DMA if requested
imageDMA_ = GPU_IMAGE_DMA;
// Disable ASIC specific features by default
ciPlus_ = false;
viPlus_ = false;
aiPlus_ = false;
// Number of compute rings.
numComputeRings_ = 0;
minWorkloadTime_ = 1; // 0.1 ms
maxWorkloadTime_ = 5000; // 500 ms
// Controls tiled images in persistent
//!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
linearPersistentImage_ = false;
useSingleScratch_ = GPU_USE_SINGLE_SCRATCH;
// Device enqueuing settings
numDeviceEvents_ = 1024;
numWaitEvents_ = 8;
// Disable HSAIL by default
hsail_ = false;
// Don't support platform atomics by default.
svmAtomics_ = false;
// Use direct SRD by default
hsailDirectSRD_ = GPU_DIRECT_SRD;
// Use host queue for device enqueuing by default
useDeviceQueue_ = GPU_USE_DEVICE_QUEUE;
// Don't support Denormals for single precision by default
singleFpDenorm_ = false;
}
bool
Settings::create(
const Pal::DeviceProperties& palProp,
const Pal::GpuMemoryHeapProperties* heaps,
bool reportAsOCL12Device
)
{
// uint target = calAttr.target;
uint32_t osVer = 0x0;
// Disable thread trace by default for all devices
threadTraceEnable_ = false;
bool doublePrecision = true;
if (doublePrecision) {
// Report FP_FAST_FMA define if double precision HW
reportFMA_ = true;
// FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
// Bonaire, Kalindi, Spectre and Spooky so disable
// FP_FMA_FMAF for those parts in switch below
reportFMAF_ = true;
}
// Update GPU specific settings and info structure if we have any
ModifyMaxWorkload modifyMaxWorkload = {0};
switch (palProp.revision) {
/* case Pal::AsicRevision:::
case CAL_TARGET_GREENLAND:
//TODO: specific codes for AI
aiPlus_ = true;*/
// Fall through to VI ...
case Pal::AsicRevision::Carrizo:
case Pal::AsicRevision::Stoney:
if (!aiPlus_) {
// APU systems for VI
apuSystem_ = true;
}
case Pal::AsicRevision::Iceland:
case Pal::AsicRevision::Tonga:
case Pal::AsicRevision::Fiji:
case Pal::AsicRevision::Ellesmere:
case Pal::AsicRevision::Baffin:
// Disable tiling aperture on VI+
linearPersistentImage_ = true;
// Keep this false even though we have support
// singleFpDenorm_ = true;
viPlus_ = true;
// Fall through to CI ...
case Pal::AsicRevision::Kalindi:
case Pal::AsicRevision::Spectre:
if (!viPlus_) {
// APU systems for CI
apuSystem_ = true;
// Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
modifyMaxWorkload.time = 2500; // 250ms
modifyMaxWorkload.minorVersion = 1; // Win 7
}
// Fall through ...
case Pal::AsicRevision::Bonaire:
case Pal::AsicRevision::Hawaii:
ciPlus_ = true;
hsail_ = true;
threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
reportFMAF_ = false;
if (palProp.revision == Pal::AsicRevision::Hawaii) {
reportFMAF_ = true;
}
// Cache line size is 64 bytes
cacheLineSize_ = 64;
// L1 cache size is 16KB
cacheSize_ = 16 * Ki;
if (ciPlus_) {
libSelector_ = amd::GPU_Library_CI;
if (LP64_SWITCH(WINDOWS_SWITCH(viPlus_, false), true)) {
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
}
if (GPU_FORCE_OCL20_32BIT) {
force32BitOcl20_ = true;
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
}
if (OPENCL_VERSION < 200) {
oclVersion_ = OpenCL12;
}
numComputeRings_ = 8;
}
else {
numComputeRings_ = 2;
libSelector_ = amd::GPU_Library_SI;
}
// This needs to be cleaned once 64bit addressing is stable
if (oclVersion_ < OpenCL20) {
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
/*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
}
else {
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|| (oclVersion_ >= OpenCL20)))) {
use64BitPtr_ = true;
}
}
if (oclVersion_ >= OpenCL20) {
supportDepthsRGB_ = true;
}
if (use64BitPtr_) {
if (GPU_ENABLE_LARGE_ALLOCATION /*&& calAttr.isWorkstation*/) {
maxAllocSize_ = 64ULL * Gi;
}
else {
maxAllocSize_ = 4048 * Mi;
}
}
else {
maxAllocSize_ = 3ULL * Gi;
}
supportRA_ = false;
partialDispatch_ = GPU_PARTIAL_DISPATCH;
numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
break;
default:
assert(0 && "Unknown ASIC type!");
return false;
}
// Enable atomics support
enableExtension(ClKhrInt64BaseAtomics);
enableExtension(ClKhrInt64ExtendedAtomics);
enableExtension(ClKhrGlobalInt32BaseAtomics);
enableExtension(ClKhrGlobalInt32ExtendedAtomics);
enableExtension(ClKhrLocalInt32BaseAtomics);
enableExtension(ClKhrLocalInt32ExtendedAtomics);
enableExtension(ClKhrByteAddressableStore);
enableExtension(ClKhrGlSharing);
enableExtension(ClKhrGlEvent);
enableExtension(ClAmdMediaOps);
enableExtension(ClAmdMediaOps2);
enableExtension(ClAmdPopcnt);
enableExtension(ClKhr3DImageWrites);
enableExtension(ClAmdVec3);
enableExtension(ClAmdPrintf);
enableExtension(ClKhrImage2dFromBuffer);
hwLDSSize_ = 32 * Ki;
imageSupport_ = true;
singleHeap_ = true;
// Use kernels for blit if appropriate
blitEngine_ = BlitEngineKernel;
hostMemDirectAccess_ |= HostMemBuffer;
// HW doesn't support untiled image writes
// hostMemDirectAccess_ |= HostMemImage;
// Make sure device actually supports double precision
doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
if (doublePrecision_) {
// Enable KHR double precision extension
enableExtension(ClKhrFp64);
}
if (doublePrecision) {
// Enable AMD double precision extension
doublePrecision_ = true;
enableExtension(ClAmdFp64);
}
//! @todo
/*
if (calAttr.totalSDIHeap > 0) {
//Enable bus addressable memory extension
enableExtension(ClAMDBusAddressableMemory);
}
if (calAttr.longIdleDetect) {
// KMD is unable to detect if we map the visible memory for CPU access, so
// accessing persistent staged buffer may fail if LongIdleDetct is enabled.
disablePersistent_ = true;
}
svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem;
svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false;
*/
// Enable some platform extensions
enableExtension(ClAmdDeviceAttributeQuery);
enableExtension(ClKhrSpir);
// SVM is not currently supported for DX Interop
#if defined(_WIN32)
enableExtension(ClKhrD3d9Sharing);
enableExtension(ClKhrD3d10Sharing);
enableExtension(ClKhrD3d11Sharing);
#endif // _WIN32
// Enable some OpenCL 2.0 extensions
if (oclVersion_ >= OpenCL20) {
enableExtension(ClKhrGLDepthImages);
enableExtension(ClKhrSubGroups);
enableExtension(ClKhrDepthImages);
if (GPU_MIPMAP) {
enableExtension(ClKhrMipMapImage);
enableExtension(ClKhrMipMapImageWrites);
}
// Enable HW debug
if (GPU_ENABLE_HW_DEBUG) {
enableHwDebug_ = true;
}
}
if (apuSystem_ &&
((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) {
remoteAlloc_ = true;
}
// Save resource cache size
#ifdef ATI_OS_LINUX
// Due to EPR#406216, set the default value for Linux for now
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
#else
if (remoteAlloc_) {
resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
GPU_RESOURCE_CACHE_SIZE * Mi);
}
else {
resourceCacheSize_ = std::max(((heaps[Pal::GpuHeapLocal].heapSize +
heaps[Pal::GpuHeapInvisible].heapSize) / 8),
GPU_RESOURCE_CACHE_SIZE * Mi);
}
resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi);
#endif
// Override current device settings
override();
return true;
}
void
Settings::override()
{
// Limit reported workgroup size
if (GPU_MAX_WORKGROUP_SIZE != 0) {
maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
}
// Override blit engine type
if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
blitEngine_ = GPU_BLIT_ENGINE_TYPE;
}
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
debugFlags_ = DEBUG_GPU_FLAGS;
}
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
debugFlags_ = DEBUG_GPU_FLAGS;
}
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
}
if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) {
syncObject_ = GPU_USE_SYNC_OBJECTS;
}
if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
}
if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
}
if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
case 0:
singleFpDenorm_ = false;
break;
case 1:
singleFpDenorm_ = true;
break;
default:
break;
}
}
}
} // namespace pal
+128
查看文件
@@ -0,0 +1,128 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALSETTINGS_HPP_
#define PALSETTINGS_HPP_
#include "top.hpp"
#include "library.hpp"
#include "inc\core\palDevice.h"
/*! \addtogroup pal PAL Resource Implementation
* @{
*/
//! PAL Device Implementation
namespace pal {
//! Device settings
class Settings : public device::Settings
{
public:
//! Debug GPU flags
enum DebugGpuFlags
{
CheckForILSource = 0x00000001,
StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing
LockGlobalMemory = 0x00000004,
};
enum BlitEngineType
{
BlitEngineDefault = 0x00000000,
BlitEngineHost = 0x00000001,
BlitEngineCAL = 0x00000002,
BlitEngineKernel = 0x00000003,
};
enum HostMemFlags
{
HostMemDisable = 0x00000000,
HostMemBuffer = 0x00000001,
HostMemImage = 0x00000002,
};
union {
struct {
uint singleHeap_: 1; //!< Device will use a preallocated heap
uint remoteAlloc_: 1; //!< Allocate remote memory for the heap
uint stagedXferRead_: 1; //!< Uses a staged buffer read
uint stagedXferWrite_: 1; //!< Uses a staged buffer write
uint disablePersistent_: 1; //!< Disables using persistent memory for staging
uint imageSupport_: 1; //!< Report images support
uint doublePrecision_: 1; //!< Enables double precision support
uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
uint imageDMA_: 1; //!< Enable direct image DMA transfers
uint syncObject_: 1; //!< Enable syncobject
uint ciPlus_: 1; //!< CI and post CI features
uint viPlus_: 1; //!< VI and post VI features
uint aiPlus_: 1; //!< AI and post AI features
uint threadTraceEnable_: 1; //!< Thread trace enable
uint linearPersistentImage_: 1; //!< Allocates linear images in persistent
uint useSingleScratch_: 1; //!< Allocates single scratch per device
uint hsail_: 1; //!< Enables HSAIL compilation
uint stagingWritePersistent_: 1; //!< Enables persistent writes
uint svmAtomics_: 1; //!< SVM device atomics
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
uint apuSystem_: 1; //!< Device is APU system with shared memory
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
uint useDeviceQueue_: 1; //!< Submit to separate device queue
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
uint reserved_: 5;
};
uint value_;
};
uint oclVersion_; //!< Reported OpenCL version support
uint debugFlags_; //!< Debug GPU flags
size_t stagedXferSize_; //!< Staged buffer size
uint maxRenames_; //!< Maximum number of possible renames
uint maxRenameSize_; //!< Maximum size for all renames
uint hwLDSSize_; //!< HW local data store size
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
uint workloadSplitSize_; //!< Workload split size
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
uint blitEngine_; //!< Blit engine type
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
size_t resourceCacheSize_; //!< Resource cache size in MB
uint64_t maxAllocSize_; //!< Maximum single allocation size
size_t numMemDependencies_;//!< The array size for memory dependencies tracking
uint cacheLineSize_; //!< Cache line size in bytes
uint cacheSize_; //!< L1 cache size in bytes
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
uint numDeviceEvents_; //!< The number of device events
uint numWaitEvents_; //!< The number of wait events for device enqueue
//! Default constructor
Settings();
//! Creates settings
bool create(
const Pal::DeviceProperties& palProp, //!< PAL device properties
const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings
bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device
);
private:
//! Disable copy constructor
Settings(const Settings&);
//! Disable assignment
Settings& operator=(const Settings&);
//! Overrides current settings based on registry/environment
void override();
};
/*@}*/} // namespace pal
#endif /*PALSETTINGS_HPP_*/
+67
查看文件
@@ -0,0 +1,67 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palthreadtrace.hpp"
#include "device/pal/palvirtual.hpp"
namespace pal {
CalThreadTraceReference::~CalThreadTraceReference() {
// The thread trace object is always associated with a particular queue,
// so we have to lock just this queue
amd::ScopedLock lock(gpu_.execution());
if (0 != threadTrace_) {
//gpu().cs()->destroyQuery(gslThreadTrace());
}
}
ThreadTrace::~ThreadTrace()
{
if (calRef_ == nullptr) {
return;
}
Unimplemented();
for(uint i = 0; i < amdThreadTraceMemObjsNum_;++i) {
// threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), nullptr, 0, 0, 0, i);
// gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]);
}
// Release the thread trace reference object
//calRef_->release();
}
bool
ThreadTrace::create(CalThreadTraceReference* calRef)
{
assert(&gpu() == &calRef->gpu());
calRef_ = calRef;
threadTrace_ = calRef->gslThreadTrace();
return true;
}
bool
ThreadTrace::info(uint infoType, uint* info, uint infoSize) const
{
switch (infoType) {
case CL_THREAD_TRACE_BUFFERS_SIZE: {
if (infoSize < amdThreadTraceMemObjsNum_) {
LogError("The amount of buffers should be equal to the amount of Shader Engines");
return false;
}
else {
Unimplemented();
//gslThreadTrace()->GetResultAll(gpu().cs(), info);
}
break;
}
default:
LogError("Wrong ThreadTrace::getInfo parameter");
return false;
}
return true;
}
} // namespace pal
+136
查看文件
@@ -0,0 +1,136 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef GPUTHREADTRACE_HPP_
#define GPUTHREADTRACE_HPP_
#include "top.hpp"
#include "device/device.hpp"
#include "device/pal/paldevice.hpp"
#include "palPerfExperiment.h"
#include <vector>
namespace pal {
class VirtualGPU;
class CalThreadTraceReference : public amd::ReferenceCountedObject
{
public:
//! Default constructor
CalThreadTraceReference(
VirtualGPU& gpu, //!< Virtual GPU device object
Pal::IPerfExperiment* gslThreadTrace) //!< GSL query thread trace object
: gpu_(gpu)
, threadTrace_(gslThreadTrace){}
//! Get GSL thread race object
Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
//! Returns the virtual GPU device
const VirtualGPU& gpu() const { return gpu_; }
protected:
//! Default destructor
~CalThreadTraceReference();
private:
//! Disable copy constructor
CalThreadTraceReference(const CalThreadTraceReference&);
//! Disable operator=
CalThreadTraceReference& operator=(const CalThreadTraceReference&);
VirtualGPU& gpu_; //!< The virtual GPU device object
Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object
};
//! ThreadTrace implementation on GPU
class ThreadTrace : public device::ThreadTrace
{
public:
//! Destructor for the GPU ThreadTrace object
virtual ~ThreadTrace();
//! Creates the current object
bool create(
CalThreadTraceReference* calRef //!< Reference ThreadTrace
);
//! Returns the GPU device, associated with the current object
const Device& dev() const { return gpuDevice_; }
//! Returns the virtual GPU device
const VirtualGPU& gpu() const { return gpu_; }
//! Constructor for the GPU ThreadTrace object
ThreadTrace(
Device& device, //!< A GPU device object
VirtualGPU& gpu, //!< Virtual GPU device object
uint amdThreadTraceMemObjsNum)
: gpuDevice_(device)
, gpu_(gpu)
, calRef_(NULL)
, index_(0)
, amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum)
{
threadTraceBufferObjs_ = new Pal::ThreadTraceLayout[amdThreadTraceMemObjsNum];
Unimplemented();
for (uint i = 0; i < amdThreadTraceMemObjsNum;++i) {
//threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer();
}
}
//! Returns the specific information about the thread trace object
bool info(
uint infoType, //!< The type of returned information
uint* info, //!< The returned information
uint infoSize //!< The size of returned information
) const;
//! Set the ThreadTrace memory buffer size
void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;}
//! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively
void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; }
//! Attach Pal::IGpuMemory to the TreadTrace buffer
void attachMemToThreadTraceBuffer();
void setMemObj(size_t memObjSize,std::vector<amd::Memory*> memObj)
{
memObj_ = memObj;
memBufferSizeTT_ = memObjSize;
}
//! Get GSL thread trace object
Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
//! Get GSL Thread Trace Buffer objects
Pal::ThreadTraceLayout* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;}
private:
//! Disable default copy constructor
ThreadTrace(const ThreadTrace&);
//! Disable default operator=
ThreadTrace& operator=(const ThreadTrace&);
const Device& gpuDevice_; //!< The backend device
VirtualGPU& gpu_; //!< The virtual GPU device object
CalThreadTraceReference* calRef_; //!< Reference ThreadTrace
Pal::ThreadTraceLayout* threadTraceBufferObjs_; //!< The buffer object for Thread Trace recording
uint index_; //!< ThreadTrace index in the CAL container
uint memBufferSizeTT_; //!< ThreadTrace memory buffer size
std::vector<amd::Memory*> memObj_; //!< ThreadTrace memory object
Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object
uint amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number)
bool isNewBufferBinded_; //!< The indicator if new buffer was binded to the ThreadTrace object
bool isBufferOnSubmit_; //!< The indicator if "new buffer on submit" mode is used
};
} // namespace pal
#endif // PALTHREADTRACE_HPP_
+123
查看文件
@@ -0,0 +1,123 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "os/os.hpp"
#include "platform/perfctr.hpp"
#include "device/pal/paldefs.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palvirtual.hpp"
#include "device/pal/palcounters.hpp"
namespace pal {
TimeStamp::TimeStamp(
const VirtualGPU& gpu,
Pal::IGpuMemory* iMem,
uint memOffset,
address cpuAddr)
: gpu_(gpu)
, iMem_(iMem)
, memOffset_(memOffset)
{
values_ = reinterpret_cast<volatile uint64_t*>(cpuAddr + memOffset);
}
TimeStamp::~TimeStamp()
{
}
void
TimeStamp::begin(bool sdma)
{
if (!flags_.beginIssued_) {
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
memOffset_ + CommandStartTime * sizeof(uint64_t));
flags_.beginIssued_ = true;
}
}
void
TimeStamp::end(bool sdma)
{
CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_,
memOffset_ + CommandEndTime * sizeof(uint64_t));
flags_.endIssued_ = true;
flags_.sdma_ = sdma;
}
inline void
SetValue(uint64_t* time, uint64_t val, double nanos)
{
*time = static_cast<uint64_t>(static_cast<double>(val) * nanos);
}
void
TimeStamp::value(uint64_t* startTime, uint64_t* endTime)
{
CondLog(!flags_.endIssued_, "We didn't send the counter end operation!");
//! @todo optimize!
const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency);
SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick);
SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick);
}
TimeStampCache::~TimeStampCache()
{
// Release all time stamp objects from the cache
for (uint i = 0; i < freedTS_.size(); ++i) {
delete freedTS_[i];
}
freedTS_.clear();
// Release all memory objects
for (uint i = 0; i < tsBuf_.size(); ++i) {
tsBuf_[i]->unmap(&gpu_);
gpu_.queue(MainEngine).removeMemRef(tsBuf_[i]->iMem());
gpu_.queue(SdmaEngine).removeMemRef(tsBuf_[i]->iMem());
delete tsBuf_[i];
}
tsBuf_.clear();
}
TimeStamp*
TimeStampCache::allocTimeStamp()
{
TimeStamp* ts = nullptr;
if (0 != freedTS_.size()) {
ts = freedTS_.back();
freedTS_.pop_back();
}
if (nullptr == ts) {
if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) {
Memory* buf = new Memory(gpu_.dev(), TimerBufSize);
if (buf == nullptr || !buf->create(Resource::Remote)) {
return nullptr;
}
gpu_.queue(MainEngine).addMemRef(buf->iMem());
gpu_.queue(SdmaEngine).addMemRef(buf->iMem());
tsBufCpu_ = reinterpret_cast<address>(buf->map(&gpu_));
memset(tsBufCpu_, 0, TimerBufSize);
tsOffset_ = 0;
tsBuf_.push_back(buf);
}
// Allocate a TimeStamp object
ts = new TimeStamp(gpu_, tsBuf_[(tsBuf_.size() - 1)]->iMem(),
tsOffset_, tsBufCpu_);
// Create a timestamp
if (ts == nullptr) {
return nullptr;
}
tsOffset_ += TimerSlotSize;
}
// Set this timestamp into DRM profile mode if it was requested
ts->clearStates();
return ts;
}
} // namespace pal
+132
查看文件
@@ -0,0 +1,132 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALTIMESTAMP_HPP_
#define PALTIMESTAMP_HPP_
#include "device/pal/paldefs.hpp"
#include "device/pal/palresource.hpp"
/*! \addtogroup pal PAL Resource Implementation
* @{
*/
//! PAL Device Implementation
namespace pal {
class Device;
class VirtualGPU;
class Memory;
class TimeStamp : public amd::HeapObject
{
public:
//! Enums for the timestamp information
//! \note *4 is the limitaiton of SDMA HW
//! (address has to be aligned by 256 bit)
enum TimeStampValue {
CommandStartTime = 0,
CommandEndTime = 4,
CommandTotal = 8
};
//! The TimeStamp object flags
union Flags
{
struct
{
uint32_t beginIssued_ : 1;
uint32_t endIssued_ : 1;
uint32_t sdma_ : 1;
};
uint32_t value_;
Flags(): value_(0) {}
};
//! Default constructor
TimeStamp(
const VirtualGPU& gpu, //!< Virtual GPU
Pal::IGpuMemory* iMem, //!< Buffer with the timer values
uint memOffset, //!< Offset in the buffer for the current TS
address cpuAddr //!< CPU pointer for the values in memory
);
//! Default destructor
~TimeStamp();
//! Starts the timestamp
void begin(bool sdma = false);
//! Ends the timestamp
void end(bool sdma = false);
//! Returns the timestamp result in nano seconds
void value(uint64_t* startTime, uint64_t* endTime);
//! Clear all TimeStamp states
void clearStates()
{ flags_.value_ = 0;
values_[CommandStartTime] = 0;
values_[CommandEndTime] = 0;
}
//! Timer commands were submitted to HW
bool isValid() const { return (flags_.endIssued_) ? true : false; }
private:
//! Disable copy constructor
TimeStamp(const TimeStamp&);
//! Disable operator=
TimeStamp& operator=(const TimeStamp&);
//! Returns the GPU device object
const VirtualGPU& gpu() const { return gpu_; }
const VirtualGPU& gpu_; //!< Virtual GPU
Flags flags_; //!< The time stamp state
Pal::IGpuMemory* iMem_; //!< Buffer with the timer values
uint memOffset_; //!< Offset in the buffer for the current timer
volatile uint64_t* values_; //!< CPU pointer to the timer values
};
class TimeStampCache : public amd::HeapObject
{
public:
//! Default constructor
TimeStampCache(
VirtualGPU& gpu //!< Virtual GPU object
)
: gpu_(gpu)
, tsBufCpu_(NULL)
, tsOffset_(0) {}
//! Default destructor
~TimeStampCache();
//! Gets a time stamp object. It will find a freed object or allocate a new one
TimeStamp* allocTimeStamp();
//! Frees a time stamp object
void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); }
private:
static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t);
static const uint TimerBufSize = TimerSlotSize * 4096;
//! Disable copy constructor
TimeStampCache(const TimeStampCache&);
//! Disable operator=
TimeStampCache& operator=(const TimeStampCache&);
std::vector<TimeStamp*> freedTS_; //!< Array of freed time stamp objects
VirtualGPU& gpu_; //!< Virtual GPU
std::vector<Memory*> tsBuf_; //!< Array of memory objects with the timer value
address tsBufCpu_; //!< CPU pointer for current TS memory
uint tsOffset_; //!< Active offset in the current mem object
};
/*@}*/} // namespace pal
#endif /*PALTIMESTAMP_HPP_*/
+187
查看文件
@@ -0,0 +1,187 @@
/*******************************************************************************
* The source of the runtime trap handler, "runtimetraphandler.sp3".
* The binary is created by the SP3 tool with the following command:
*
* sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex
*
*******************************************************************************
shader main
asic(TAHITI) // for SI/CI or asic(VI) for VI
type(CS)
// clear wave exception state
v_clrexcp
s_waitcnt 0
//==========================================================================
// Handle the workaround for HW bug that causes the incorrect TMA value.
// Retrieve the TMA values, which are stored at TBA buffer at location
// 256 (0x100).
// Construct the memory descriptor with TBA as the start address
// we are using the registers ttmp[8:11] for that.
s_mov_b32 ttmp8, tba_lo
s_and_b32 ttmp9, tba_hi, 0xffff
// 0x100=256 bytes, which is the size of the buffer to
// store all the level 2 trap handler info
s_or_b32 ttmp9, ttmp9, 0x01000000
s_mov_b32 ttmp10, 0x00002000
s_mov_b32 ttmp11, 0x00024fac
// TMA is stored 256 (0x100) bytes before the TBA value
s_sub_u32 ttmp8, ttmp8, 0x100
// Backup the s0 since ttmp registers cannot be target of
// buffer read instruction
s_mov_b32 ttmp7, s0
s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes)
s_waitcnt 0
s_mov_b32 tma_lo, s0
s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes)
s_waitcnt 0
s_mov_b32 tma_hi, s0
s_mov_b32 s0, ttmp7
//===================================================
// setup the mmeory descriptor for TMA
s_mov_b32 ttmp6, 0x18
s_add_u32 ttmp8, tma_lo, ttmp6
s_and_b32 ttmp9, tma_hi, 0xffff
//0x68=104 bytes, which is the size of the buffer to
//store all the level2 trap handler info
s_or_b32 ttmp9, ttmp9, 0x00680000
s_mov_b32 ttmp10, 0x00002000
s_mov_b32 ttmp11, 0x00024fac
//===================================================
// backup the TMA values to be restored later
// level-one TMA saved in the ttmp6,ttmp7
s_mov_b32 ttmp6, tma_lo
s_mov_b32 ttmp7, tma_hi
//===================================================
// setup the TMA for the level-two trap handler
// level-two TMA saved in tma_hi, tma_lo
s_mov_b32 ttmp3, s0
s_buffer_load_dword s0, ttmp8, 0x2 // VI: offset=0x8 (bytes)
s_waitcnt 0x0000
s_mov_b32 tma_lo, s0
s_buffer_load_dword s0, ttmp8, 0x3 // VI: offset=0xc (bytes)
s_waitcnt 0x0000
s_mov_b32 tma_hi, s0
//===================================================
// setup the TBA for the level-two trap handler
// level-two TBA saved in ttmp9, ttmp8
s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes)
s_waitcnt 0x0000
s_mov_b32 ttmp2, s0
s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes)
s_waitcnt 0x0000
//swap the values of s0 and ttmp3 without using other registers
s_xor_b32 ttmp3, s0, ttmp3
s_xor_b32 s0, s0, ttmp3
s_xor_b32 ttmp3, s0, ttmp3
//store the debug trap handler start address in ttmp8,9
s_mov_b32 ttmp8, ttmp2
s_mov_b32 ttmp9, ttmp3
//===================================================
// get the pc value to resume execution
s_getpc_b64 [ttmp2, ttmp3]
s_add_u32 ttmp2, ttmp2, 0x8
//===================================================
//set the pc value to jump to the debug trap handler
s_setpc_b64 [ttmp8, ttmp9]
//===================================================
// restore the tamp values
s_mov_b32 tma_hi, ttmp7
s_mov_b32 tma_lo, ttmp6
label_return:
//===================================================
// return from the trap handler to the saved PC
s_and_b32 ttmp1, ttmp1, 0xffff
s_rfe_b64 [ttmp0,ttmp1]
end
*******************************************************************************/
/// shader codes with "asic(TAHITI)" instruction
static const uint32_t RuntimeTrapCode [] = {
0x7e008200, 0xbf8c0000,
0xbef8036c, 0x8779ff6d,
0x0000ffff, 0x8879ff79,
0x01000000, 0xbefa03ff,
0x00002000, 0xbefb03ff,
0x00024fac, 0x80f8ff78,
0x00000100, 0xbef70300,
0xc2007900, 0xbf8c0000,
0xbeee0300, 0xc2007901,
0xbf8c0000, 0xbeef0300,
0xbe800377, 0xbef60398,
0x8078766e, 0x8779ff6f,
0x0000ffff, 0x8879ff79,
0x00680000, 0xbefa03ff,
0x00002000, 0xbefb03ff,
0x00024fac, 0xbef6036e,
0xbef7036f, 0xbef30300,
0xc2007902, 0xbf8c0000,
0xbeee0300, 0xc2007903,
0xbf8c0000, 0xbeef0300,
0xc2007900, 0xbf8c0000,
0xbef20300, 0xc2007901,
0xbf8c0000, 0x89737300,
0x89007300, 0x89737300,
0xbef80372, 0xbef90373,
0xbef21f00, 0x80728872,
0xbe802078, 0xbeef0377,
0xbeee0376, 0x8771ff71,
0x0000ffff, 0xbe802270
};
/// shader codes with "asic(VI)" instruction
static const uint32_t RuntimeTrapCodeVi [] = {
0x7e006a00, 0xbf8c0000,
0xbef8006c, 0x8679ff6d,
0x0000ffff, 0x8779ff79,
0x01000000, 0xbefa00ff,
0x00002000, 0xbefb00ff,
0x00024fac, 0x80f8ff78,
0x00000100, 0xbef70000,
0xc022003c, 0x00000000,
0xbf8c0000, 0xbeee0000,
0xc022003c, 0x00000004,
0xbf8c0000, 0xbeef0000,
0xbe800077, 0xbef60098,
0x8078766e, 0x8679ff6f,
0x0000ffff, 0x8779ff79,
0x00680000, 0xbefa00ff,
0x00002000, 0xbefb00ff,
0x00024fac, 0xbef6006e,
0xbef7006f, 0xbef30000,
0xc022003c, 0x00000008,
0xbf8c0000, 0xbeee0000,
0xc022003c, 0x0000000c,
0xbf8c0000, 0xbeef0000,
0xc022003c, 0x00000000,
0xbf8c0000, 0xbef20000,
0xc022003c, 0x00000004,
0xbf8c0000, 0x88737300,
0x88007300, 0x88737300,
0xbef80072, 0xbef90073,
0xbef21c00, 0x80728872,
0xbe801d78, 0xbeef0077,
0xbeee0076, 0x8671ff71,
0x0000ffff, 0xbe801f70
};
檔案差異因為檔案過大而無法顯示 載入差異
+576
查看文件
@@ -0,0 +1,576 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALVIRTUAL_HPP_
#define PALVIRTUAL_HPP_
#include "device/pal/paldefs.hpp"
#include "device/pal/palconstbuf.hpp"
#include "device/pal/palprintf.hpp"
#include "device/pal/paltimestamp.hpp"
#include "device/pal/palsched.hpp"
#include "device/pal/paldebugger.hpp"
#include "device/blit.hpp"
#include "palCmdBuffer.h"
#include "palCmdAllocator.h"
#include "palQueue.h"
/*! \addtogroup PAL PAL Resource Implementation
* @{
*/
//! PAL Device Implementation
namespace pal {
class Device;
class Kernel;
class Memory;
class CalCounterReference;
class VirtualGPU;
class Program;
class BlitManager;
class ThreadTrace;
class HSAILKernel;
//! Virtual GPU
class VirtualGPU : public device::VirtualDevice
{
public:
class Queue : public amd::HeapObject
{
public:
static const uint MaxCmdBuffers = 8;
static const uint MaxCommands = 512;
static const uint StartCmdBufIdx = 1;
static const uint FirstMemoryReference = 0x80000000;
static Queue* Create(
Pal::IDevice* palDev, //!< PAL device object
Pal::QueueType queueType, //!< PAL queue type
uint engineIdx, //!< Select particular engine index
Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator
);
Queue(Pal::IDevice* palDev)
: iDev_(palDev), iQueue_(NULL),
cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx),
cmbBufIdRetired_(0), cmdCnt_(0)
{
for (uint i = 0; i < MaxCmdBuffers; ++i) {
iCmdBuffs_[i] = NULL;
iCmdFences_[i] = NULL;
}
}
~Queue();
void addCmdMemRef(Pal::IGpuMemory* iMem);
void removeCmdMemRef(Pal::IGpuMemory* iMem);
void addMemRef(Pal::IGpuMemory* iMem) const
{
iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
}
void removeMemRef(Pal::IGpuMemory* iMem) const
{
iDev_->RemoveGpuMemoryReferences(1, &iMem, NULL);
}
//! Flushes the current command buffer to HW
//! Returns ID associated with the submission
uint submit();
bool flush();
bool waitForEvent(uint id);
bool isDone(uint id);
Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
Pal::IQueue* iQueue_; //!< PAL queue object
Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers
Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
private:
Pal::IDevice* iDev_; //!< PAL device
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
uint cmdBufIdCurrent_; //!< Current global command buffer ID
uint cmbBufIdRetired_; //!< The last retired command buffer ID
uint cmdCnt_; //!< Counter of commands
std::map<Pal::IGpuMemory*, uint> memReferences_;
};
struct CommandBatch : public amd::HeapObject
{
amd::Command* head_; //!< Command batch head
GpuEvent events_[AllEngines]; //!< Last known GPU events
TimeStamp* lastTS_; //!< TS associated with command batch
//! Constructor
CommandBatch(
amd::Command* head, //!< Command batch head
const GpuEvent* events, //!< HW events on all engines
TimeStamp* lastTS //!< Last TS in command batch
): head_(head), lastTS_(lastTS)
{
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
}
};
//! The virtual GPU states
union State
{
struct
{
uint boundGlobal_ : 1; //!< Global buffer was bound
uint profiling_ : 1; //!< Profiling is enabled
uint forceWait_ : 1; //!< Forces wait in flush()
uint boundCb_ : 1; //!< Constant buffer was bound
uint boundPrintf_ : 1; //!< Printf buffer was bound
uint profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter
};
uint value_;
State(): value_(0) {}
};
//! CAL descriptor for the GPU virtual device
struct CalVirtualDesc : public amd::EmbeddedObject
{
GpuEvent events_[AllEngines]; //!< Last known GPU events
uint iterations_; //!< Number of iterations for the execution
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
};
typedef std::vector<ConstBuffer*> constbufs_t;
class MemoryDependency : public amd::EmbeddedObject
{
public:
//! Default constructor
MemoryDependency()
: memObjectsInQueue_(NULL)
, numMemObjectsInQueue_(0)
, maxMemObjectsInQueue_(0) {}
~MemoryDependency() { delete [] memObjectsInQueue_; }
//! Creates memory dependecy structure
bool create(size_t numMemObj);
//! Notify the tracker about new kernel
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
//! Validates memory object on dependency
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
//! Clear memory dependency
void clear(bool all = true);
private:
struct MemoryState {
uint64_t start_; //! Busy memory start address
uint64_t end_; //! Busy memory end address
bool readOnly_; //! Current GPU state in the queue
};
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
};
class DmaFlushMgmt : public amd::EmbeddedObject
{
public:
DmaFlushMgmt(const Device& dev);
// Resets DMA command buffer workload
void resetCbWorkload(const Device& dev);
// Finds split size for the current dispatch
void findSplitSize(
const Device& dev, //!< GPU device object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
// Returns TRUE if DMA command buffer is ready for a flush
bool isCbReady(
VirtualGPU& gpu, //!< Virtual GPU object
uint64_t threads, //!< Total number of execution threads
uint instructions //!< Number of ALU instructions
);
// Returns dispatch split size
uint dispatchSplitSize() const { return dispatchSplitSize_; }
private:
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
uint aluCnt_; //!< All ALUs on the chip
uint dispatchSplitSize_; //!< Dispath split size in elements
};
public:
VirtualGPU(Device& device);
//! Creates virtual gpu object
bool create(
bool profiling, //!< Enables profilng on the queue
uint deviceQueueSize = 0 //!< Device queue size, 0 if host queue
);
~VirtualGPU();
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
void submitMapMemory(amd::MapMemoryCommand& vcmd);
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
void submitKernel(amd::NDRangeKernelCommand& vcmd);
bool submitKernelInternal(
const amd::NDRangeContainer& sizes, //!< Workload sizes
const amd::Kernel& kernel, //!< Kernel for execution
const_address parameters, //!< Parameters for the kernel
bool nativeMem = true, //!< Native memory objects
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
);
void submitNativeFn(amd::NativeFnCommand& vcmd);
void submitFillMemory(amd::FillMemoryCommand& vcmd);
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
void submitMarker(amd::Marker& vcmd);
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
void submitSignal(amd::SignalCommand & vcmd);
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd);
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true);
void flush(amd::Command* list = NULL, bool wait = false);
bool terminate() { return true; }
//! Returns GPU device object associated with this kernel
const Device& dev() const { return gpuDevice_; }
//! Returns CAL descriptor of the virtual device
const CalVirtualDesc* cal() const { return &cal_; }
//! Returns a GPU event, associated with GPU memory
GpuEvent* getGpuEvent(
Pal::IGpuMemory* iMem //!< PAL mem object
);
//! Assigns a GPU event, associated with GPU memory
void assignGpuEvent(
Pal::IGpuMemory* iMem, //!< PAL mem object
GpuEvent gpuEvent
);
//! Set the last known GPU event
void setGpuEvent(
GpuEvent gpuEvent, //!< GPU event for tracking
bool flush = false //!< TRUE if flush is required
);
//! Flush DMA buffer on the specified engine
void flushDMA(
uint engineID //!< Engine ID for DMA flush
);
//! Wait for all engines on this Virtual GPU
//! Returns TRUE if CPU didn't wait for GPU
bool waitAllEngines(
CommandBatch* cb = NULL //!< Command batch
);
//! Waits for the latest GPU event with a lock to prevent multiple entries
void waitEventLock(
CommandBatch* cb //!< Command batch
);
//! Returns a resource associated with the constant buffer
const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }
//! Adds CAL objects into the constant buffer vector
void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }
constbufs_t constBufs_; //!< constant buffers
//! Start the command profiling
void profilingBegin(
amd::Command& command, //!< Command queue object
bool drmProfiling = false //!< Measure DRM time
);
//! End the command profiling
void profilingEnd(amd::Command& command);
//! Collect the profiling results
bool profilingCollectResults(
CommandBatch* cb, //!< Command batch
const amd::Event* waitingEvent //!< Waiting event
);
//! Adds a memory handle into the GSL memory array for Virtual Heap
bool addVmMemory(
const Memory* memory //!< GPU memory object
);
//! Adds a stage write buffer into a list
void addXferWrite(Memory& memory);
//! Adds a pinned memory object into a map
void addPinnedMem(amd::Memory* mem);
//! Release pinned memory objects
void releasePinnedMem();
//! Finds if pinned memory is cached
amd::Memory* findPinnedMem(void* addr, size_t size);
//! Returns the monitor object for execution access by VirtualGPU
amd::Monitor& execution() { return execution_; }
//! Returns the virtual gpu unique index
uint index() const { return index_; }
//! Get the PrintfDbg object
PrintfDbg& printfDbg() const { return *printfDbg_; }
//! Get the PrintfDbgHSA object
PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
//! Enables synchronized transfers
void enableSyncedBlit() const;
//! Checks if profiling is enabled
bool profiling() const { return state_.profiling_; }
//! Returns memory dependency class
MemoryDependency& memoryDependency() { return memoryDependency_; }
//! Returns hsaQueueMem_
const Memory* hsaQueueMem() const { return hsaQueueMem_;}
//! Returns DMA flush management structure
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
//! Releases GSL memory objects allocated on this queue
void releaseMemObjects(bool scratch = true);
//! Returns the HW ring used on this virtual device
uint hwRing() const { return hwRing_; }
//! Returns current timestamp object for profiling
TimeStamp* currTs() const { return cal_.lastTS_; }
//! Returns virtual queue object for device enqueuing
Memory* vQueue() const { return virtualQueue_; }
//! Update virtual queue header
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
//! Returns TRUE if virtual queue was successfully allocatted
bool createVirtualQueue(
uint deviceQueueSize //!< Device queue size
);
EngineType engineID_; //!< Engine ID for this VirtualGPU
State state_; //!< virtual GPU current state
CalVirtualDesc cal_; //!< CAL virtual device descriptor
void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
//! Returns PAL command buffer interface
Pal::ICmdBuffer* iCmd() const {
Queue* queue = queues_[engineID_];
return queue->iCmd();
}
//! Returns queue, associated with VirtualGPU
Queue& queue(EngineType id) const { return *queues_[id]; }
void flushCUCaches() const
{
Pal::BarrierInfo barrier = {};
barrier.pipePointWaitCount = 1;
Pal::HwPipePoint point = Pal::HwPipePostCs;
barrier.pPipePoints = &point;
barrier.transitionCount = 1;
Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
{nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
barrier.pTransitions = &trans;
barrier.waitPoint = Pal::HwPipePreCs;
iCmd()->CmdBarrier(barrier);
}
void eventBegin(EngineType engId) const {
const static bool Begin = true;
profileEvent(engId, Begin);
}
void eventEnd(EngineType engId, GpuEvent& event) const {
const static bool End = false;
profileEvent(engId, End);
event.id = queues_[engId]->submit();
event.engineId_ = engId;
}
void waitForEvent(GpuEvent* event) const {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
queues_[event->engineId_]->waitForEvent(event->id);
event->invalidate();
}
}
bool isDone(GpuEvent* event) {
if (event->isValid()) {
assert(event->engineId_ < AllEngines);
if (queues_[event->engineId_]->isDone(event->id)) {
event->invalidate();
return true;
}
return false;
}
return true;
}
//! Returns TRUE if SDMA requires overlap synchronizaiton
bool validateSdmaOverlap(
const Resource& src, //!< Source resource for SDMA transfer
const Resource& dst //!< Destination resource for SDMA transfer
);
protected:
void profileEvent(EngineType engine, bool type) const;
//! Creates buffer object from image
amd::Memory* createBufferFromImage(
amd::Memory& amdImage //! The parent image object(untiled images only)
) const;
private:
struct MemoryRange {
uint64_t start_; //!< Memory range start address
uint64_t end_; //!< Memory range end address
MemoryRange(): start_(0), end_(0) {}
};
typedef std::map<const Pal::IGpuMemory*, GpuEvent> GpuEvents;
//! Finds total amount of necessary iterations
inline void findIterations(
const amd::NDRangeContainer& sizes, //!< Original workload sizes
const amd::NDRange& local, //!< Local workgroup size
amd::NDRange& groups, //!< Calculated workgroup sizes
amd::NDRange& remainder, //!< Calculated remainder sizes
size_t& extra //!< Amount of extra executions for remainder
);
//! Allocates constant buffers
bool allocConstantBuffers();
//! Releases stage write buffers
void releaseXferWrite();
//! Allocate hsaQueueMem_
bool allocHsaQueueMem();
//! Awaits a command batch with a waiting event
bool awaitCompletion(
CommandBatch* cb, //!< Command batch for to wait
const amd::Event* waitingEvent = NULL //!< A waiting event
);
//! Detects memory dependency for HSAIL kernels and flushes caches
bool processMemObjectsHSA(
const amd::Kernel& kernel, //!< AMD kernel object for execution
const_address params, //!< Pointer to the param's store
bool nativeMem, //!< Native memory objects
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
);
//! Common function for fill memory used by both svm Fill and non-svm fill
bool fillMemory(
cl_command_type type, //!< the command type
amd::Memory* amdMemory, //!< memory object to fill
const void* pattern, //!< pattern to fill the memory
size_t patternSize, //!< pattern size
const amd::Coord3D& origin, //!< memory origin
const amd::Coord3D& size //!< memory size for filling
);
bool copyMemory(
cl_command_type type, //!< the command type
amd::Memory& srcMem, //!< source memory object
amd::Memory& dstMem, //!< destination memory object
bool entire, //!< flag of entire memory copy
const amd::Coord3D& srcOrigin, //!< source memory origin
const amd::Coord3D& dstOrigin, //!< destination memory object
const amd::Coord3D& size, //!< copy size
const amd::BufferRect& srcRect, //!< region of source for copy
const amd::BufferRect& dstRect //!< region of destination for copy
);
void buildKernelInfo(
const HSAILKernel& hsaKernel, //!< hsa kernel
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
);
void assignDebugTrapHandler(
const DebugToolInfo& dbgSetting, //!< debug settings
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
);
GpuEvents gpuEvents_; //!< GPU events
Device& gpuDevice_; //!< physical GPU device
amd::Monitor execution_; //!< Lock to serialise access to all device objects
uint index_; //!< The virtual device unique index
PrintfDbg* printfDbg_; //!< GPU printf implemenation
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation
TimeStampCache* tsCache_; //!< TimeStamp cache
MemoryDependency memoryDependency_; //!< Memory dependency class
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
std::list<Memory*> xferWriteBuffers_; //!< Stage write buffers
std::list<amd::Memory*> pinnedMems_;//!< Pinned memory list
typedef std::list<CommandBatch*> CommandBatchList;
CommandBatchList cbList_; //!< List of command batches
uint hwRing_; //!< HW ring used on this virtual device
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
TimeStamp* currTs_; //!< current timestamp for command
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
Memory* virtualQueue_; //!< Virtual device queue
Memory* schedParams_; //!< The scheduler parameters
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
uint deviceQueueSize_; //!< Device queue size
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
Queue* queues_[AllEngines]; //!< HW queues for all engines
MemoryRange sdmaRange_; //!< SDMA memory range for write access
};
/*@}*/} // namespace pal
#endif /*PALVIRTUAL_HPP_*/
+354
查看文件
@@ -0,0 +1,354 @@
//
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
//
#include "device/pal/palkernel.hpp"
#include "device/pal/palwavelimiter.hpp"
#include "os/os.hpp"
#include "utils/flags.hpp"
#include <cstdlib>
using namespace std;
namespace pal {
uint WaveLimiter::MaxWave;
uint WaveLimiter::WarmUpCount;
uint WaveLimiter::RunCount;
uint WLAlgorithmSmooth::AdaptCount;
uint WLAlgorithmSmooth::AbandonThresh;
uint WLAlgorithmSmooth::DscThresh;
WaveLimiter::WaveLimiter(
HSAILKernel* owner,
uint seqNum,
bool enable,
bool enableDump):
owner_(owner),
dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump) {
auto gpuDev = static_cast<const Device*>(&owner_->dev());
Unimplemented();
//auto attrib = gpuDev->getAttribs();
auto hwInfo = gpuDev->hwInfo();
setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
/*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;
state_ = WARMUP;
if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
traceStream_.open(std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() +
".txt");
}
waves_ = MaxWave;
currWaves_ = MaxWave;
bestWave_ = MaxWave;
enable_ = enable;
}
WaveLimiter::~WaveLimiter() {
if (traceStream_.is_open()) {
traceStream_.close();
}
}
uint WaveLimiter::getWavesPerSH(){
currWaves_ = waves_;
return waves_ * SIMDPerSH_;
}
WLAlgorithmSmooth::WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump):
WaveLimiter(owner, seqNum, enable, enableDump) {
AdaptCount = 2 * MaxWave + 1;
AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;
dynRunCount_ = RunCount;
measure_.resize(MaxWave + 1);
reference_.resize(MaxWave + 1);
trial_.resize(MaxWave + 1);
ratio_.resize(MaxWave + 1);
clearData();
}
WLAlgorithmSmooth::~WLAlgorithmSmooth() {
}
void WLAlgorithmSmooth::clearData() {
waves_ = MaxWave;
countAll_ = 0;
clear(measure_);
clear(reference_);
clear(trial_);
clear(ratio_);
discontinuous_ = false;
dataCount_ = 0;
}
void WLAlgorithmSmooth::updateData(ulong time) {
auto count = dataCount_ - 1;
assert(count < 2 * MaxWave + 1);
assert(time > 0);
assert(currWaves_ == waves_);
if (count % 2 == 0) {
assert(waves_ == MaxWave);
auto pos = count / 2;
measure_[pos] = time;
if (pos > 0) {
auto wave = MaxWave + 1 - pos;
if (abs(static_cast<long>(measure_[pos - 1]) -
static_cast<long>(measure_[pos])) * 100 / measure_[pos] >
DscThresh) {
discontinuous_ = true;
}
reference_[wave] = (time + measure_[pos - 1]) / 2;
ratio_[wave] = trial_[wave] * 100 / reference_[wave];
if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
bestWave_ = wave;
}
}
} else {
assert(waves_ == MaxWave - count / 2);
trial_[waves_] = time;
}
outputTrace();
}
void WLAlgorithmSmooth::outputTrace() {
if (!traceStream_.is_open()) {
return;
}
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
output(traceStream_, "\n reference = ", reference_);
output(traceStream_, "\n ratio = ", ratio_);
traceStream_ << "\n\n";
}
void WLAlgorithmSmooth::callback(ulong duration) {
dumper_.addData(duration, currWaves_, static_cast<char>(state_));
if (!enable_) {
return;
}
countAll_++;
switch (state_) {
case WARMUP:
if (countAll_ < WarmUpCount) {
return;
}
state_ = ADAPT;
bestWave_ = MaxWave;
clearData();
return;
case ADAPT:
assert(duration > 0);
if (waves_ == currWaves_) {
dataCount_++;
updateData(duration);
waves_ = MaxWave + 1 - dataCount_ / 2;
if (dataCount_ == 1 || (dataCount_ < AdaptCount &&
!discontinuous_ && (dataCount_ % 2 == 0 ||
ratio_[waves_] < AbandonThresh))) {
if (dataCount_ % 2 == 1) {
--waves_;
} else {
waves_ = MaxWave;
}
return;
}
waves_ = bestWave_;
if (dataCount_ >= AdaptCount) {
dynRunCount_ = RunCount;
} else {
dynRunCount_ = AdaptCount;
}
countAll_ = rand() % MaxWave;
state_ = RUN;
}
return;
case RUN:
if (countAll_ < dynRunCount_) {
return;
}
state_ = ADAPT;
bestWave_ = MaxWave;
clearData();
return;
}
}
WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable) {
enable_ = enable;
if (enable_) {
fileName_ = std::string(GPU_WAVE_LIMIT_DUMP) + kernelName + ".csv";
}
}
WaveLimiter::DataDumper::~DataDumper() {
if (!enable_) {
return;
}
std::ofstream OFS(fileName_);
for (size_t i = 0, e = time_.size(); i != e; ++i) {
OFS << i << ',' << time_[i] << ',' << wavePerSIMD_[i] << ','
<< static_cast<uint>(state_[i]) << '\n';
}
OFS.close();
}
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
if (!enable_) {
return;
}
time_.push_back(time);
wavePerSIMD_.push_back(wave);
state_.push_back(state);
}
WLAlgorithmAvrg::WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump):
WaveLimiter(owner, seqNum, enable, enableDump) {
measure_.resize(MaxWave + 1);
clear(measure_);
countAll_ = 0;
}
WLAlgorithmAvrg::~WLAlgorithmAvrg() {
}
void WLAlgorithmAvrg::outputTrace() {
if (!traceStream_.is_open()) {
return;
}
traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_
<< " currWaves=" << currWaves_ << " waves=" << waves_
<< " bestWave=" << bestWave_ << '\n';
output(traceStream_, "\n measure = ", measure_);
traceStream_ << "\n\n";
}
void WLAlgorithmAvrg::callback(ulong duration) {
dumper_.addData(duration, currWaves_, static_cast<char>(state_));
if (!enable_) {
return;
}
countAll_++;
switch (state_) {
case WARMUP:
state_ = ADAPT;
case ADAPT:
measure_[waves_] += duration;
if (countAll_ <= MaxWave * 5) {
waves_--;
if (waves_ == 0) {
waves_ = MaxWave;
}
}
else {
bestWave_ = MaxWave;
for (uint i=1; i<MaxWave; i++ ) {
if (measure_[i] < measure_[bestWave_]) {
bestWave_ = i;
}
}
waves_ = bestWave_;
state_ = RUN;
}
break;
case RUN:
default:
break;
}
}
WaveLimiterManager::WaveLimiterManager(HSAILKernel* kernel):
owner_(kernel),
enable_(false),
enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
auto gpuDev = static_cast<const Device*>(&owner_->dev());
Unimplemented();
//auto attrib = gpuDev->getAttribs();
auto hwInfo = gpuDev->hwInfo();
unsigned simdPerSH = 0;
setIfNotDefault(simdPerSH, GPU_WAVE_LIMIT_CU_PER_SH,
/*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);
fixed_ = GPU_WAVES_PER_SIMD * simdPerSH;
}
WaveLimiterManager::~WaveLimiterManager() {
for (auto &I: limiters_) {
delete I.second;
}
}
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice *vdev) const {
if (fixed_ > 0) {
return fixed_;
}
if (!enable_) {
return 0;
}
auto loc = limiters_.find(vdev);
if (loc == limiters_.end()) {
return 0;
}
assert(loc->second != nullptr);
return loc->second->getWavesPerSH();
}
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
const device::VirtualDevice *vdev) {
assert(vdev != nullptr);
if (!enable_ && !enableDump_) {
return nullptr;
}
amd::ScopedLock SL(monitor_);
auto loc = limiters_.find(vdev);
if (loc != limiters_.end()) {
return loc->second;
}
auto limiter = new WLAlgorithmSmooth(owner_, limiters_.size(), enable_,
enableDump_);
if (limiter == nullptr) {
enable_ = false;
return nullptr;
}
limiters_[vdev] = limiter;
return limiter;
}
void WaveLimiterManager::enable() {
if (fixed_ > 0) {
return;
}
auto gpuDev = static_cast<const Device*>(&owner_->dev());
auto hwInfo = gpuDev->hwInfo();
Unimplemented();
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
// Disabled for SI due to bug #10817
setIfNotDefault(enable_, GPU_WAVE_LIMIT_ENABLE,
/*owner_->workGroupInfo()->limitWave_*/ false && gpuDev->settings().ciPlus_);
}
} // namespace pal
+154
查看文件
@@ -0,0 +1,154 @@
//
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
//
#ifndef PALWAVELIMITER_HPP_
#define PALWAVELIMITER_HPP_
#include "platform/command.hpp"
#include "thread/thread.hpp"
#include <cstdio>
#include <cstdlib>
#include <cstdint>
#include <fstream>
#include <unordered_map>
//! \namespace pal PAL Device Implementation
namespace pal {
class HSAILKernel;
// Adaptively limit the number of waves per SIMD based on kernel execution time
class WaveLimiter: public amd::ProfilingCallback {
public:
explicit WaveLimiter(HSAILKernel*, uint seqNum, bool enable, bool enableDump);
virtual ~WaveLimiter();
//! Get waves per shader array to be used for kernel execution.
uint getWavesPerSH();
protected:
enum StateKind {
WARMUP, ADAPT, RUN
};
class DataDumper {
public:
explicit DataDumper(const std::string &kernelName, bool enable);
~DataDumper();
//! Record execution time, waves/simd and state of wave limiter.
void addData(ulong time, uint wave, char state);
//! Whether this data dumper is enabled.
bool enabled() const { return enable_;}
private:
bool enable_;
std::string fileName_;
std::vector<ulong> time_;
std::vector<uint> wavePerSIMD_;
std::vector<char> state_;
};
std::vector<ulong> measure_;
bool enable_;
uint SIMDPerSH_; // Number of SIMDs per SH
uint waves_; // Waves per SIMD to be set
uint bestWave_; // Optimal waves per SIMD
uint countAll_; // Number of kernel executions
StateKind state_;
HSAILKernel *owner_;
DataDumper dumper_;
std::ofstream traceStream_;
uint currWaves_; // Current waves per SIMD
static uint MaxWave; // Maximum number of waves per SIMD
static uint WarmUpCount; // Number of kernel executions for warm up
static uint RunCount; // Number of kernel executions for normal run
//! Call back from Event::recordProfilingInfo to get execution time.
virtual void callback(ulong duration)=0;
//! Output trace of measurement/adaptation.
virtual void outputTrace()=0;
template<class T> void clear(T& A) {
for (auto &I : A) {
I = 0;
}
}
template<class T> void output(std::ofstream &ofs, const std::string &prompt,
T& A) {
ofs << prompt;
for (auto &I : A) {
ofs << ' ' << static_cast<ulong>(I);
}
}
};
class WLAlgorithmSmooth: public WaveLimiter {
public:
explicit WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmSmooth();
private:
std::vector<ulong> reference_;
std::vector<ulong> trial_;
std::vector<ulong> ratio_;
bool discontinuous_; // Measured data is discontinuous
uint dynRunCount_;
uint dataCount_;
static uint AdaptCount; // Number of kernel executions for adapting
static uint AbandonThresh; // Threshold to abandon adaptation
static uint DscThresh; // Threshold for identifying discontinuities
//! Update measurement data and optimal waves/simd with execution time.
void updateData(ulong time);
//! Clear measurement data for the next adaptation.
void clearData();
//! Call back from Event::recordProfilingInfo to get execution time.
void callback(ulong duration);
//! Output trace of measurement/adaptation.
void outputTrace();
};
class WLAlgorithmAvrg: public WaveLimiter {
public:
explicit WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
virtual ~WLAlgorithmAvrg();
private:
//! Call back from Event::recordProfilingInfo to get execution time.
void callback(ulong duration);
//! Output trace of measurement/adaptation.
void outputTrace();
};
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
class WaveLimiterManager {
public:
explicit WaveLimiterManager(HSAILKernel* owner);
virtual ~WaveLimiterManager();
//! Get waves per shader array for a specific virtual device.
uint getWavesPerSH(const device::VirtualDevice *) const;
//! Provide call back function for a specific virtual device.
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
//! Enable wave limiter manager by kernel metadata and flags.
void enable();
private:
HSAILKernel* owner_; //!< The kernel which owns this object
std::unordered_map<const device::VirtualDevice *,
WaveLimiter*> limiters_; //!< Maps virtual device to wave limiter
bool enable_; //!< Whether the adaptation is enabled
bool enableDump_; //!< Whether the data dumper is enabled
uint fixed_; //!< The fixed waves/simd value if not zero
amd::Monitor monitor_; //!< The mutex for updating the wave limiter map
};
}
#endif
+8
查看文件
@@ -137,6 +137,10 @@ class HeapObject
public:
void* operator new(size_t size);
void operator delete(void* obj);
void* operator new(size_t size, size_t extSize)
{ return HeapObject::operator new (size + extSize); };
void operator delete(void* obj, size_t extSize)
{ HeapObject::operator delete (obj); }
};
/*! \brief For all reference counted objects.
@@ -154,6 +158,10 @@ public:
void* operator new(size_t size) { return ::operator new(size); }
void operator delete(void* p) { return ::operator delete(p); }
void* operator new(size_t size, size_t extSize)
{ return ReferenceCountedObject::operator new (size + extSize); };
void operator delete(void* obj, size_t extSize)
{ ReferenceCountedObject::operator delete (obj); }
uint referenceCount() const { return referenceCount_; }