P4 to Git Change 1230521 by gandryey@gera-w8 on 2016/01/22 17:58:08
SWDEV-86035 - Add OCL backend for PAL
- PAL backend build is disabled by default. "BUILD_PAL_DEVICE = yes" enables the build. You also have to update the client workspace with PAL mapping: //depot/stg/pal/... //<your_opencl_location>/runtime/device/pal/palbe/...
Affected files ...
... //depot/stg/opencl/drivers/opencl/api/opencl/amdocl/build/Makefile.api#130 edit
... //depot/stg/opencl/drivers/opencl/compiler/sclibdefs.opencl#8 edit
... //depot/stg/opencl/drivers/opencl/opencldefs#166 edit
... //depot/stg/opencl/drivers/opencl/openclrules#91 edit
... //depot/stg/opencl/drivers/opencl/runtime/Makefile#21 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/device.cpp#192 edit
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/Makefile#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/build/Makefile.pal#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palappprofile.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palbinary.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palblit.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcompiler.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palconstbuf.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palcounters.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugger.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldebugmanager.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldefs.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevice.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d10.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d11.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldeviced3d9.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paldevicegl.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palkernel.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palmemory.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprintf.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palprogram.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palresource.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsched.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palschedcl.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palsettings.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palthreadtrace.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltimestamp.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/paltrap.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palvirtual.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.cpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/device/pal/palwavelimiter.hpp#1 add
... //depot/stg/opencl/drivers/opencl/runtime/runtimedefs#36 edit
... //depot/stg/opencl/drivers/opencl/runtime/top.hpp#23 edit
[ROCm/clr commit: c99d679c9e]
This commit is contained in:
@@ -15,6 +15,13 @@ extern amd::AppProfile* oclhsaCreateAppProfile();
|
||||
#include "device/cpu/cpudevice.hpp"
|
||||
#endif // WITH_CPU_DEVICE
|
||||
|
||||
#if defined(WITH_PAL_DEVICE)
// Entry points of the PAL backend. They are declared at global scope for now;
// the pal namespace wrapper below is intentionally left commented out.
//namespace pal {
//! Loads the PAL device(s); the result is OR-ed into the status of Device::init()
bool PalDeviceLoad();
//! Unloads the PAL device(s); invoked from Device::tearDown()
void PalDeviceUnload();
//}
#endif // WITH_PAL_DEVICE
|
||||
|
||||
#if defined(WITH_GPU_DEVICE)
|
||||
extern bool DeviceLoad();
|
||||
extern void DeviceUnload();
|
||||
@@ -177,9 +184,12 @@ Device::init()
|
||||
ret |= oclhsa::NullDevice::init();
|
||||
}
|
||||
#endif // WITH_HSA_DEVICE
|
||||
#if defined(WITH_GPU_DEVICE)
|
||||
#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
|
||||
ret |= DeviceLoad();
|
||||
#endif // WITH_GPU_DEVICE
|
||||
#if defined(WITH_PAL_DEVICE)
|
||||
ret |= PalDeviceLoad();
|
||||
#endif // WITH_PAL_DEVICE
|
||||
#if defined(WITH_CPU_DEVICE)
|
||||
ret |= cpu::Device::init();
|
||||
#endif // WITH_CPU_DEVICE
|
||||
@@ -203,9 +213,12 @@ Device::tearDown()
|
||||
oclhsaAppProfile_ = NULL;
|
||||
}
|
||||
#endif // WITH_HSA_DEVICE
|
||||
#if defined(WITH_GPU_DEVICE)
|
||||
#if defined(WITH_GPU_DEVICE) && !defined(WITH_PAL_DEVICE)
|
||||
DeviceUnload();
|
||||
#endif // WITH_GPU_DEVICE
|
||||
#if defined(WITH_PAL_DEVICE)
|
||||
PalDeviceUnload();
|
||||
#endif // WITH_PAL_DEVICE
|
||||
#if defined(WITH_CPU_DEVICE)
|
||||
cpu::Device::tearDown();
|
||||
#endif // WITH_CPU_DEVICE
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#include "top.hpp"
|
||||
#include "utils/debug.hpp"
|
||||
#include "device/appprofile.hpp"
|
||||
#include "device/pal/palappprofile.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Sets the default values of the PAL-specific application profile properties
//! and registers them by name in the property map of the base profile.
AppProfile::AppProfile()
    : amd::AppProfile()
    , enableHighPerformanceState_(true)
    , reportAsOCL12Device_(false)
{
    // "HighPerfState" -> enableHighPerformanceState_
    const DataMap::value_type highPerfEntry(
        "HighPerfState",
        PropertyData(DataType_Boolean, &enableHighPerformanceState_));
    propertyDataMap_.insert(highPerfEntry);

    // "OCL12Device" -> reportAsOCL12Device_
    const DataMap::value_type ocl12Entry(
        "OCL12Device",
        PropertyData(DataType_Boolean, &reportAsOCL12Device_));
    propertyDataMap_.insert(ocl12Entry);
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef PALAPPPROFILE_HPP_
|
||||
#define PALAPPPROFILE_HPP_
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
namespace pal {
|
||||
|
||||
class AppProfile : public amd::AppProfile
|
||||
{
|
||||
public:
|
||||
AppProfile();
|
||||
|
||||
//! return the value of enableHighPerformanceState_
|
||||
bool enableHighPerformanceState() const { return enableHighPerformanceState_; }
|
||||
bool reportAsOCL12Device() const { return reportAsOCL12Device_; }
|
||||
|
||||
private:
|
||||
|
||||
bool enableHighPerformanceState_;
|
||||
bool reportAsOCL12Device_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif // PALAPPPROFILE_HPP_
|
||||
@@ -0,0 +1,7 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
namespace pal {
|
||||
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,48 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALBINARY_HPP_
|
||||
#define PALBINARY_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "device/pal/palkernel.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
class ClBinaryHsa : public device::ClBinary
|
||||
{
|
||||
public:
|
||||
ClBinaryHsa(const Device& dev, BinaryImageFormat bifVer = BIF_VERSION3)
|
||||
: device::ClBinary(dev, bifVer)
|
||||
{}
|
||||
|
||||
//! Destructor
|
||||
~ClBinaryHsa() {}
|
||||
|
||||
|
||||
protected:
|
||||
bool setElfTarget() {
|
||||
uint32_t target = static_cast<uint32_t>(21);//dev().calTarget());
|
||||
assert (((0xFFFF8000 & target) == 0) && "ASIC target ID >= 2^15");
|
||||
uint16_t elf_target = (uint16_t)(0x7FFF & target);
|
||||
return elfOut()->setTarget(elf_target, amd::OclElf::CAL_PLATFORM);
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
ClBinaryHsa(const ClBinaryHsa&);
|
||||
|
||||
//! Disable default operator=
|
||||
ClBinaryHsa& operator=(const ClBinaryHsa&);
|
||||
|
||||
//! Returns the HSA device for this object
|
||||
const Device& dev() const { return static_cast<const Device&>(dev_); }
|
||||
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALBINARY_HPP_
|
||||
|
||||
File diff suppressed because it is too large
Load diff
@@ -0,0 +1,451 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALBLIT_HPP_
|
||||
#define PALBLIT_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "platform/command.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/blit.hpp"
|
||||
|
||||
/*! \addtogroup PAL Blit Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Blit Manager Implementation
|
||||
namespace pal {
|
||||
|
||||
class Device;
|
||||
class Kernel;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
//! DMA Blit Manager
|
||||
class DmaBlitManager : public device::HostBlitManager
|
||||
{
|
||||
public:
|
||||
//! Constructor
|
||||
DmaBlitManager(
|
||||
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~DmaBlitManager() {}
|
||||
|
||||
//! Creates DmaBlitManager object
|
||||
virtual bool create(amd::Device& device) { return true; }
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destinaiton host memory
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to system memory
|
||||
virtual bool readImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBuffer(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBufferRect(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to an image object
|
||||
virtual bool writeImage(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& srcRect, //!< Source rectangle
|
||||
const amd::BufferRect& dstRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
virtual bool copyImageToBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBufferToImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to another image object
|
||||
virtual bool copyImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
protected:
|
||||
const static uint MaxPinnedBuffers = 4;
|
||||
|
||||
//! Synchronizes the blit operations if necessary
|
||||
inline void synchronize() const;
|
||||
|
||||
//! Returns the virtual GPU object
|
||||
VirtualGPU& gpu() const { return static_cast<VirtualGPU&>(vDev_); }
|
||||
|
||||
//! Returns the GPU device object
|
||||
const Device& dev() const { return static_cast<const Device&>(dev_); };
|
||||
|
||||
inline Memory& gpuMem(device::Memory& mem) const;
|
||||
|
||||
//! Pins host memory for GPU access
|
||||
amd::Memory* pinHostMemory(
|
||||
const void* hostMem, //!< Host memory pointer
|
||||
size_t pinSize, //!< Host memory size
|
||||
size_t& partial //!< Extra offset for memory alignment
|
||||
) const;
|
||||
|
||||
const size_t MinSizeForPinnedTransfer;
|
||||
bool completeOperation_; //!< DMA blit manager must complete operation
|
||||
amd::Context* context_; //!< A dummy context
|
||||
|
||||
private:
|
||||
|
||||
//! Disable copy constructor
|
||||
DmaBlitManager(const DmaBlitManager&);
|
||||
|
||||
//! Disable operator=
|
||||
DmaBlitManager& operator=(const DmaBlitManager&);
|
||||
|
||||
//! Reads video memory, using a staged buffer
|
||||
bool readMemoryStaged(
|
||||
Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
Memory** xferBuf, //!< Staged buffer for read
|
||||
size_t origin, //!< Original offset in the source memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
|
||||
//! Write into video memory, using a staged buffer
|
||||
bool writeMemoryStaged(
|
||||
const void* srcHost, //!< Source host memory
|
||||
Memory& dstMemory, //!< Destination memory object
|
||||
Memory& xferBuf, //!< Staged buffer for write
|
||||
size_t origin, //!< Original offset in the destination memory
|
||||
size_t& offset, //!< Offset for the current copy pointer
|
||||
size_t& totalSize, //!< Total size for the copy region
|
||||
size_t xferSize //!< Transfer size
|
||||
) const;
|
||||
};
|
||||
|
||||
//! Kernel Blit Manager
|
||||
class KernelBlitManager : public DmaBlitManager
|
||||
{
|
||||
public:
|
||||
enum {
|
||||
BlitCopyImage = 0,
|
||||
BlitCopyImage1DA,
|
||||
BlitCopyImageToBuffer,
|
||||
BlitCopyBufferToImage,
|
||||
BlitCopyBufferRect,
|
||||
BlitCopyBufferRectAligned,
|
||||
BlitCopyBuffer,
|
||||
BlitCopyBufferAligned,
|
||||
FillBuffer,
|
||||
FillImage,
|
||||
Scheduler,
|
||||
BlitTotal
|
||||
};
|
||||
|
||||
//! Constructor
|
||||
KernelBlitManager(
|
||||
VirtualGPU& gpu, //!< Virtual GPU to be used for blits
|
||||
Setup setup = Setup() //!< Specifies HW accelerated blits
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
virtual ~KernelBlitManager();
|
||||
|
||||
//! Creates DmaBlitManager object
|
||||
virtual bool create(amd::Device& device);
|
||||
|
||||
//! Copies a buffer object to another buffer object
|
||||
virtual bool copyBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& srcRectIn, //!< Source rectangle
|
||||
const amd::BufferRect& dstRectIn, //!< Destination rectangle
|
||||
const amd::Coord3D& sizeIn, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to system memory
|
||||
virtual bool readBufferRect(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destinaiton host memory
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBuffer(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to a buffer object
|
||||
virtual bool writeBufferRect(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::BufferRect& hostRect, //!< Destination rectangle
|
||||
const amd::BufferRect& bufRect, //!< Source rectangle
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
virtual bool copyBufferToImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
virtual bool copyImageToBuffer(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to another image object
|
||||
virtual bool copyImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies an image object to system memory
|
||||
virtual bool readImage(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
void* dstHost, //!< Destination host memory
|
||||
const amd::Coord3D& origin, //!< Source origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Copies system memory to an image object
|
||||
virtual bool writeImage(
|
||||
const void* srcHost, //!< Source host memory
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
size_t rowPitch, //!< Row pitch for host memory
|
||||
size_t slicePitch, //!< Slice pitch for host memory
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills a buffer memory with a pattern data
|
||||
virtual bool fillBuffer(
|
||||
device::Memory& memory, //!< Memory object to fill with pattern
|
||||
const void* pattern, //!< Pattern data
|
||||
size_t patternSize, //!< Pattern size
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills an image memory with a pattern data
|
||||
virtual bool fillImage(
|
||||
device::Memory& dstMemory, //!< Memory object to fill with pattern
|
||||
const void* pattern, //!< Pattern data
|
||||
const amd::Coord3D& origin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false //!< Entire buffer will be updated
|
||||
) const;
|
||||
|
||||
//! Fills an image memory with a pattern data
|
||||
virtual bool runScheduler(
|
||||
device::Memory& vqueue, //!< Memory object for virtual queue
|
||||
device::Memory& params, //!< Extra arguments for the scheduler
|
||||
uint paramIdx, //!< Parameter index
|
||||
uint threads //!< Number of scheduling threads
|
||||
) const;
|
||||
|
||||
private:
|
||||
static const size_t MaxXferBuffers = 2;
|
||||
|
||||
//! Copies a buffer object to an image object
|
||||
bool copyBufferToImageKernel(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Copies an image object to a buffer object
|
||||
bool copyImageToBufferKernel(
|
||||
device::Memory& srcMemory, //!< Source memory object
|
||||
device::Memory& dstMemory, //!< Destination memory object
|
||||
const amd::Coord3D& srcOrigin, //!< Source origin
|
||||
const amd::Coord3D& dstOrigin, //!< Destination origin
|
||||
const amd::Coord3D& size, //!< Size of the copy region
|
||||
bool entire = false, //!< Entire buffer will be updated
|
||||
size_t rowPitch = 0, //!< Pitch for buffer
|
||||
size_t slicePitch = 0 //!< Slice for buffer
|
||||
) const;
|
||||
|
||||
//! Creates a program for all blit operations
|
||||
bool createProgram(
|
||||
Device& device //!< Device object
|
||||
);
|
||||
|
||||
//! Creates a view memory object
|
||||
Memory* createView(
|
||||
const Memory& parent, //!< Parent memory object
|
||||
const cl_image_format format //!< The new format for a view
|
||||
) const;
|
||||
|
||||
//! Disable copy constructor
|
||||
KernelBlitManager(const KernelBlitManager&);
|
||||
|
||||
//! Disable operator=
|
||||
KernelBlitManager& operator=(const KernelBlitManager&);
|
||||
|
||||
amd::Program* program_; //!< GPU program obejct
|
||||
amd::Kernel* kernels_[BlitTotal]; //!< GPU kernels for blit
|
||||
amd::Memory* constantBuffer_; //!< An internal CB for blits
|
||||
amd::Memory* xferBuffers_[MaxXferBuffers]; //!< Transfer buffers for images
|
||||
size_t xferBufferSize_; //!< Transfer buffer size
|
||||
amd::Monitor* lockXferOps_; //!< Lock transfer operation
|
||||
};
|
||||
|
||||
//! Name of every blit kernel, indexed by the KernelBlitManager enumerators
//! (the entry order must match the enum declaration order).
static const char* BlitName[KernelBlitManager::BlitTotal] = {
    "copyImage",                // BlitCopyImage
    "copyImage1DA",             // BlitCopyImage1DA
    "copyImageToBuffer",        // BlitCopyImageToBuffer
    "copyBufferToImage",        // BlitCopyBufferToImage
    "copyBufferRect",           // BlitCopyBufferRect
    "copyBufferRectAligned",    // BlitCopyBufferRectAligned
    "copyBuffer",               // BlitCopyBuffer
    "copyBufferAligned",        // BlitCopyBufferAligned
    "fillBuffer",               // FillBuffer
    "fillImage",                // FillImage
    "scheduler"                 // Scheduler
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALBLIT_HPP_*/
|
||||
@@ -0,0 +1,147 @@
|
||||
//
|
||||
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
|
||||
#include "os/os.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "device/pal/palprogram.hpp"
|
||||
#include "device/pal/palkernel.hpp"
|
||||
#include "utils/options.hpp"
|
||||
#include <cstdio>
|
||||
|
||||
//CLC_IN_PROCESS_CHANGE
|
||||
extern int openclFrontEnd(const char* cmdline, std::string*, std::string* typeInfo = nullptr);
|
||||
|
||||
namespace pal {
|
||||
|
||||
bool
|
||||
HSAILProgram::compileImpl(
|
||||
const std::string& sourceCode,
|
||||
const std::vector<const std::string*>& headers,
|
||||
const char** headerIncludeNames,
|
||||
amd::option::Options* options)
|
||||
{
|
||||
acl_error errorCode;
|
||||
aclTargetInfo target;
|
||||
|
||||
std::string arch = "hsail";
|
||||
if (dev().settings().use64BitPtr_) {
|
||||
arch += "64";
|
||||
}
|
||||
target = aclGetTargetInfo(arch.c_str(),
|
||||
dev().info().name_, &errorCode);
|
||||
|
||||
// end if asic info is ready
|
||||
// We dump the source code for each program (param: headers)
|
||||
// into their filenames (headerIncludeNames) into the TEMP
|
||||
// folder specific to the OS and add the include path while
|
||||
// compiling
|
||||
|
||||
// Find the temp folder for the OS
|
||||
std::string tempFolder = amd::Os::getTempPath();
|
||||
std::string tempFileName = amd::Os::getTempFileName();
|
||||
|
||||
// Iterate through each source code and dump it into tmp
|
||||
std::fstream f;
|
||||
std::vector<std::string> headerFileNames(headers.size());
|
||||
std::vector<std::string> newDirs;
|
||||
for (size_t i = 0; i < headers.size(); ++i) {
|
||||
std::string headerPath = tempFolder;
|
||||
std::string headerIncludeName(headerIncludeNames[i]);
|
||||
// replace / in path with current os's file separator
|
||||
if (amd::Os::fileSeparator() != '/') {
|
||||
for (std::string::iterator it = headerIncludeName.begin(),
|
||||
end = headerIncludeName.end(); it != end; ++it) {
|
||||
if (*it == '/') *it = amd::Os::fileSeparator();
|
||||
}
|
||||
}
|
||||
size_t pos = headerIncludeName.rfind(amd::Os::fileSeparator());
|
||||
if (pos != std::string::npos) {
|
||||
headerPath += amd::Os::fileSeparator();
|
||||
headerPath += headerIncludeName.substr(0, pos);
|
||||
headerIncludeName = headerIncludeName.substr(pos+1);
|
||||
}
|
||||
if (!amd::Os::pathExists(headerPath)) {
|
||||
bool ret = amd::Os::createPath(headerPath);
|
||||
assert(ret && "failed creating path!");
|
||||
newDirs.push_back(headerPath);
|
||||
}
|
||||
std::string headerFullName =
|
||||
headerPath + amd::Os::fileSeparator() + headerIncludeName;
|
||||
headerFileNames[i] = headerFullName;
|
||||
f.open(headerFullName.c_str(), std::fstream::out);
|
||||
// Should we allow asserts
|
||||
assert(!f.fail() && "failed creating header file!");
|
||||
f.write(headers[i]->c_str(), headers[i]->length());
|
||||
f.close();
|
||||
}
|
||||
|
||||
// Create Binary
|
||||
binaryElf_ = aclBinaryInit(sizeof(aclBinary),
|
||||
&target, &binOpts_, &errorCode);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: aclBinary init failure\n";
|
||||
LogWarning("aclBinaryInit failed");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Insert opencl into binary
|
||||
errorCode = aclInsertSection(dev().compiler(), binaryElf_,
|
||||
sourceCode.c_str(), strlen(sourceCode.c_str()), aclSOURCE);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Inserting openCl Source to binary\n";
|
||||
}
|
||||
|
||||
// Set the options for the compiler
|
||||
// Set the include path for the temp folder that contains the includes
|
||||
if (!headers.empty()) {
|
||||
compileOptions_.append(" -I");
|
||||
compileOptions_.append(tempFolder);
|
||||
}
|
||||
|
||||
//Add only for CL2.0 and above
|
||||
if (options->oVariables->CLStd[2] >= '2') {
|
||||
std::stringstream opts;
|
||||
opts << " -D" << "CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE="
|
||||
<< device().info().maxGlobalVariableSize_;
|
||||
compileOptions_.append(opts.str());
|
||||
}
|
||||
|
||||
#if !defined(_LP64) && defined(ATI_OS_LINUX)
|
||||
if (options->origOptionStr.find("-cl-std=CL2.0") != std::string::npos && !dev().settings().force32BitOcl20_) {
|
||||
errorCode = ACL_UNSUPPORTED;
|
||||
LogWarning("aclCompile failed");
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Compile source to IR
|
||||
compileOptions_.append(hsailOptions());
|
||||
errorCode = aclCompile(dev().compiler(), binaryElf_, compileOptions_.c_str(),
|
||||
ACL_TYPE_OPENCL, ACL_TYPE_LLVMIR_BINARY, nullptr);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
LogWarning("aclCompile failed");
|
||||
buildLog_ += "Error: Compiling CL to IR\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
clBinary()->storeCompileOptions(compileOptions_);
|
||||
// Save the binary in the interface class
|
||||
size_t size = 0;
|
||||
void* mem = nullptr;
|
||||
aclWriteToMem(binaryElf_, &mem, &size);
|
||||
setBinary(static_cast<char*>(mem), size);
|
||||
|
||||
// Save the binary inside the program
|
||||
// The FSAILProgram will be responsible to free it during destruction
|
||||
rawBinary_ = mem;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,89 @@
|
||||
//
|
||||
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#include "device/pal/palconstbuf.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "device/pal/palsettings.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Constructs the constant buffer object. The system memory copy and the
//! write mapping are set up later by create().
ConstBuffer::ConstBuffer(
    VirtualGPU& gpu,    //!< Virtual GPU device object
    size_t      size)   //!< Size of the constant buffer in vectors
    : Memory(const_cast<pal::Device&>(gpu.dev()), size * VectorSize)
    , gpu_(gpu)
    // The destructor releases sysMemCopy_ unconditionally, so it must never
    // be left indeterminate when create() is not called or fails early
    , sysMemCopy_(nullptr)
    , size_(size * VectorSize)
    , wrtOffset_(0)
    , lastWrtSize_(0)
    , wrtAddress_(nullptr)
{
}
|
||||
|
||||
//! Releases the write mapping (if any) and the system memory copy
ConstBuffer::~ConstBuffer()
{
    // Drop the write mapping if one is currently held
    const bool isMapped = (nullptr != wrtAddress_);
    if (isMapped) {
        unmap(&gpu_);
    }

    // Free the system memory shadow of the constant buffer
    amd::AlignedMemory::deallocate(sysMemCopy_);
}
|
||||
|
||||
bool
|
||||
ConstBuffer::create()
|
||||
{
|
||||
// Create sysmem copy for the constant buffer
|
||||
sysMemCopy_ = reinterpret_cast<address>(amd::AlignedMemory::allocate(size_, 256));
|
||||
if (sysMemCopy_ == nullptr) {
|
||||
LogPrintfError("We couldn't allocate sysmem copy for constant buffer,\
|
||||
size(%d)!", size_);
|
||||
return false;
|
||||
}
|
||||
memset(sysMemCopy_, 0, size_);
|
||||
|
||||
if (!Memory::create(Resource::RemoteUSWC)) {
|
||||
LogPrintfError("We couldn't create HW constant buffer, size(%d)!", size_);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Constant buffer warm-up
|
||||
warmUpRenames(gpu_);
|
||||
|
||||
wrtAddress_ = map(&gpu_, Resource::Discard);
|
||||
if (wrtAddress_ == nullptr) {
|
||||
LogPrintfError("We couldn't map HW constant buffer, size(%d)!", size_);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
ConstBuffer::uploadDataToHw(size_t size)
|
||||
{
|
||||
static const size_t HwCbAlignment = 256;
|
||||
|
||||
// Align copy size on the vector's boundary
|
||||
size_t count = amd::alignUp(size, VectorSize);
|
||||
wrtOffset_ += lastWrtSize_;
|
||||
|
||||
// Check if CB has enough space for copy
|
||||
if ((wrtOffset_ + count) > size_) {
|
||||
if (wrtAddress_ != nullptr) {
|
||||
unmap(&gpu_);
|
||||
}
|
||||
wrtAddress_ = map(&gpu_, Resource::Discard);
|
||||
wrtOffset_ = 0;
|
||||
lastWrtSize_ = 0;
|
||||
}
|
||||
|
||||
// Update memory with new CB data
|
||||
memcpy((reinterpret_cast<char*>(wrtAddress_) + wrtOffset_), sysMemCopy_, count);
|
||||
|
||||
// Adjust the size by the HW CB buffer alignment
|
||||
lastWrtSize_ = amd::alignUp(size, HwCbAlignment);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,70 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef PALCONSTBUF_HPP_
|
||||
#define PALCONSTBUF_HPP_
|
||||
|
||||
#include "device/pal/palmemory.hpp"
|
||||
|
||||
//! \namespace pal PAL Resource Implementation
|
||||
namespace pal {
|
||||
|
||||
//! Cconstant buffer
|
||||
class ConstBuffer : public Memory
|
||||
{
|
||||
public:
|
||||
//! Vector size of the constant buffer
|
||||
static const size_t VectorSize = 16;
|
||||
|
||||
//! Constructor for the ConstBuffer class
|
||||
ConstBuffer(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t size //!< size of the constant buffer in vectors
|
||||
);
|
||||
|
||||
//! Destructor for the ConstBuffer class
|
||||
~ConstBuffer();
|
||||
|
||||
//! Creates the real HW constant buffer
|
||||
bool create();
|
||||
|
||||
/*! \brief Uploads current constant buffer data from sysMemCopy_ to HW
|
||||
*
|
||||
* \return True if the data upload was succesful
|
||||
*/
|
||||
bool uploadDataToHw(
|
||||
size_t size //!< real data size for upload
|
||||
);
|
||||
|
||||
//! Returns a pointer to the system memory copy for CB
|
||||
address sysMemCopy() const { return sysMemCopy_; }
|
||||
|
||||
//! Returns CB size
|
||||
size_t size() const { return size_; }
|
||||
|
||||
//! Returns current write offset for the constant buffer
|
||||
size_t wrtOffset() const { return wrtOffset_; }
|
||||
|
||||
//! Returns last write size for the constant buffer
|
||||
size_t lastWrtSize() const { return lastWrtSize_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
ConstBuffer(const ConstBuffer&);
|
||||
|
||||
//! Disable operator=
|
||||
ConstBuffer& operator=(const ConstBuffer&);
|
||||
|
||||
VirtualGPU& gpu_; //!< Virtual GPU object
|
||||
address sysMemCopy_; //!< System memory copy
|
||||
size_t size_; //!< Constant buffer size
|
||||
size_t wrtOffset_; //!< Current write offset
|
||||
size_t lastWrtSize_; //!< Last write size
|
||||
void* wrtAddress_; //!< Write address in CB
|
||||
};
|
||||
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALCONSTBUF_HPP_*/
|
||||
@@ -0,0 +1,119 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palcounters.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Allocates a counter reference together with the storage PAL reports for
//! the performance experiment and creates the experiment in the memory that
//! directly follows the reference object.
//! NOTE(review): relies on the class operator new reserving the requested
//! number of extra bytes behind the object - confirm in ReferenceCountedObject.
//! \return The new reference, or nullptr on any failure
PalCounterReference*
PalCounterReference::Create(
    VirtualGPU& gpu,
    const Pal::PerfExperimentCreateInfo& createInfo)
{
    Pal::Result status;

    // Query how much memory PAL needs for the experiment object
    const size_t experimentSize =
        gpu.dev().iDev()->GetPerfExperimentSize(createInfo, &status);
    if (Pal::Result::Success != status) {
        return nullptr;
    }

    PalCounterReference* ref = new (experimentSize) PalCounterReference(gpu);
    if (nullptr == ref) {
        return ref;
    }

    // The PAL object lives right behind the reference object
    status = gpu.dev().iDev()->CreatePerfExperiment(
        createInfo, &ref[1], &ref->perfExp_);
    if (Pal::Result::Success == status) {
        return ref;
    }

    ref->release();
    return nullptr;
}
|
||||
|
||||
//! Destroys the PAL performance experiment and frees the results array
PalCounterReference::~PalCounterReference() {
    // The counter object is always associated with a particular queue,
    // so we have to lock just this queue
    amd::ScopedLock lock(gpu_.execution());
    if (nullptr != iPerf()) {
        iPerf()->Destroy();
    }

    // The results array is allocated in growResultArray() and owned by this
    // object; without this it leaked on every destroyed counter container
    delete [] results_;
    results_ = nullptr;
}
|
||||
|
||||
bool
|
||||
PalCounterReference::growResultArray(uint index) {
|
||||
if (results_ != nullptr) {
|
||||
delete [] results_;
|
||||
}
|
||||
results_ = new uint64_t [index + 1];
|
||||
if (results_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Drops the reference on the counter container taken in create()
PerfCounter::~PerfCounter()
{
    // Nothing to release if create() never attached a container
    if (calRef_ != nullptr) {
        // Release the counter reference object
        calRef_->release();
    }
}
|
||||
|
||||
bool
|
||||
PerfCounter::create(
|
||||
PalCounterReference* calRef)
|
||||
{
|
||||
assert(&gpu() == &calRef->gpu());
|
||||
|
||||
calRef_ = calRef;
|
||||
counter_ = calRef->iPerf();
|
||||
index_ = calRef->retain() - 2;
|
||||
calRef->growResultArray(index_);
|
||||
|
||||
// Initialize the counter
|
||||
Pal::PerfCounterInfo counterInfo = {};
|
||||
counterInfo.counterType = Pal::PerfCounterType::Global;
|
||||
counterInfo.block = static_cast<Pal::GpuBlock>(info_.blockIndex_);
|
||||
counterInfo.instance = info_.counterIndex_;
|
||||
counterInfo.eventId = info_.eventIndex_;
|
||||
Pal::Result result = counter_->AddCounter(counterInfo);
|
||||
if (result != Pal::Result::Success) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
uint64_t
|
||||
PerfCounter::getInfo(uint64_t infoType) const
|
||||
{
|
||||
switch (infoType) {
|
||||
case CL_PERFCOUNTER_GPU_BLOCK_INDEX: {
|
||||
// Return the GPU block index
|
||||
return info()->blockIndex_;
|
||||
}
|
||||
case CL_PERFCOUNTER_GPU_COUNTER_INDEX: {
|
||||
// Return the GPU counter index
|
||||
return info()->counterIndex_;
|
||||
}
|
||||
case CL_PERFCOUNTER_GPU_EVENT_INDEX: {
|
||||
// Return the GPU event index
|
||||
return info()->eventIndex_;
|
||||
}
|
||||
case CL_PERFCOUNTER_DATA: {
|
||||
Unimplemented();
|
||||
//gslCounter()->GetResult(gpu().cs(), reinterpret_cast<uint64*>(calRef_->results()));
|
||||
return calRef_->results()[index_];
|
||||
}
|
||||
default:
|
||||
LogError("Wrong PerfCounter::getInfo parameter");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,152 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALCOUNTERS_HPP_
|
||||
#define PALCOUNTERS_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "palPerfExperiment.h"
|
||||
|
||||
namespace pal {
|
||||
|
||||
class VirtualGPU;
|
||||
|
||||
class PalCounterReference : public amd::ReferenceCountedObject
|
||||
{
|
||||
public:
|
||||
static PalCounterReference* Create(
|
||||
VirtualGPU& gpu,
|
||||
const Pal::PerfExperimentCreateInfo& createInfo);
|
||||
|
||||
//! Default constructor
|
||||
PalCounterReference(
|
||||
VirtualGPU& gpu //!< Virtual GPU device object
|
||||
)
|
||||
: perfExp_(nullptr)
|
||||
, gpu_(gpu)
|
||||
, results_(nullptr) {}
|
||||
|
||||
//! Get PAL counter
|
||||
Pal::IPerfExperiment* iPerf() const { return perfExp_; }
|
||||
|
||||
//! Returns the virtual GPU device
|
||||
const VirtualGPU& gpu() const { return gpu_; }
|
||||
|
||||
//! Increases the results array for this PAL counter(container)
|
||||
bool growResultArray(
|
||||
uint maxIndex //!< the maximum HW counter index in the PAL counter
|
||||
);
|
||||
|
||||
void finalize() {
|
||||
iPerf()->Finalize();
|
||||
Pal::GlobalCounterLayout layout = {};
|
||||
layout.sampleCount = referenceCount() - 1;
|
||||
iPerf()->GetGlobalCounterLayout(&layout); }
|
||||
|
||||
//! Returns the PAL counter results
|
||||
uint64_t* results() const { return results_; }
|
||||
|
||||
Pal::IPerfExperiment* perfExp_; //!< PAL performance experiment object
|
||||
|
||||
protected:
|
||||
//! Default destructor
|
||||
~PalCounterReference();
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
PalCounterReference(const PalCounterReference&);
|
||||
|
||||
//! Disable operator=
|
||||
PalCounterReference& operator=(const PalCounterReference&);
|
||||
|
||||
VirtualGPU& gpu_; //!< The virtual GPU device object
|
||||
uint64_t* results_; //!< Counter results
|
||||
};
|
||||
|
||||
//! Performance counter implementation on GPU
|
||||
class PerfCounter : public device::PerfCounter
|
||||
{
|
||||
public:
|
||||
//! The performance counter info
|
||||
struct Info : public amd::EmbeddedObject
|
||||
{
|
||||
uint blockIndex_; //!< Index of the block to configure
|
||||
uint counterIndex_; //!< Index of the hardware counter
|
||||
uint eventIndex_; //!< Event you wish to count with the counter
|
||||
};
|
||||
|
||||
//! The PerfCounter flags
|
||||
enum Flags
|
||||
{
|
||||
BeginIssued = 0x00000001,
|
||||
EndIssued = 0x00000002,
|
||||
ResultReady = 0x00000004
|
||||
};
|
||||
|
||||
//! Constructor for the GPU PerfCounter object
|
||||
PerfCounter(
|
||||
const Device& device, //!< A GPU device object
|
||||
const VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
cl_uint blockIndex, //!< HW block index
|
||||
cl_uint counterIndex, //!< Counter index within the block
|
||||
cl_uint eventIndex) //!< Event index for profiling
|
||||
: gpuDevice_(device)
|
||||
, gpu_(gpu)
|
||||
, calRef_(NULL)
|
||||
, flags_(0)
|
||||
, counter_(0)
|
||||
, index_(0)
|
||||
{
|
||||
info_.blockIndex_ = blockIndex;
|
||||
info_.counterIndex_ = counterIndex;
|
||||
info_.eventIndex_ = eventIndex;
|
||||
}
|
||||
|
||||
//! Destructor for the GPU PerfCounter object
|
||||
virtual ~PerfCounter();
|
||||
|
||||
//! Creates the current object
|
||||
bool create(
|
||||
PalCounterReference* calRef //!< Reference counter
|
||||
);
|
||||
|
||||
//! Returns the specific information about the counter
|
||||
uint64_t getInfo(
|
||||
uint64_t infoType //!< The type of returned information
|
||||
) const;
|
||||
|
||||
//! Returns the GPU device, associated with the current object
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
//! Returns the virtual GPU device
|
||||
const VirtualGPU& gpu() const { return gpu_; }
|
||||
|
||||
//! Returns the CAL performance counter descriptor
|
||||
const Info* info() const { return &info_; }
|
||||
|
||||
//! Returns the Info structure for performance counter
|
||||
Pal::IPerfExperiment* iPerf() const { return counter_; }
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
PerfCounter(const PerfCounter&);
|
||||
|
||||
//! Disable default operator=
|
||||
PerfCounter& operator=(const PerfCounter&);
|
||||
|
||||
const Device& gpuDevice_; //!< The backend device
|
||||
const VirtualGPU& gpu_; //!< The virtual GPU device object
|
||||
|
||||
PalCounterReference* calRef_; //!< Reference counter
|
||||
uint flags_; //!< The perfcounter object state
|
||||
Info info_; //!< The info structure for perfcounter
|
||||
Pal::IPerfExperiment* counter_; //!< GSL counter object
|
||||
uint index_; //!< Counter index in the CAL container
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALCOUNTERS_HPP_
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALDEBGGER_H_
|
||||
#define PALDEBGGER_H_
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include "hsa.h"
|
||||
#include "amd_hsa_kernel_code.h"
|
||||
#include "device/device.hpp"
|
||||
#include "device/hwdebug.hpp"
|
||||
#include "acl.h"
|
||||
|
||||
static const int NumberReserveVgprs = 4;
|
||||
|
||||
namespace pal {
|
||||
|
||||
/**
|
||||
* \defgroup Services_API OCL Runtime Services API
|
||||
* @{
|
||||
*/
|
||||
|
||||
/*! \brief Dispatch packet information
 *
 *  Kernel dispatch details handed out to the tools; the fields are filled
 *  from the kernel code object (see GpuDebugManager::getPacketAmdInfo()).
 */
struct PacketAmdInfo
{
    uint32_t    trapReservedVgprIndex_;     //!< reserved VGPR index, -1 when they are not valid
    uint32_t    scratchBufferWaveOffset_;   //!< scratch buffer wave offset, -1 when no scratch buffer
    void*       pointerToIsaBuffer_;        //!< pointer to the buffer containing ISA
    size_t      sizeOfIsaBuffer_;           //!< size of the ISA buffer
    uint32_t    numberOfVgprs_;             //!< number of VGPRs used by the kernel
    uint32_t    numberOfSgprs_;             //!< number of SGPRs used by the kernel
    size_t      sizeOfStaticGroupMemory_;   //!< static local memory used by the kernel
};
|
||||
|
||||
/*! \brief Cache mask for invalidation
 *
 *  One bit per cache, accessible either bit by bit or as a whole 32-bit value.
 */
struct HwDbgGpuCacheMask
{
    //! Creates an empty mask (no cache selected)
    HwDbgGpuCacheMask()
        : ui32All_(0)
    {
    }

    //! Creates the mask from its raw 32-bit representation
    HwDbgGpuCacheMask(uint32_t mask)
        : ui32All_(mask)
    {
    }

    union {
        struct {
            uint32_t sqICache_  : 1;    //!< Instruction cache
            uint32_t sqKCache_  : 1;    //!< Data cache
            uint32_t tcL1_      : 1;    //!< tcL1 cache
            uint32_t tcL2_      : 1;    //!< tcL2 cache
            uint32_t reserved_  : 28;   //!< Unused bits
        };
        uint32_t ui32All_;              //!< All bits of the mask
    };
};
|
||||
|
||||
/*! \brief Address watch information
|
||||
*
|
||||
* Information about each watch point - address, mask, mode and event
|
||||
*/
|
||||
struct HwDbgAddressWatch
|
||||
{
|
||||
void* watchAddress_; //! The address of watch point
|
||||
uint64_t watchMask_; //! The mask for watch point (lower 24 bits)
|
||||
cl_dbg_address_watch_mode_amd watchMode_; //! The watch mode for this watch
|
||||
DebugEvent event_; //! Event of the watch point (not used for now)
|
||||
};
|
||||
|
||||
/*! \brief Runtime structure used to communicate debug information
|
||||
* between Ocl services and core for a kernel dispatch.
|
||||
*/
|
||||
struct DebugToolInfo
|
||||
{
|
||||
uint64_t scratchAddress_; //! Scratch memory address
|
||||
size_t scratchSize_; //! Scratch memory size
|
||||
uint64_t globalAddress_; //! Global memory address
|
||||
uint32_t cacheDisableMask_; //! Cache mask, indicating caches disabled
|
||||
uint32_t exceptionMask_; //! Exception mask
|
||||
uint32_t reservedCuNum_; //! Number of reserved CUs for display,
|
||||
//! which ranges from 0 to 7 in the current implementation.
|
||||
bool monitorMode_; //! Debug or profiler mode
|
||||
bool gpuSingleStepMode_; //! SQ debug mode
|
||||
amd::Memory* trapHandler_; //! Trap handler address
|
||||
amd::Memory* trapBuffer_; //! Trap buffer address
|
||||
bool sqPerfcounterEnable_; //! whether SQ perf counters are enabled
|
||||
aclBinary* aclBinary_; //! pointer of the kernel ACL binary
|
||||
amd::Event* event_; //! pointer of the kernel event in the enqueue command
|
||||
};
|
||||
|
||||
/*! \brief Message used by the KFD wave control for CI
 *
 *  Bit-packed location of a wave, consumed by the wave control function.
 */
struct HwDebugWaveAddr
{
    uint32_t VMID_  : 4;    //!< Virtual memory id
    uint32_t wave_  : 4;    //!< Wave id
    uint32_t SIMD_  : 2;    //!< SIMD id
    uint32_t CU_    : 4;    //!< Compute unit
    uint32_t SH_    : 1;    //!< Shader array
    uint32_t SE_    : 1;    //!< Shader engine
};
|
||||
|
||||
/*! \brief Kernel code information
|
||||
*
|
||||
* This structure contains the pointer of mapped kernel code for host access
|
||||
* and its size (in bytes)
|
||||
*/
|
||||
struct AqlCodeInfo
|
||||
{
|
||||
amd_kernel_code_t * aqlCode_; //! pointer of AQL code to allow host access
|
||||
uint32_t aqlCodeSize_; //! size of AQL code
|
||||
};
|
||||
|
||||
/**@}*/
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALDEBGGER_H_
|
||||
@@ -0,0 +1,412 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "platform/commandqueue.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "device/pal/palmemory.hpp"
|
||||
#include "device/pal/paltrap.hpp"
|
||||
#include "device/pal/paldebugmanager.hpp"
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
namespace pal {
|
||||
|
||||
class VirtualGPU;
|
||||
class Device;
|
||||
class Memory;
|
||||
|
||||
/*
|
||||
***************************************************************************
|
||||
* Implementation of GPU Debug Manager class
|
||||
***************************************************************************
|
||||
*/
|
||||
|
||||
//! Creates the debug manager in its unregistered default state
GpuDebugManager::GpuDebugManager(amd::Device* device)
    : HwDebugManager(device)
    , vGpu_(nullptr)
    , debugMessages_(0)
    , addressWatch_(nullptr)
    , addressWatchSize_(0)
    , oclEventHandle_(nullptr)
{
    // Default exception policy
    excpPolicy_.exceptionMask = 0x0;
    excpPolicy_.waveAction    = CL_DBG_WAVES_RESUME;
    excpPolicy_.hostAction    = CL_DBG_HOST_IGNORE;
    excpPolicy_.waveMode      = CL_DBG_WAVEMODE_BROADCAST;

    // Default kernel execution mode: all mode bits cleared
    execMode_.ui32All = 0;

    // No runtime trap handler/buffer yet
    rtTrapHandlerInfo_.trap_.trapHandler_ = nullptr;
    rtTrapHandlerInfo_.trap_.trapBuffer_  = nullptr;

    // No dispatch has been seen yet
    aqlPacket_ = nullptr;
}
|
||||
|
||||
//! Frees the address watch storage
GpuDebugManager::~GpuDebugManager()
{
    // delete [] is a no-op for a null pointer
    delete [] addressWatch_;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::executePreDispatchCallBack(void* aqlPacket,
|
||||
void* toolInfo)
|
||||
{
|
||||
DebugToolInfo* info = reinterpret_cast<DebugToolInfo*>(toolInfo);
|
||||
|
||||
aqlPacket_ = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
|
||||
Unimplemented();
|
||||
// Only if the pre-dispatch callback is set, will we update cache
|
||||
// flush configuration and build the memory descriptor.
|
||||
if (nullptr != preDispatchCallBackFunc_) {
|
||||
/*
|
||||
// Build the scratch memory descriptor
|
||||
device()->gslCtx()->BuildScratchBufferResource(debugInfo_.scratchMemoryDescriptor_,
|
||||
info->scratchAddress_,
|
||||
info->scratchSize_);
|
||||
|
||||
// Build the global memory descriptor
|
||||
device()->gslCtx()->BuildHeapBufferResource(debugInfo_.globalMemoryDescriptor_,
|
||||
info->globalAddress_);
|
||||
*/
|
||||
// // for invalidate cache (BuildEndOfKernelNotifyCommands)
|
||||
// aqlPacket->release_fence_scope = 2;
|
||||
|
||||
aclBinary_ = reinterpret_cast<void*>(info->aclBinary_);
|
||||
oclEventHandle_ = reinterpret_cast<void*>(as_cl(info->event_));
|
||||
|
||||
cl_device_id clDeviceId = as_cl(device_);
|
||||
preDispatchCallBackFunc_(clDeviceId,
|
||||
oclEventHandle_,
|
||||
aqlPacket_,
|
||||
aclBinary_,
|
||||
preDispatchCallBackArgs_);
|
||||
}
|
||||
|
||||
// setup the trap handler information only if the debugger has been registered
|
||||
if (isRegistered()) {
|
||||
// Copy the various info set by the debugger/profiler to the tool info structure
|
||||
setupTrapInformation(info);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::executePostDispatchCallBack()
|
||||
{
|
||||
if (nullptr != postDispatchCallBackFunc_) {
|
||||
cl_device_id clDeviceId = as_cl(device_);
|
||||
postDispatchCallBackFunc_(clDeviceId,
|
||||
aqlPacket_->completion_signal.handle,
|
||||
postDispatchCallBackArgs_);
|
||||
}
|
||||
}
|
||||
|
||||
//! Map the kernel code for host access
|
||||
void
|
||||
GpuDebugManager::mapKernelCode(void* aqlCodeInfo) const
|
||||
{
|
||||
AqlCodeInfo* codeInfo = reinterpret_cast<AqlCodeInfo*>(aqlCodeInfo);
|
||||
|
||||
codeInfo->aqlCode_ = reinterpret_cast<amd_kernel_code_t*>(aqlCodeAddr_);
|
||||
codeInfo->aqlCodeSize_ = aqlCodeSize_;
|
||||
}
|
||||
|
||||
//! Registers the debugger for the given context. The first registration
//! stores the message storage and creates the runtime trap handler.
//! \return CL_SUCCESS on success, CL_DEBUGGER_REGISTER_FAILURE_AMD if HW
//!         debug is disabled in the device settings, CL_OUT_OF_RESOURCES if
//!         the runtime trap handler couldn't be created
cl_int
GpuDebugManager::registerDebugger(amd::Context* context, uintptr_t messageStorage)
{
    if (!device()->settings().enableHwDebug_) {
        LogError("debugmanager: Register debugger error - HW DEBUG is not enable");
        return CL_DEBUGGER_REGISTER_FAILURE_AMD;
    }

    // first time register - set the message storage, flush queue and enable hw debug
    if (!isRegistered()) {
        debugMessages_ = messageStorage;
        Unimplemented();
        /*
        if (!device()->gslCtx()->registerHwDebugger(debugMessages_)) {
            LogError("debugmanager: Register debugger failed");
            return CL_OUT_OF_RESOURCES;
        }
        */
        isRegistered_ = true;

        if (CL_SUCCESS != createRuntimeTrapHandler()) {
            LogError("debugmanager: Create runtime trap handler failed");
            // Roll the registration back. Otherwise the manager would stay
            // "registered" without a trap handler and a second call would
            // report success without creating one.
            // NOTE(review): a retry allocates new TBA/TMA objects; confirm
            // the owner of runtimeTBA_/runtimeTMA_ frees the earlier ones.
            isRegistered_ = false;
            debugMessages_ = 0;
            return CL_OUT_OF_RESOURCES;
        }
    }

    context_ = context;

    return CL_SUCCESS;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::unregisterDebugger()
|
||||
{
|
||||
if (isRegistered()) {
|
||||
// reset the debugger registration flag
|
||||
isRegistered_ = false;
|
||||
context_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::flushCache(uint32_t mask)
|
||||
{
|
||||
HwDbgGpuCacheMask cacheMask(mask);
|
||||
device()->xferQueue()->flushCuCaches(cacheMask);
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
GpuDebugManager::setupTrapInformation(DebugToolInfo* toolInfo)
|
||||
{
|
||||
toolInfo->scratchAddress_ = 0;
|
||||
toolInfo->scratchSize_ = 0;
|
||||
toolInfo->globalAddress_ = 0;
|
||||
toolInfo->sqPerfcounterEnable_ = false;
|
||||
|
||||
// Set up trap related info in the kernel info structure to be
|
||||
// used in the kernel dispatch.
|
||||
toolInfo->exceptionMask_ = excpPolicy_.exceptionMask;
|
||||
toolInfo->gpuSingleStepMode_ = execMode_.gpuSingleStepMode;
|
||||
toolInfo->monitorMode_ = execMode_.monitorMode;
|
||||
|
||||
// The order of these three bits is determined by the definition
|
||||
// of the register COMPUTE_DISPATCH_INITIATOR
|
||||
toolInfo->cacheDisableMask_ = ((execMode_.disableL1Scalar << 2)
|
||||
| (execMode_.disableL2Cache << 1)
|
||||
| (execMode_.disableL1Vector));
|
||||
|
||||
toolInfo->reservedCuNum_ = execMode_.reservedCuNum;
|
||||
|
||||
toolInfo->trapHandler_ = rtTrapInfo_[kDebugTrapHandlerLocation];
|
||||
toolInfo->trapBuffer_ = rtTrapInfo_[kDebugTrapBufferLocation];
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::getPacketAmdInfo(
|
||||
const void* aqlCodeInfo,
|
||||
void* packetInfo) const
|
||||
|
||||
{
|
||||
const AqlCodeInfo* codeInfo =
|
||||
reinterpret_cast<const AqlCodeInfo*>(aqlCodeInfo);
|
||||
|
||||
const amd_kernel_code_t* hostAqlCode = codeInfo->aqlCode_;
|
||||
|
||||
PacketAmdInfo* packet =
|
||||
reinterpret_cast<PacketAmdInfo*>(packetInfo);
|
||||
|
||||
const amd_kernel_code_t* akc = hostAqlCode;
|
||||
|
||||
packet->numberOfSgprs_ = akc->wavefront_sgpr_count;
|
||||
packet->numberOfVgprs_ = akc->workitem_vgpr_count;
|
||||
|
||||
// use mapped kernel_object_address for host accessing of ISA buffer
|
||||
packet->pointerToIsaBuffer_ = (char*) (hostAqlCode) +
|
||||
akc->kernel_code_entry_byte_offset;
|
||||
|
||||
packet->scratchBufferWaveOffset_ =
|
||||
akc->debug_wavefront_private_segment_offset_sgpr;
|
||||
|
||||
packet->sizeOfIsaBuffer_ = codeInfo->aqlCodeSize_;
|
||||
|
||||
packet->sizeOfStaticGroupMemory_ = akc->workgroup_group_segment_byte_size;
|
||||
|
||||
// The trap_reserved_vgpr_index will be 4 less the original
|
||||
// This value must be used only by the debugger
|
||||
packet->trapReservedVgprIndex_ = akc->workitem_vgpr_count - NumberReserveVgprs;
|
||||
}
|
||||
|
||||
//! Creates the debug (shader) event.
//! Not ported to PAL yet: reports Unimplemented() and returns 0. The former
//! GSL implementation is kept below for reference.
DebugEvent
GpuDebugManager::createDebugEvent(
    const bool autoReset)
{
    Unimplemented();
    /*
    // create the event object
    osEventHandle shaderEvent = osEventCreate(!autoReset);

    // event object has been created, set the initial state
    if (shaderEvent != 0) {

        osEventReset(shaderEvent);  // initial state is non-signaled

        if (device()->gslCtx()->exceptionNotification(shaderEvent)) {
            return shaderEvent;
        }
    }
    */
    return 0;
}
|
||||
|
||||
//! Waits for the debug event.
//! Not ported to PAL yet: reports Unimplemented() and always returns
//! CL_SUCCESS. The former GSL implementation is kept below for reference.
cl_int
GpuDebugManager::waitDebugEvent(
    DebugEvent pEvent,
    uint32_t   timeOut) const
{
    Unimplemented();
    /*
    if (osEventTimedWait(pEvent, timeOut)) {
        return CL_SUCCESS;
    }
    else {
        return CL_EVENT_TIMEOUT_AMD;
    }
    */
    return CL_SUCCESS;
}
|
||||
|
||||
void
|
||||
GpuDebugManager::destroyDebugEvent(DebugEvent* pEvent)
|
||||
{
|
||||
Unimplemented();
|
||||
/*
|
||||
osEventDestroy(*pEvent);
|
||||
*pEvent = 0;
|
||||
|
||||
device()->gslCtx()->exceptionNotification(0);
|
||||
*/
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::wavefrontControl(
|
||||
uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
uint32_t trapId,
|
||||
void* waveAddr) const
|
||||
{
|
||||
Unimplemented();
|
||||
//device()->gslCtx()->executeSqCommand(waveAction, waveMode, trapId, waveAddr);
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::setAddressWatch(
|
||||
uint32_t numWatchPoints,
|
||||
void** watchAddress,
|
||||
uint64_t* watchMask,
|
||||
uint64_t* watchMode,
|
||||
DebugEvent* event)
|
||||
{
|
||||
size_t requiredSize = numWatchPoints * sizeof(HwDbgAddressWatch);
|
||||
|
||||
// previously allocated size is not big enough, allocate new memory
|
||||
if (addressWatchSize_ < requiredSize) {
|
||||
if (nullptr != addressWatch_) { // free the smaller address watch storage
|
||||
delete [] addressWatch_;
|
||||
}
|
||||
addressWatch_ = new HwDbgAddressWatch[numWatchPoints];
|
||||
addressWatchSize_ = requiredSize;
|
||||
}
|
||||
|
||||
// fill in the address watch structure
|
||||
memset(addressWatch_, 0, addressWatchSize_);
|
||||
|
||||
for (uint32_t i = 0; i < numWatchPoints; i++)
|
||||
{
|
||||
amd::Memory* watchMem = as_amd(reinterpret_cast<cl_mem>(watchAddress[i]));
|
||||
Memory* watchMemAddress = device()->getGpuMemory(watchMem);
|
||||
|
||||
addressWatch_[i].watchAddress_ = reinterpret_cast<void*>(watchMemAddress->vmAddress());
|
||||
addressWatch_[i].watchMask_ = watchMask[i];
|
||||
addressWatch_[i].watchMode_ = (cl_dbg_address_watch_mode_amd) watchMode[i];
|
||||
addressWatch_[i].event_ = (0 != event) ? event[i] : 0;
|
||||
}
|
||||
|
||||
Unimplemented();
|
||||
// setup the watch addresses
|
||||
//device()->gslCtx()->setAddressWatch(numWatchPoints, (void*) addressWatch_);
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
GpuDebugManager::setGlobalMemory(
|
||||
amd::Memory* memObj,
|
||||
uint32_t offset,
|
||||
void* srcPtr,
|
||||
uint32_t size)
|
||||
{
|
||||
Memory* globalMem = device()->getGpuMemory(memObj);
|
||||
|
||||
address mappedMem = static_cast<address>(globalMem->map(nullptr,0));
|
||||
assert(mappedMem != 0);
|
||||
|
||||
void* dest_ptr = reinterpret_cast<void*>(mappedMem + offset);
|
||||
memcpy(dest_ptr, srcPtr, size);
|
||||
|
||||
globalMem->unmap(nullptr);
|
||||
}
|
||||
|
||||
//! Allocates the runtime trap handler (TBA) and trap buffer (TMA) memory
//! and uploads the trap handler code that matches the device generation.
//! \return CL_SUCCESS on success, CL_OUT_OF_RESOURCES if a memory object
//!         couldn't be created or mapped, CL_INVALID_VALUE if the
//!         allocations aren't 256-byte aligned
cl_int
GpuDebugManager::createRuntimeTrapHandler()
{
    size_t          codeSize = 0;
    const uint32_t* rtTrapCode = nullptr;

    if (device()->settings().viPlus_) {
        codeSize = sizeof(RuntimeTrapCodeVi);
        rtTrapCode = RuntimeTrapCodeVi;
    }
    else {
        codeSize = sizeof(RuntimeTrapCode);
        rtTrapCode = RuntimeTrapCode;
    }

    uint32_t numCodes = codeSize / sizeof(uint32_t);

    // Handle TMA corruption hw bug workaround -
    // The trap handler buffer has extra 256 bytes allocated, the TMA address
    // is stored in the first two DWORDs and the actual trap handler code
    // is stored starting at the location of 256 bytes (TbaStartOffset).
    //
    // allocate memory for the runtime trap handler (TBA) + TMA address
    uint32_t allocSize = static_cast<uint32_t>(codeSize + TbaStartOffset);

    Memory* rtTBA = new Memory(*device(), allocSize);
    runtimeTBA_ = rtTBA;

    if ((rtTBA == nullptr) || !rtTBA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    // allocate buffer for the runtime trap handler buffer (TMA)
    uint32_t tmaSize = 0x100;
    Memory* rtTMA = new Memory(*device(), tmaSize);
    runtimeTMA_ = rtTMA;

    if ((rtTMA == nullptr) || !rtTMA->create(Resource::RemoteUSWC)) {
        return CL_OUT_OF_RESOURCES;
    }

    uint64_t rtTmaAddress = rtTMA->vmAddress();
    if ((rtTBA->vmAddress() & 0xFF) != 0 || (rtTmaAddress & 0xFF) != 0) {
        LogError("debugmanager: Trap handler/buffer is not 256-byte aligned");
        return CL_INVALID_VALUE;
    }

    // Map the trap handler buffer only after all checks have passed, so
    // that no error path above returns with the buffer still mapped
    address tbaAddress = reinterpret_cast<address>(rtTBA->map(nullptr));
    if (tbaAddress == nullptr) {
        LogError("debugmanager: Mapping of the trap handler buffer failed");
        return CL_OUT_OF_RESOURCES;
    }

    // store the TMA address at the beginning of trap handler buffer
    uint64_t* tbaStorage = reinterpret_cast<uint64_t*>(tbaAddress);
    tbaStorage[0] = rtTmaAddress;

    // save the trap handler code
    uint32_t* trapHandlerPtr = reinterpret_cast<uint32_t*>(tbaAddress + TbaStartOffset);
    for (uint32_t i = 0; i < numCodes; i++) {
        trapHandlerPtr[i] = rtTrapCode[i];
    }

    rtTBA->unmap(nullptr);

    return CL_SUCCESS;
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,117 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALDEBUGMANAGER_H__
|
||||
#define PALDEBUGMANAGER_H__
|
||||
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "device/pal/paldebugger.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
class GpuDebugManager;
|
||||
class Device;
|
||||
class Memory;
|
||||
|
||||
|
||||
/*! \brief Debug Manager Class
|
||||
*
|
||||
* The debug manager class is used to pass all the trap info to the
|
||||
* kernel dispatch and then the kernel execution can use such trap information
|
||||
* for kernel execution. This class contains the trap handler and shader event
|
||||
* objects. The trap handler is setup by users and passed to the kernel dispatch.
|
||||
* The shader event is to receive interrupts from the GPU and then users can
|
||||
* perform various operations.
|
||||
*
|
||||
* This class also provides the interface for setting up the pre-dispatch
|
||||
* callback functions used by the profiler and debugger. It also provides
|
||||
* a way to retrieve various debug information for the kernel execution.
|
||||
*
|
||||
*/
|
||||
class GpuDebugManager : public amd::HwDebugManager {
|
||||
public:
|
||||
|
||||
//! Constructor of the debug manager class
|
||||
GpuDebugManager(amd::Device* device);
|
||||
|
||||
//! Destructor of the debug manager class
|
||||
~GpuDebugManager();
|
||||
|
||||
//! Get the single instance of the GpuDebugManager class
|
||||
static GpuDebugManager* getDefaultInstance();
|
||||
|
||||
//! Destroy the GpuDebugManager class object
|
||||
static void destroyInstances();
|
||||
|
||||
//! Flush cache
|
||||
void flushCache(uint32_t mask);
|
||||
|
||||
//! Create the debug event
|
||||
DebugEvent createDebugEvent(const bool autoReset);
|
||||
|
||||
//! Wait for the debug event
|
||||
cl_int waitDebugEvent(DebugEvent pEvent, uint32_t timeOut) const;
|
||||
|
||||
//! Destroy the debug event
|
||||
void destroyDebugEvent(DebugEvent* pEvent);
|
||||
|
||||
//! Register the debugger
|
||||
cl_int registerDebugger(amd::Context*context, uintptr_t messageStorage);
|
||||
|
||||
//! Unregister the debugger
|
||||
void unregisterDebugger();
|
||||
|
||||
//! Send the wavefront control cmmand
|
||||
void wavefrontControl(uint32_t waveAction,
|
||||
uint32_t waveMode,
|
||||
uint32_t trapId,
|
||||
void* waveAddr) const;
|
||||
|
||||
//! Set address watching point
|
||||
void setAddressWatch(uint32_t numWatchPoints,
|
||||
void** watchAddress,
|
||||
uint64_t* watchMask,
|
||||
uint64_t* watchMode,
|
||||
DebugEvent* pEvent);
|
||||
|
||||
//! Map the kernel code for host access
|
||||
void mapKernelCode(void* aqlCodeInfo) const;
|
||||
|
||||
//! Get the packet information for dispatch
|
||||
void getPacketAmdInfo(const void* aqlCodeInfo, void* packetInfo) const;
|
||||
|
||||
//! Set global memory values
|
||||
void setGlobalMemory(amd::Memory* memObj, uint32_t offset, void* srcPtr, uint32_t size);
|
||||
|
||||
//! Execute the post-dispatch callback function
|
||||
void executePostDispatchCallBack();
|
||||
|
||||
//! Execute the pre-dispatch callback function
|
||||
void executePreDispatchCallBack(void* aqlPacket,
|
||||
void* toolInfo);
|
||||
|
||||
protected:
|
||||
const VirtualGPU* vGpu() const { return vGpu_; }
|
||||
|
||||
private:
|
||||
//! Setup trap handler info for kernel execution
|
||||
void setupTrapInformation(DebugToolInfo* toolInfo);
|
||||
|
||||
//! Create runtime trap handler
|
||||
cl_int createRuntimeTrapHandler();
|
||||
|
||||
const pal::Device* device() const {
|
||||
return reinterpret_cast<const pal::Device *>(device_); }
|
||||
|
||||
VirtualGPU* vGpu_; //!< the virtual GPU
|
||||
uintptr_t debugMessages_; //!< Pointer to a SHARED_DEBUG_MESSAGES pass to the KMD
|
||||
HwDbgAddressWatch* addressWatch_; //!< Address watch data
|
||||
size_t addressWatchSize_; //!< Size of address watch data
|
||||
//! Arguments used by the callback function
|
||||
void* oclEventHandle_; //!< event handler
|
||||
const hsa_kernel_dispatch_packet_t* aqlPacket_; //!< AQL packet
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALDEBUGMANAGER_H__
|
||||
@@ -0,0 +1,584 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALDEFS_HPP_
|
||||
#define PALDEFS_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "pal.h"
|
||||
#include "palGpuMemory.h"
|
||||
#include "palImage.h"
|
||||
#include "palFormatInfo.h"
|
||||
|
||||
//! Memory object type.
//! Enumerators are numbered consecutively starting from PAL_DEPTH_BUFFER = 0.
enum PalGpuMemoryType {
    PAL_DEPTH_BUFFER = 0,   ///< Depth Buffer
    PAL_BUFFER,             ///< Pure buffer
    PAL_TEXTURE_3D,         ///< 3D texture
    PAL_TEXTURE_2D,         ///< 2D texture
    PAL_TEXTURE_1D,         ///< 1D texture
    PAL_TEXTURE_1D_ARRAY,   ///< 1D Array texture
    PAL_TEXTURE_2D_ARRAY,   ///< 2D Array texture
    PAL_TEXTURE_BUFFER,     ///< "buffer" texture inside VBO
};
|
||||
|
||||
//! Per-dispatch kernel information handed to the HW debugger.
//! Plain aggregate: no constructor is declared, so members are only
//! zeroed when the object is value-initialized by the caller.
struct HwDbgKernelInfo
{
    uint64_t    scratchBufAddr;             ///< Handle of GPU local memory for kernel private scratch space
    size_t      scratchBufferSizeInBytes;   ///< Size in bytes of the scratch buffer
                                            ///< (the original comment called it pScratchBuffer;
                                            ///< presumably scratchBufAddr - confirm)
    uint64_t    heapBufAddr;                ///< Address of the global heap base
    const void* pAqlDispatchPacket;         ///< Pointer to the dispatch packet
    const void* pAqlQueuePtr;               ///< Pointer to the AQL Queue
    void*       trapHandler;                ///< Address of the trap handler (TBA)
    void*       trapHandlerBuffer;          ///< Address of the trap handler buffer (TMA)
    uint32_t    excpEn;                     ///< Exception mask
    bool        trapPresent;                ///< Trap present flag
    bool        sqDebugMode;                ///< Debug mode flag (GPU single step mode)
    uint32_t    mgmtSe0Mask;                ///< Mask for SE0 (reserving CU for display)
    uint32_t    mgmtSe1Mask;                ///< Mask for SE1 (reserving CU for display)
    uint32_t    cacheDisableMask;           ///< Cache disable mask
};
|
||||
|
||||
//! Engine types
enum EngineType
{
    MainEngine = 0,
    SdmaEngine,
    AllEngines
};

//! GPU event: an event id paired with the engine type it belongs to.
//! A default-constructed event is invalid until an id is assigned.
struct GpuEvent
{
    //! Sentinel id that marks an event as invalid (2^30 - 1)
    static const unsigned int InvalidID = ((1 << 30) - 1);

    EngineType      engineId_;  ///< type of the id
    unsigned int    id;         ///< actual event id

    //! GPU event default constructor: main engine, invalid id
    GpuEvent(): engineId_(MainEngine), id(InvalidID) {}

    //! Returns true if the current event is valid, i.e. id differs from InvalidID
    bool isValid() const { return id != InvalidID; }

    //! Set invalid event id; engineId_ is left untouched
    void invalidate() { id = InvalidID; }
};
|
||||
|
||||
/*! \addtogroup PAL
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Device Implementation
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Maximum number of the supported global atomic counters
|
||||
const static uint MaxAtomicCounters = 8;
|
||||
//! Maximum number of the supported samplers
|
||||
const static uint MaxSamplers = 16;
|
||||
//! Maximum number of supported read images
|
||||
const static uint MaxReadImage = 128;
|
||||
//! Maximum number of supported write images
|
||||
const static uint MaxWriteImage = 8;
|
||||
//! Maximum number of supported read/write images for OCL20
|
||||
const static uint MaxReadWriteImage = 64;
|
||||
//! Maximum number of supported constant arguments
|
||||
const static uint MaxConstArguments = 8;
|
||||
//! Maximum number of supported kernel UAV arguments
|
||||
const static uint MaxUavArguments = 1024;
|
||||
//! Maximum number of pixels for a 1D image created from a buffer
|
||||
const static size_t MaxImageBufferSize = 65536;
|
||||
//! Maximum number of pixels for a 1D image created from a buffer
|
||||
const static size_t MaxImageArraySize = 2048;
|
||||
|
||||
//! Maximum number of supported constant buffers
|
||||
const static uint MaxConstBuffers = MaxConstArguments + 8;
|
||||
|
||||
//! Maximum number of constant buffers for arguments
|
||||
const static uint MaxConstBuffersArguments = 2;
|
||||
|
||||
//! Alignment restriciton for the pinned memory
|
||||
const static size_t PinnedMemoryAlignment = 4 * Ki;
|
||||
|
||||
//! HSA path specific defines for images
|
||||
const static uint HsaImageObjectSize = 48;
|
||||
const static uint HsaImageObjectAlignment = 16;
|
||||
const static uint HsaSamplerObjectSize = 32;
|
||||
const static uint HsaSamplerObjectAlignment = 16;
|
||||
|
||||
//! HSA path specific defines for images
|
||||
const static uint DeviceQueueMaskSize = 32;
|
||||
|
||||
struct AMDDeviceInfo {
|
||||
const char* targetName_; //!< Target name
|
||||
const char* machineTarget_; //!< Machine target
|
||||
uint simdPerCU_; //!< Number of SIMDs per CU
|
||||
uint simdWidth_; //!< Number of workitems processed per SIMD
|
||||
uint simdInstructionWidth_; //!< Number of instructions processed per SIMD
|
||||
uint memChannelBankWidth_; //!< Memory channel bank width
|
||||
uint localMemSizePerCU_; //!< Local memory size per CU
|
||||
uint localMemBanks_; //!< Number of banks of local memory
|
||||
uint gfxipVersion_; //!< The core engine GFXIP version
|
||||
};
|
||||
|
||||
//! Per-ASIC device information table.
//! Columns follow the AMDDeviceInfo field order: target name, machine target,
//! SIMDs per CU, SIMD width, SIMD instruction width, memory channel bank
//! width, local memory per CU, local memory banks, GFXIP version.
//! NOTE(review): how this table is indexed is not visible in this header -
//! presumably by an ASIC enumeration in the order of the row labels; confirm.
static const AMDDeviceInfo DeviceInfo[] = {
    /* Unknown   */ { "",        "unknown",  4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Tahiti    */ { "",        "tahiti",   4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Pitcairn  */ { "",        "pitcairn", 4, 16, 1, 256, 64 * Ki, 32, 702 },
    // NOTE(review): this row is labelled Capeverde but its machine target is
    // "bonaire", the same as the Bonaire row below - confirm this is intended.
    /* Capeverde */ { "",        "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Oland     */ { "",        "oland",    4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Hainan    */ { "",        "hainan",   4, 16, 1, 256, 64 * Ki, 32, 702 },

    /* Bonaire   */ { "Bonaire", "bonaire",  4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Hawaii    */ { "Hawaii",  "hawaii",   4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Kalindi   */ { "Kalindi", "kalindi",  4, 16, 1, 256, 64 * Ki, 32, 702 },
    /* Spectre   */ { "Spectre", "spectre",  4, 16, 1, 256, 64 * Ki, 32, 701 },

    /* Carrizo   */ { "Carrizo", "carrizo",  4, 16, 1, 256, 64 * Ki, 32, 800 },
    /* Stoney    */ { "Stoney",  "stoney",   4, 16, 1, 256, 64 * Ki, 32, 800 },

    /* Iceland   */ { "Iceland", "iceland",  4, 16, 1, 256, 64 * Ki, 32, 800 },
    /* Tonga     */ { "Tonga",   "tonga",    4, 16, 1, 256, 64 * Ki, 32, 800 },
    /* Fiji      */ { "Fiji",    "fiji",     4, 16, 1, 256, 64 * Ki, 32, 800 },
    /* Ellesmere */ { "Horse",   "horse",    4, 16, 1, 256, 64 * Ki, 32, 800 },
    /* Buffin    */ { "Goose",   "goose",    4, 16, 1, 256, 64 * Ki, 32, 800 },
};
|
||||
|
||||
//! Target name strings of the form "AMD:AMDGPU:<major>:<minor>:<stepping>".
//! The pointers themselves are const so that a translation unit including
//! this header cannot accidentally re-point one of the names.
static const char* const Gfx700 = "AMD:AMDGPU:7:0:0";
static const char* const Gfx701 = "AMD:AMDGPU:7:0:1";
static const char* const Gfx800 = "AMD:AMDGPU:8:0:0";
static const char* const Gfx801 = "AMD:AMDGPU:8:0:1";
static const char* const Gfx804 = "AMD:AMDGPU:8:0:4";
static const char* const Gfx810 = "AMD:AMDGPU:8:1:0";
static const char* const Gfx900 = "AMD:AMDGPU:9:0:0";
static const char* const Gfx901 = "AMD:AMDGPU:9:0:1";
|
||||
|
||||
//! Supported OpenCL versions.
//! Enumerators are implicitly numbered from 0, so a later version always
//! compares greater than an earlier one.
enum OclVersion {
    OpenCL10,   //!< OpenCL 1.0
    OpenCL11,   //!< OpenCL 1.1
    OpenCL12,   //!< OpenCL 1.2
    OpenCL20    //!< OpenCL 2.0
};
|
||||
|
||||
struct MemoryFormat {
|
||||
cl_image_format clFormat_; //!< CL image format
|
||||
Pal::Format palFormat_; //!< PAL image format
|
||||
Pal::ChannelMapping palChannel_;//!< PAL channel mapping
|
||||
};
|
||||
|
||||
static const MemoryFormat
|
||||
MemoryFormatMap[] = {
|
||||
// R
|
||||
{ { CL_R, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_R, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_R, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_SIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_SIGNED_INT32 },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_UNSIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_UNSIGNED_INT32 },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_R, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_R, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
// A
|
||||
{ { CL_A, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
|
||||
{ { CL_A, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
|
||||
{ { CL_A, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_SIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_SIGNED_INT32},
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_UNSIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_UNSIGNED_INT32},
|
||||
{ Pal::ChFmt::R32 , Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
|
||||
{ { CL_A, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_A, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::Zero,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::R } },
|
||||
|
||||
// RG
|
||||
{ { CL_RG, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16G16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_RG, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16G16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_RG, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_SIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16G16, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_SIGNED_INT32},
|
||||
{ Pal::ChFmt::R32G32, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_UNSIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16G16, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_UNSIGNED_INT32},
|
||||
{ Pal::ChFmt::R32G32, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
|
||||
{ { CL_RG, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16G16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RG, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32G32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::Zero, Pal::ChannelSwizzle::One } },
|
||||
/*
|
||||
// RA
|
||||
{ { CL_RA, CL_UNORM_INT8 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8 } },
|
||||
{ { CL_RA, CL_UNORM_INT16 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16 } },
|
||||
|
||||
{ { CL_RA, CL_SNORM_INT8 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8 } },
|
||||
{ { CL_RA, CL_SNORM_INT16 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sUV16 } },
|
||||
|
||||
{ { CL_RA, CL_SIGNED_INT8 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG8I } },
|
||||
{ { CL_RA, CL_SIGNED_INT16 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG16I } },
|
||||
{ { CL_RA, CL_SIGNED_INT32},
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_sRG32I } },
|
||||
{ { CL_RA, CL_UNSIGNED_INT8 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG8I } },
|
||||
{ { CL_RA, CL_UNSIGNED_INT16 },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16I } },
|
||||
{ { CL_RA, CL_UNSIGNED_INT32},
|
||||
{ GSL_CHANNEL_ORDER_RA , CM_SURF_FMT_RG32I } },
|
||||
|
||||
{ { CL_RA, CL_HALF_FLOAT },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG16F } },
|
||||
{ { CL_RA, CL_FLOAT },
|
||||
{ GSL_CHANNEL_ORDER_RA, CM_SURF_FMT_RG32F } },
|
||||
*/
|
||||
// RGB
|
||||
{ { CL_RGB, CL_UNORM_INT_101010 },
|
||||
{ Pal::ChFmt::R10G10B10A2, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_RGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
|
||||
|
||||
// RGBA
|
||||
{ { CL_RGBA, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
|
||||
{ { CL_RGBA, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
|
||||
{ { CL_RGBA, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_SIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_SIGNED_INT32 },
|
||||
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_UNSIGNED_INT16 },
|
||||
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_UNSIGNED_INT32},
|
||||
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
|
||||
{ { CL_RGBA, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16G16B16A16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_RGBA, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32G32B32A32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
|
||||
// ARGB
|
||||
{ { CL_ARGB, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
|
||||
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_ARGB, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
|
||||
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_ARGB, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
|
||||
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_ARGB, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::G, Pal::ChannelSwizzle::B,
|
||||
Pal::ChannelSwizzle::A, Pal::ChannelSwizzle::R } },
|
||||
|
||||
// BGRA
|
||||
{ { CL_BGRA, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_BGRA, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_BGRA, CL_SIGNED_INT8 },
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Sint },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_BGRA, CL_UNSIGNED_INT8 },
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
|
||||
// LUMINANCE
|
||||
{ { CL_LUMINANCE, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_LUMINANCE, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_LUMINANCE, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_LUMINANCE, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_LUMINANCE, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_LUMINANCE, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::One } },
|
||||
|
||||
// INTENSITY
|
||||
{ { CL_INTENSITY, CL_SNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_INTENSITY, CL_SNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Snorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_INTENSITY, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_INTENSITY, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_INTENSITY, CL_HALF_FLOAT },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_INTENSITY, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
|
||||
// sRGBA
|
||||
{ { CL_sRGBA, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_sRGBA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::A } },
|
||||
|
||||
// sRGB
|
||||
{ { CL_sRGB, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_sRGB, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
|
||||
|
||||
// sRGBx
|
||||
{ { CL_sRGBx, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Srgb },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
|
||||
{ { CL_sRGBx, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R8G8B8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::One } },
|
||||
|
||||
// sBGRA
|
||||
{ { CL_sBGRA, CL_UNORM_INT8 },
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Srgb },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
{ { CL_sBGRA, CL_UNSIGNED_INT8 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::B8G8R8A8, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::B, Pal::ChannelSwizzle::G,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::A } },
|
||||
|
||||
// DEPTH
|
||||
{ { CL_DEPTH, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_DEPTH, CL_UNSIGNED_INT32 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
|
||||
{ { CL_DEPTH, CL_UNORM_INT16 },
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_DEPTH, CL_UNSIGNED_INT16 }, // This is used only by blit kernel
|
||||
{ Pal::ChFmt::R16, Pal::NumFmt::Uint },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
|
||||
{ { CL_DEPTH_STENCIL, CL_UNORM_INT24 },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Unorm },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } },
|
||||
{ { CL_DEPTH_STENCIL, CL_FLOAT },
|
||||
{ Pal::ChFmt::R32, Pal::NumFmt::Float },
|
||||
{ Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R,
|
||||
Pal::ChannelSwizzle::R, Pal::ChannelSwizzle::R } }
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALDEFS_HPP_
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -0,0 +1,598 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALDEVICE_HPP_
|
||||
#define PALDEVICE_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "platform/command.hpp"
|
||||
#include "platform/program.hpp"
|
||||
#include "platform/perfctr.hpp"
|
||||
#include "platform/threadtrace.hpp"
|
||||
#include "platform/memory.hpp"
|
||||
#include "utils/concurrent.hpp"
|
||||
#include "thread/thread.hpp"
|
||||
#include "thread/monitor.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "device/pal/palmemory.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palsettings.hpp"
|
||||
#include "device/pal/palappprofile.hpp"
|
||||
#include "acl.h"
|
||||
#include "memory"
|
||||
|
||||
|
||||
/*! \addtogroup PAL
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
//! A nil device object
|
||||
class NullDevice : public amd::Device
|
||||
{
|
||||
protected:
|
||||
static aclCompiler* compiler_;
|
||||
public:
|
||||
aclCompiler* compiler() const { return compiler_; }
|
||||
|
||||
public:
|
||||
static bool init(void);
|
||||
|
||||
//! Construct a new identifier
|
||||
NullDevice();
|
||||
|
||||
//! Creates an offline device with the specified target
|
||||
bool create(
|
||||
Pal::GfxIpLevel ipLevel //!< GPU ip level
|
||||
);
|
||||
|
||||
virtual cl_int createSubDevices(
|
||||
device::CreateSubDevicesInfo& create_info,
|
||||
cl_uint num_entries,
|
||||
cl_device_id* devices,
|
||||
cl_uint* num_devices) {
|
||||
return CL_INVALID_VALUE;
|
||||
}
|
||||
|
||||
//! Instantiate a new virtual device
|
||||
virtual device::VirtualDevice* createVirtualDevice(
|
||||
amd::CommandQueue* queue = NULL
|
||||
) { return NULL; }
|
||||
|
||||
//! Compile the given source code.
|
||||
virtual device::Program* createProgram(amd::option::Options* options = NULL);
|
||||
|
||||
//! Just returns NULL for the dummy device
|
||||
virtual device::Memory* createMemory(amd::Memory& owner) const { return NULL; }
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(
|
||||
const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
device::Sampler** sampler //!< device sampler object
|
||||
) const
|
||||
{
|
||||
ShouldNotReachHere();
|
||||
return true;
|
||||
}
|
||||
|
||||
//! Just returns NULL for the dummy device
|
||||
virtual device::Memory* createView(
|
||||
amd::Memory& owner, //!< Owner memory object
|
||||
const device::Memory& parent //!< Parent device memory object for the view
|
||||
) const { return NULL; }
|
||||
|
||||
//! Reallocates the provided buffer object
|
||||
virtual bool reallocMemory(amd::Memory& owner) const { return true; }
|
||||
|
||||
//! Acquire external graphics API object in the host thread
|
||||
//! Needed for OpenGL objects on CPU device
|
||||
|
||||
virtual bool bindExternalDevice(
|
||||
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
|
||||
virtual bool unbindExternalDevice(
|
||||
intptr_t type, void* pDevice, void* pContext, bool validateOnly) { return true; }
|
||||
|
||||
//! Releases non-blocking map target memory
|
||||
virtual void freeMapTarget(amd::Memory& mem, void* target) {}
|
||||
|
||||
Pal::GfxIpLevel ipLevel() const { return ipLevel_; }
|
||||
|
||||
const AMDDeviceInfo* hwInfo() const { return hwInfo_; }
|
||||
|
||||
//! Empty implementation on Null device
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const { return false; }
|
||||
|
||||
//! Get GPU device settings
|
||||
const pal::Settings& settings() const
|
||||
{ return reinterpret_cast<pal::Settings&>(*settings_); }
|
||||
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags, void* svmPtr) const { return NULL; }
|
||||
virtual void svmFree(void* ptr) const {return;}
|
||||
|
||||
protected:
|
||||
Pal::GfxIpLevel ipLevel_; //!< Device IP level
|
||||
const AMDDeviceInfo* hwInfo_; //!< Device HW info structure
|
||||
|
||||
//! Fills OpenCL device info structure
|
||||
void fillDeviceInfo(
|
||||
const Pal::DeviceProperties& palProp,//!< PAL device properties
|
||||
const Pal::GpuMemoryHeapProperties heaps[Pal::GpuHeapCount],
|
||||
size_t maxTextureSize, //!< Maximum texture size supported in HW
|
||||
uint numComputeRings //!< Number of compute rings
|
||||
);
|
||||
};
|
||||
|
||||
//! Forward declarations
|
||||
class Command;
|
||||
class Device;
|
||||
class GpuCommand;
|
||||
class Heap;
|
||||
class HeapBlock;
|
||||
class Program;
|
||||
class Kernel;
|
||||
class Memory;
|
||||
class Resource;
|
||||
class VirtualDevice;
|
||||
class PrintfDbg;
|
||||
class ThreadTrace;
|
||||
|
||||
#ifndef CL_FILTER_NONE
|
||||
#define CL_FILTER_NONE 0x1142
|
||||
#endif
|
||||
|
||||
class Sampler : public device::Sampler
|
||||
{
|
||||
public:
|
||||
//! Constructor
|
||||
Sampler(const Device& dev): dev_(dev) {}
|
||||
|
||||
//! Default destructor for the device memory object
|
||||
virtual ~Sampler();
|
||||
|
||||
//! Creates a device sampler from the OCL sampler state
|
||||
bool create(
|
||||
uint32_t oclSamplerState //!< OCL sampler state
|
||||
);
|
||||
|
||||
//! Creates a device sampler from the OCL sampler state
|
||||
bool create(
|
||||
const amd::Sampler& owner //!< AMD sampler object
|
||||
);
|
||||
|
||||
const void* hwState() const { return hwState_; }
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
Sampler& operator=(const Sampler&);
|
||||
|
||||
//! Disable operator=
|
||||
Sampler(const Sampler&);
|
||||
|
||||
const Device& dev_; //!< Device object associated with the sampler
|
||||
address hwState_; //!< GPU HW state (\todo legacy path)
|
||||
};
|
||||
|
||||
//! A GPU device ordinal (physical GPU device)
|
||||
class Device : public NullDevice
|
||||
{
|
||||
public:
|
||||
//! Locks any access to the virtual GPUs
|
||||
class ScopedLockVgpus : public amd::StackObject {
|
||||
public:
|
||||
//! Default constructor
|
||||
ScopedLockVgpus(const Device& dev);
|
||||
|
||||
//! Destructor
|
||||
~ScopedLockVgpus();
|
||||
|
||||
private:
|
||||
const Device& dev_; //! Device object
|
||||
};
|
||||
|
||||
//! Transfer buffers
|
||||
class XferBuffers : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
static const size_t MaxXferBufListSize = 8;
|
||||
|
||||
//! Default constructor
|
||||
XferBuffers(const Device& device, Resource::MemoryType type, size_t bufSize)
|
||||
: type_(type)
|
||||
, bufSize_(bufSize)
|
||||
, acquiredCnt_(0)
|
||||
, gpuDevice_(device)
|
||||
{}
|
||||
|
||||
//! Default destructor
|
||||
~XferBuffers();
|
||||
|
||||
//! Creates the xfer buffers object
|
||||
bool create();
|
||||
|
||||
//! Acquires an instance of the transfer buffers
|
||||
Memory& acquire();
|
||||
|
||||
//! Releases transfer buffer
|
||||
void release(
|
||||
VirtualGPU& gpu, //!< Virual GPU object used with the buffer
|
||||
Memory& buffer //!< Transfer buffer for release
|
||||
);
|
||||
|
||||
//! Returns the buffer's size for transfer
|
||||
size_t bufSize() const { return bufSize_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
XferBuffers(const XferBuffers&);
|
||||
|
||||
//! Disable assignment operator
|
||||
XferBuffers& operator=(const XferBuffers&);
|
||||
|
||||
//! Get device object
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
Resource::MemoryType type_; //!< The buffer's type
|
||||
size_t bufSize_; //!< Staged buffer size
|
||||
std::list<Memory*> freeBuffers_; //!< The list of free buffers
|
||||
amd::Atomic<uint> acquiredCnt_; //!< The total number of acquired buffers
|
||||
amd::Monitor lock_; //!< Stgaed buffer acquire/release lock
|
||||
const Device& gpuDevice_; //!< GPU device object
|
||||
};
|
||||
|
||||
//! Virtual address cache entry
|
||||
struct VACacheEntry : public amd::HeapObject
|
||||
{
|
||||
void* startAddress_; //!< Start virtual address
|
||||
void* endAddress_; //!< End virtual address
|
||||
Memory* memory_; //!< GPU memory, associated with the range
|
||||
|
||||
//! Constructor
|
||||
VACacheEntry(
|
||||
void* startAddress, //!< Start virtual address
|
||||
void* endAddress, //!< End virtual address
|
||||
Memory* memory //!< GPU memory object
|
||||
): startAddress_(startAddress), endAddress_(endAddress), memory_(memory) {}
|
||||
|
||||
private:
|
||||
//! Disable default constructor
|
||||
VACacheEntry();
|
||||
};
|
||||
|
||||
struct ScratchBuffer : public amd::HeapObject
|
||||
{
|
||||
uint regNum_; //!< The number of used scratch registers
|
||||
Memory* memObj_; //!< Memory objects for scratch buffers
|
||||
uint offset_; //!< Offset from the global scratch store
|
||||
uint size_; //!< Scratch buffer size on this queue
|
||||
|
||||
//! Default constructor
|
||||
ScratchBuffer(): regNum_(0), memObj_(NULL), offset_(0) {}
|
||||
|
||||
//! Default constructor
|
||||
~ScratchBuffer();
|
||||
|
||||
//! Destroys memory objects
|
||||
void destroyMemory();
|
||||
};
|
||||
|
||||
|
||||
class SrdManager : public amd::HeapObject {
|
||||
public:
|
||||
SrdManager(const Device& dev, uint srdSize, uint bufSize)
|
||||
: dev_(dev)
|
||||
, numFlags_(bufSize / (srdSize * MaskBits))
|
||||
, srdSize_(srdSize)
|
||||
, bufSize_(bufSize) {}
|
||||
~SrdManager();
|
||||
|
||||
//! Allocates a new SRD slot for a resource
|
||||
uint64_t allocSrdSlot(address* cpuAddr);
|
||||
|
||||
//! Frees a SRD slot
|
||||
void freeSrdSlot(uint64_t addr);
|
||||
|
||||
// Fills the memory list for VidMM KMD
|
||||
void fillResourceList(std::vector<const Memory*>& memList);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
SrdManager(const SrdManager&);
|
||||
|
||||
//! Disable assignment operator
|
||||
SrdManager& operator=(const SrdManager&);
|
||||
|
||||
struct Chunk {
|
||||
Memory* buf_;
|
||||
uint* flags_;
|
||||
Chunk(): buf_(NULL), flags_(NULL) {}
|
||||
};
|
||||
|
||||
static const uint MaskBits = 32;
|
||||
const Device& dev_; //!< GPU device for the chunk manager
|
||||
amd::Monitor ml_; //!< Global lock for the SRD manager
|
||||
std::vector<Chunk> pool_; //!< Pool of SRD buffers
|
||||
uint numFlags_; //!< Total number of flags in array
|
||||
uint srdSize_; //!< SRD size
|
||||
uint bufSize_; //!< Buffer size that holds SRDs
|
||||
};
|
||||
|
||||
//! Initialise the whole GPU device subsystem
|
||||
static bool init();
|
||||
|
||||
//! Shutdown the whole GPU device subsystem
|
||||
static void tearDown();
|
||||
|
||||
//! Construct a new physical GPU device
|
||||
Device();
|
||||
|
||||
//! Initialise a device (i.e. all parts of the constructor that could
|
||||
//! potentially fail)
|
||||
bool create(
|
||||
Pal::IDevice* device //!< PAL device interface object
|
||||
);
|
||||
|
||||
//! Destructor for the physical GPU device
|
||||
virtual ~Device();
|
||||
|
||||
//! Instantiate a new virtual device
|
||||
device::VirtualDevice* createVirtualDevice(
|
||||
amd::CommandQueue* queue = NULL
|
||||
);
|
||||
|
||||
//! Memory allocation
|
||||
virtual device::Memory* createMemory(
|
||||
amd::Memory& owner //!< abstraction layer memory object
|
||||
) const;
|
||||
|
||||
//! Sampler object allocation
|
||||
virtual bool createSampler(
|
||||
const amd::Sampler& owner, //!< abstraction layer sampler object
|
||||
device::Sampler** sampler //!< device sampler object
|
||||
) const;
|
||||
|
||||
//! Reallocates the provided buffer object
|
||||
virtual bool reallocMemory(
|
||||
amd::Memory& owner //!< Buffer for reallocation
|
||||
) const;
|
||||
|
||||
//! Allocates a view object from the device memory
|
||||
virtual device::Memory* createView(
|
||||
amd::Memory& owner, //!< Owner memory object
|
||||
const device::Memory& parent //!< Parent device memory object for the view
|
||||
) const;
|
||||
|
||||
//! Create the device program.
|
||||
virtual device::Program* createProgram(amd::option::Options* options = NULL);
|
||||
|
||||
//! Attempt to bind with external graphics API's device/context
|
||||
virtual bool bindExternalDevice(
|
||||
intptr_t type,
|
||||
void* pDevice,
|
||||
void* pContext,
|
||||
bool validateOnly);
|
||||
|
||||
//! Attempt to unbind with external graphics API's device/context
|
||||
virtual bool unbindExternalDevice(
|
||||
intptr_t type,
|
||||
void* pDevice,
|
||||
void* pContext,
|
||||
bool validateOnly);
|
||||
|
||||
//! Validates kernel before execution
|
||||
virtual bool validateKernel(
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const device::VirtualDevice* vdev
|
||||
);
|
||||
|
||||
//! Retrieves information about free memory on a GPU device
|
||||
virtual bool globalFreeMemory(size_t* freeMemory) const;
|
||||
|
||||
//! Returns a GPU memory object from AMD memory object
|
||||
pal::Memory* getGpuMemory(
|
||||
amd::Memory* mem //!< Pointer to AMD memory object
|
||||
) const;
|
||||
|
||||
amd::Monitor& lockAsyncOps() const { return *lockAsyncOps_; }
|
||||
|
||||
//! Returns the lock object for the virtual gpus list
|
||||
amd::Monitor* vgpusAccess() const { return vgpusAccess_; }
|
||||
|
||||
//! Returns the monitor object for PAL
|
||||
amd::Monitor& lockPAL() const { return *lockPAL_; }
|
||||
|
||||
//! Returns the number of virtual GPUs allocated on this device
|
||||
uint numOfVgpus() const { return numOfVgpus_; }
|
||||
uint numOfVgpus_; //!< The number of virtual GPUs (lock protected)
|
||||
|
||||
typedef std::vector<VirtualGPU*> VirtualGPUs;
|
||||
|
||||
//! Returns the list of all virtual GPUs running on this device
|
||||
const VirtualGPUs vgpus() const { return vgpus_; }
|
||||
VirtualGPUs vgpus_; //!< The list of all running virtual gpus (lock protected)
|
||||
|
||||
//! Scratch buffer allocation
|
||||
pal::Memory* createScratchBuffer(
|
||||
size_t size //!< Size of buffer
|
||||
) const;
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferWrite() const { return *xferWrite_; }
|
||||
|
||||
//! Returns transfer buffer object
|
||||
XferBuffers& xferRead() const { return *xferRead_; }
|
||||
|
||||
//! Adds GPU memory to the VA cache list
|
||||
void addVACache(Memory* memory) const;
|
||||
|
||||
//! Removes GPU memory from the VA cache list
|
||||
void removeVACache(const Memory* memory) const;
|
||||
|
||||
//! Finds GPU memory from virtual address
|
||||
Memory* findMemoryFromVA(const void* ptr, size_t* offset) const;
|
||||
|
||||
//! Finds an appropriate map target
|
||||
amd::Memory* findMapTarget(size_t size) const;
|
||||
|
||||
//! Adds a map target to the cache
|
||||
bool addMapTarget(amd::Memory* memory) const;
|
||||
|
||||
//! Returns resource cache object
|
||||
ResourceCache& resourceCache() const { return *resourceCache_; }
|
||||
|
||||
//! Returns the number of available compute rings
|
||||
uint numComputeEngines() const { return numComputeEngines_; }
|
||||
|
||||
//! Returns the number of available DMA engines
|
||||
uint numDMAEngines() const { return numDmaEngines_; }
|
||||
|
||||
//! Returns engines object
|
||||
const device::BlitManager& xferMgr() const;
|
||||
|
||||
VirtualGPU* xferQueue() const { return xferQueue_; }
|
||||
|
||||
//! Retrieves the internal format from the OCL format
|
||||
Pal::Format getPalFormat(
|
||||
const amd::Image::Format& format, //! OCL image format
|
||||
Pal::ChannelMapping* channel
|
||||
) const;
|
||||
|
||||
const ScratchBuffer* scratch(uint idx) const { return scratch_[idx]; }
|
||||
|
||||
//! Returns the global scratch buffer
|
||||
Memory* globalScratchBuf() const { return globalScratchBuf_; };
|
||||
|
||||
//! Destroys scratch buffer memory
|
||||
void destroyScratchBuffers();
|
||||
|
||||
//! Initialize heap resources if uninitialized
|
||||
bool initializeHeapResources();
|
||||
|
||||
//! Set GSL sampler to the specified state
|
||||
void fillHwSampler(
|
||||
uint32_t state, //!< Sampler's OpenCL state
|
||||
void* hwState, //!< Sampler's HW state
|
||||
uint32_t hwStateSize, //!< Size of sampler's HW state
|
||||
uint32_t mipFilter = CL_FILTER_NONE, //!< Mip filter
|
||||
float minLod = 0.f, //!< Min level of detail
|
||||
float maxLod = CL_MAXFLOAT //!< Max level of detail
|
||||
) const;
|
||||
|
||||
//! host memory alloc
|
||||
virtual void* hostAlloc(size_t size, size_t alignment, bool atomics = false) const;
|
||||
|
||||
//! SVM allocation
|
||||
virtual void* svmAlloc(amd::Context& context, size_t size, size_t alignment,
|
||||
cl_svm_mem_flags flags, void* svmPtr) const;
|
||||
|
||||
//! Free host SVM memory
|
||||
void hostFree(void* ptr, size_t size) const;
|
||||
|
||||
//! SVM free
|
||||
virtual void svmFree(void* ptr) const;
|
||||
|
||||
//! Returns SRD manger object
|
||||
SrdManager& srds() const { return *srdManager_; }
|
||||
|
||||
//! Initial the Hardware Debug Manager
|
||||
cl_int hwDebugManagerInit(amd::Context *context, uintptr_t messageStorage);
|
||||
|
||||
//! Returns PAL device properties
|
||||
const Pal::DeviceProperties& properties() const { return properties_; }
|
||||
|
||||
//! Returns PAL device interface
|
||||
Pal::IDevice* iDev() const { return device_; }
|
||||
|
||||
//! Return private device context for internal allocations
|
||||
amd::Context& context() const { return *context_; }
|
||||
|
||||
//! Update free memory for OCL extension
|
||||
void updateFreeMemory(
|
||||
Pal::GpuHeap heap, //!< PAL GPU heap for update
|
||||
Pal::gpusize size, //!< Size of alocated/destroyed memory
|
||||
bool free //!< TRUE if runtime frees memory
|
||||
);
|
||||
|
||||
//! Interop for GL device
|
||||
bool initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
bool glCanInterop(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
bool resGLAssociate(void* GLContext, uint name, uint type,
|
||||
void** handle, void** mbResHandle, size_t* offset) const;
|
||||
bool resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const;
|
||||
bool resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const;
|
||||
bool resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const;
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Device(const Device&);
|
||||
|
||||
//! Disable assignment
|
||||
Device& operator=(const Device&);
|
||||
|
||||
//! Sends the stall command to all queues
|
||||
bool stallQueues();
|
||||
|
||||
//! Buffer allocation
|
||||
pal::Memory* createBuffer(
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
bool directAccess //!< Use direct host memory access
|
||||
) const;
|
||||
|
||||
//! Image allocation
|
||||
pal::Memory* createImage(
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
bool directAccess //!< Use direct host memory access
|
||||
) const;
|
||||
|
||||
//! Allocates/reallocates the scratch buffer, according to the usage
|
||||
bool allocScratch(
|
||||
uint regNum, //!< Number of the scratch registers
|
||||
const VirtualGPU* vgpu //!< Virtual GPU for the allocation
|
||||
);
|
||||
|
||||
//! Interop for D3D devices
|
||||
bool associateD3D11Device(
|
||||
void* d3d11Device //!< void* is of type ID3D11Device*
|
||||
);
|
||||
bool associateD3D10Device(
|
||||
void* d3d10Device //!< void* is of type ID3D10Device*
|
||||
);
|
||||
bool associateD3D9Device(
|
||||
void* d3d9Device //!< void* is of type IDirect3DDevice9*
|
||||
);
|
||||
//! Interop for GL device
|
||||
bool glAssociate(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
bool glDissociate(void* GLplatformContext, void* GLdeviceContext) const;
|
||||
|
||||
amd::Context* context_; //!< A dummy context for internal allocations
|
||||
amd::Monitor* lockAsyncOps_; //!< Lock to serialise all async ops on this device
|
||||
amd::Monitor* lockForInitHeap_; //!< Lock to serialise all async ops on initialization heap operation
|
||||
amd::Monitor* lockPAL_; //!< Lock to serialise PAL access
|
||||
amd::Monitor* vgpusAccess_; //!< Lock to serialise virtual gpu list access
|
||||
amd::Monitor* scratchAlloc_; //!< Lock to serialise scratch allocation
|
||||
amd::Monitor* mapCacheOps_; //!< Lock to serialise cache for the map resources
|
||||
XferBuffers* xferRead_; //!< Transfer buffers read
|
||||
XferBuffers* xferWrite_; //!< Transfer buffers write
|
||||
amd::Monitor* vaCacheAccess_; //!< Lock to serialize VA caching access
|
||||
std::list<VACacheEntry*>* vaCacheList_; //!< VA cache list
|
||||
std::vector<amd::Memory*>* mapCache_; //!< Map cache info structure
|
||||
ResourceCache* resourceCache_; //!< Resource cache
|
||||
uint numComputeEngines_; //!< The number of available compute engines
|
||||
uint numDmaEngines_; //!< The number of available compute engines
|
||||
bool heapInitComplete_; //!< Keep track of initialization status of heap resources
|
||||
VirtualGPU* xferQueue_; //!< Transfer queue
|
||||
std::vector<ScratchBuffer*> scratch_; //!< Scratch buffers for kernels
|
||||
Memory* globalScratchBuf_; //!< Global scratch buffer
|
||||
SrdManager* srdManager_; //!< SRD manager object
|
||||
static AppProfile appProfile_; //!< application profile
|
||||
mutable bool freeCPUMem_; //!< flag to mark GPU free SVM CPU mem
|
||||
Pal::DeviceProperties properties_; //!< PAL device properties
|
||||
Pal::IDevice* device_; //!< PAL device object
|
||||
std::atomic<Pal::gpusize> freeMem[Pal::GpuHeap::GpuHeapCount]; //!< Free memory counter
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALDEVICE_HPP_*/
|
||||
@@ -0,0 +1,143 @@
|
||||
#include "paldevice.hpp"
|
||||
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool
|
||||
Device::associateD3D10Device(void* d3d10Device)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
} // pal
|
||||
#else // !ATI_OS_WIN
|
||||
|
||||
#include <D3D10_1.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
|
||||
static bool
|
||||
queryD3D10DeviceGPUMask(ID3D10Device* pd3d10Device, UINT* pd3d10DeviceGPUMask)
|
||||
{
|
||||
HMODULE hDLL = nullptr;
|
||||
IAmdDxExt* pExt = nullptr;
|
||||
IAmdDxExtCLInterop* pCLExt = nullptr;
|
||||
PFNAmdDxExtCreate AmdDxExtCreate;
|
||||
HRESULT hr = S_OK;
|
||||
|
||||
// Get a handle to the DXX DLL with extension API support
|
||||
#if defined _WIN64
|
||||
static const CHAR dxxModuleName[13] = "atidxx64.dll";
|
||||
#else
|
||||
static const CHAR dxxModuleName[13] = "atidxx32.dll";
|
||||
#endif
|
||||
|
||||
hDLL = GetModuleHandle(dxxModuleName);
|
||||
|
||||
if (hDLL == nullptr) {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
|
||||
// Get the exported AmdDxExtCreate() function pointer
|
||||
if (SUCCEEDED(hr)) {
|
||||
AmdDxExtCreate = reinterpret_cast<PFNAmdDxExtCreate>(
|
||||
GetProcAddress(hDLL, "AmdDxExtCreate"));
|
||||
if (AmdDxExtCreate == nullptr) {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extension object
|
||||
if (SUCCEEDED(hr)) {
|
||||
hr = AmdDxExtCreate(pd3d10Device, &pExt);
|
||||
}
|
||||
|
||||
// Get the extension version information
|
||||
if (SUCCEEDED(hr)) {
|
||||
AmdDxExtVersion extVersion;
|
||||
hr = pExt->GetVersion(&extVersion);
|
||||
|
||||
if (extVersion.majorVersion == 0)
|
||||
{
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// Get the OpenCL Interop interface
|
||||
if (SUCCEEDED(hr)) {
|
||||
pCLExt = static_cast<IAmdDxExtCLInterop*>(
|
||||
pExt->GetExtInterface(AmdDxExtCLInteropID));
|
||||
if (pCLExt != nullptr) {
|
||||
// Get the GPU mask using the CL Interop extension.
|
||||
pCLExt->QueryInteropGpuMask(pd3d10DeviceGPUMask);
|
||||
}
|
||||
else {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
if (pCLExt != nullptr) {
|
||||
pCLExt->Release();
|
||||
}
|
||||
|
||||
if (pExt != nullptr) {
|
||||
pExt->Release();
|
||||
}
|
||||
|
||||
return (SUCCEEDED(hr));
|
||||
}
|
||||
|
||||
bool
|
||||
Device::associateD3D10Device(void* d3d10Device)
|
||||
{
|
||||
ID3D10Device* pd3d10Device = static_cast<ID3D10Device*>(d3d10Device);
|
||||
|
||||
IDXGIDevice* pDXGIDevice;
|
||||
pd3d10Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
|
||||
|
||||
IDXGIAdapter* pDXGIAdapter;
|
||||
pDXGIDevice->GetAdapter(&pDXGIAdapter);
|
||||
|
||||
DXGI_ADAPTER_DESC adapterDesc;
|
||||
pDXGIAdapter->GetDesc(&adapterDesc);
|
||||
|
||||
// match the adapter
|
||||
bool canInteroperate =
|
||||
(properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
|
||||
|
||||
UINT chainBitMask = 1 << properties().gpuIndex;
|
||||
|
||||
// match the chain ID
|
||||
if (canInteroperate) {
|
||||
UINT d3d10DeviceGPUMask = 0;
|
||||
|
||||
if (queryD3D10DeviceGPUMask(pd3d10Device, &d3d10DeviceGPUMask)) {
|
||||
canInteroperate = (chainBitMask & d3d10DeviceGPUMask) != 0;
|
||||
}
|
||||
else {
|
||||
// special handling for Intel iGPU + AMD dGPU in LDA mode
|
||||
// (only occurs on a PX platform) where
|
||||
// the D3D10Device object is created on the Intel iGPU and
|
||||
// passed to AMD dGPU (secondary) to interoperate.
|
||||
if (chainBitMask > 1) {
|
||||
canInteroperate = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pDXGIDevice->Release();
|
||||
pDXGIAdapter->Release();
|
||||
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
|
||||
#endif // !ATI_OS_WIN
|
||||
@@ -0,0 +1,142 @@
|
||||
#include "paldevice.hpp"
|
||||
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool
|
||||
Device::associateD3D11Device(void* d3d11Device)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#else // !ATI_OS_LINUX
|
||||
|
||||
#include <D3D11.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
|
||||
static bool
|
||||
queryD3D11DeviceGPUMask(ID3D11Device* pd3d11Device, UINT* pd3d11DeviceGPUMask)
|
||||
{
|
||||
HMODULE hDLL = nullptr;
|
||||
IAmdDxExt* pExt = nullptr;
|
||||
IAmdDxExtCLInterop* pCLExt = nullptr;
|
||||
PFNAmdDxExtCreate11 AmdDxExtCreate11;
|
||||
HRESULT hr = S_OK;
|
||||
|
||||
// Get a handle to the DXX DLL with extension API support
|
||||
#if defined _WIN64
|
||||
static const CHAR dxxModuleName[13] = "atidxx64.dll";
|
||||
#else
|
||||
static const CHAR dxxModuleName[13] = "atidxx32.dll";
|
||||
#endif
|
||||
|
||||
hDLL = GetModuleHandle(dxxModuleName);
|
||||
|
||||
if (hDLL == nullptr) {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
|
||||
// Get the exported AmdDxExtCreate() function pointer
|
||||
if (SUCCEEDED(hr)) {
|
||||
AmdDxExtCreate11 = reinterpret_cast<PFNAmdDxExtCreate11>(
|
||||
GetProcAddress(hDLL, "AmdDxExtCreate11"));
|
||||
if (AmdDxExtCreate11 == nullptr) {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// Create the extension object
|
||||
if (SUCCEEDED(hr)) {
|
||||
hr = AmdDxExtCreate11(pd3d11Device, &pExt);
|
||||
}
|
||||
|
||||
// Get the extension version information
|
||||
if (SUCCEEDED(hr)) {
|
||||
AmdDxExtVersion extVersion;
|
||||
hr = pExt->GetVersion(&extVersion);
|
||||
|
||||
if (extVersion.majorVersion == 0) {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
// Get the OpenCL Interop interface
|
||||
if (SUCCEEDED(hr)) {
|
||||
pCLExt = static_cast<IAmdDxExtCLInterop*>(
|
||||
pExt->GetExtInterface(AmdDxExtCLInteropID));
|
||||
if (pCLExt != nullptr) {
|
||||
// Get the GPU mask using the CL Interop extension.
|
||||
pCLExt->QueryInteropGpuMask(pd3d11DeviceGPUMask);
|
||||
}
|
||||
else {
|
||||
hr = E_FAIL;
|
||||
}
|
||||
}
|
||||
|
||||
if (pCLExt != nullptr) {
|
||||
pCLExt->Release();
|
||||
}
|
||||
|
||||
if (pExt != nullptr) {
|
||||
pExt->Release();
|
||||
}
|
||||
|
||||
return (SUCCEEDED(hr));
|
||||
}
|
||||
|
||||
bool
|
||||
Device::associateD3D11Device(void* d3d11Device)
|
||||
{
|
||||
ID3D11Device* pd3d11Device = static_cast<ID3D11Device*>(d3d11Device);
|
||||
|
||||
IDXGIDevice* pDXGIDevice;
|
||||
pd3d11Device->QueryInterface(__uuidof(IDXGIDevice), (void **)&pDXGIDevice);
|
||||
|
||||
IDXGIAdapter* pDXGIAdapter;
|
||||
pDXGIDevice->GetAdapter(&pDXGIAdapter);
|
||||
|
||||
DXGI_ADAPTER_DESC adapterDesc;
|
||||
pDXGIAdapter->GetDesc(&adapterDesc);
|
||||
|
||||
// match the adapter
|
||||
bool canInteroperate =
|
||||
(properties().osProperties.luidHighPart == adapterDesc.AdapterLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == adapterDesc.AdapterLuid.LowPart);
|
||||
|
||||
UINT chainBitMask = 1 << properties().gpuIndex;
|
||||
|
||||
// match the chain ID
|
||||
if (canInteroperate) {
|
||||
UINT d3d11DeviceGPUMask = 0;
|
||||
|
||||
if (queryD3D11DeviceGPUMask(pd3d11Device, &d3d11DeviceGPUMask)) {
|
||||
canInteroperate = (chainBitMask & d3d11DeviceGPUMask) != 0;
|
||||
}
|
||||
else {
|
||||
// special handling for Intel iGPU + AMD dGPU in LDA mode
|
||||
// (only occurs on a PX platform) where
|
||||
// the D3D11Device object is created on the Intel iGPU and
|
||||
// passed to AMD dGPU (secondary) to interoperate.
|
||||
if (chainBitMask > 1) {
|
||||
canInteroperate = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pDXGIDevice->Release();
|
||||
pDXGIAdapter->Release();
|
||||
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
|
||||
#endif // !ATI_OS_LINUX
|
||||
@@ -0,0 +1,53 @@
|
||||
#include "paldevice.hpp"
|
||||
|
||||
#if defined(ATI_OS_LINUX)
|
||||
namespace pal {
|
||||
bool
|
||||
Device::associateD3D9Device(void* d3dDevice)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
}
|
||||
#else // !ATI_OS_LINUX
|
||||
|
||||
#include <d3d9.h>
|
||||
#include <dxgi.h>
|
||||
|
||||
/**************************************************************************************************************
|
||||
* Note: ideally the DXX extension interfaces should be mapped from the DXX perforce branch.
|
||||
* This means OCL client spec will need to change to include headers directly from the DXX perforce tree.
|
||||
* However, OCL only cares about the DXX OpenCL extension interface class. The spec cannot change
|
||||
* without notification. So it is safe to use a local copy of the relevant DXX extension interface classes.
|
||||
**************************************************************************************************************/
|
||||
#include "DxxOpenCLInteropExt.h"
|
||||
|
||||
namespace pal {
|
||||
|
||||
bool
|
||||
Device::associateD3D9Device(void* d3d9Device)
|
||||
{
|
||||
D3DCAPS9 pCaps;
|
||||
IDirect3D9* p3d9dev;
|
||||
LUID d3d9deviceLuid = {0, 0};
|
||||
|
||||
IDirect3DDevice9* pd3d9Device = static_cast<IDirect3DDevice9*>(d3d9Device);
|
||||
|
||||
// Get D3D9 Device caps
|
||||
pd3d9Device->GetDeviceCaps(&pCaps);
|
||||
// Get 3D9 Device
|
||||
pd3d9Device->GetDirect3D(&p3d9dev);
|
||||
|
||||
IDirect3D9Ex* p3d9devEx = static_cast<IDirect3D9Ex*>(p3d9dev);
|
||||
p3d9devEx->GetAdapterLUID(pCaps.AdapterOrdinal, &d3d9deviceLuid);
|
||||
p3d9dev->Release();
|
||||
|
||||
// match the adapter
|
||||
bool canInteroperate =
|
||||
(properties().osProperties.luidHighPart == d3d9deviceLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == d3d9deviceLuid.LowPart);
|
||||
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
} // pal
|
||||
#endif // !ATI_OS_WIN
|
||||
@@ -0,0 +1,306 @@
|
||||
#include "platform/context.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "platform/runtime.hpp"
|
||||
#include "platform/agent.hpp"
|
||||
#ifdef _WIN32
|
||||
#include <d3d10_1.h>
|
||||
#include "CL/cl_d3d10.h"
|
||||
#include "CL/cl_d3d11.h"
|
||||
#endif // _WIN32
|
||||
|
||||
#include <GL/gl.h>
|
||||
#include <GL/glext.h>
|
||||
#include "CL/cl_gl.h"
|
||||
#include "paldevice.hpp"
|
||||
//#include "cwddeci.h"
|
||||
#include <GL/gl.h>
|
||||
#include "GL/glATIInternal.h"
|
||||
#ifdef ATI_OS_LINUX
|
||||
#include <stdlib.h>
|
||||
#include <dlfcn.h>
|
||||
#include "GL/glx.h"
|
||||
#include "GL/glxext.h"
|
||||
#include "GL/glXATIPrivate.h"
|
||||
#else
|
||||
#include "GL/wglATIPrivate.h"
|
||||
#endif
|
||||
|
||||
#ifdef ATI_OS_LINUX
|
||||
typedef void* (*PFNGlxGetProcAddress)(const GLubyte* procName);
|
||||
static PFNGlxGetProcAddress pfnGlxGetProcAddress=NULL;
|
||||
static PFNGLXBEGINCLINTEROPAMD glXBeginCLInteropAMD = NULL;
|
||||
static PFNGLXENDCLINTEROPAMD glXEndCLInteropAMD = NULL;
|
||||
static PFNGLXRESOURCEATTACHAMD glXResourceAttachAMD = NULL;
|
||||
static PFNGLXRESOURCEDETACHAMD glxResourceAcquireAMD = NULL;
|
||||
static PFNGLXRESOURCEDETACHAMD glxResourceReleaseAMD = NULL;
|
||||
static PFNGLXRESOURCEDETACHAMD glXResourceDetachAMD = NULL;
|
||||
static PFNGLXGETCONTEXTMVPUINFOAMD glXGetContextMVPUInfoAMD = NULL;
|
||||
#else
|
||||
static PFNWGLBEGINCLINTEROPAMD wglBeginCLInteropAMD = NULL;
|
||||
static PFNWGLENDCLINTEROPAMD wglEndCLInteropAMD = NULL;
|
||||
static PFNWGLRESOURCEATTACHAMD wglResourceAttachAMD = NULL;
|
||||
static PFNWGLRESOURCEDETACHAMD wglResourceAcquireAMD = NULL;
|
||||
static PFNWGLRESOURCEDETACHAMD wglResourceReleaseAMD = NULL;
|
||||
static PFNWGLRESOURCEDETACHAMD wglResourceDetachAMD = NULL;
|
||||
static PFNWGLGETCONTEXTGPUINFOAMD wglGetContextGPUInfoAMD = NULL;
|
||||
#endif
|
||||
|
||||
namespace pal {
|
||||
|
||||
bool
|
||||
Device::initGLInteropPrivateExt(void* GLplatformContext, void* GLdeviceContext) const
|
||||
{
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext)GLplatformContext;
|
||||
void * pModule = dlopen("libGL.so.1",RTLD_NOW);
|
||||
|
||||
if(NULL == pModule) {
|
||||
return false;
|
||||
}
|
||||
pfnGlxGetProcAddress = (PFNGlxGetProcAddress) dlsym(pModule,"glXGetProcAddress");
|
||||
|
||||
if (NULL == pfnGlxGetProcAddress) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
|
||||
!glXResourceDetachAMD || !glXGetContextMVPUInfoAMD) {
|
||||
glXBeginCLInteropAMD = (PFNGLXBEGINCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXBeginCLInteroperabilityAMD");
|
||||
glXEndCLInteropAMD = (PFNGLXENDCLINTEROPAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXEndCLInteroperabilityAMD");
|
||||
glXResourceAttachAMD = (PFNGLXRESOURCEATTACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAttachAMD");
|
||||
glxResourceAcquireAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceAcquireAMD");
|
||||
glxResourceReleaseAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceReleaseAMD");
|
||||
glXResourceDetachAMD = (PFNGLXRESOURCEDETACHAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXResourceDetachAMD");
|
||||
glXGetContextMVPUInfoAMD = (PFNGLXGETCONTEXTMVPUINFOAMD) pfnGlxGetProcAddress ((const GLubyte *)"glXGetContextMVPUInfoAMD");
|
||||
}
|
||||
|
||||
if (!glXBeginCLInteropAMD || !glXEndCLInteropAMD || !glXResourceAttachAMD ||
|
||||
!glXResourceDetachAMD
|
||||
#ifndef BRAHMA
|
||||
|| !glXGetContextMVPUInfoAMD
|
||||
#endif
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
|
||||
!wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
|
||||
HGLRC fakeRC = NULL;
|
||||
|
||||
if (!wglGetCurrentContext()) {
|
||||
fakeRC = wglCreateContext((HDC)GLdeviceContext);
|
||||
wglMakeCurrent((HDC)GLdeviceContext, fakeRC);
|
||||
}
|
||||
|
||||
wglBeginCLInteropAMD = (PFNWGLBEGINCLINTEROPAMD) wglGetProcAddress ("wglBeginCLInteroperabilityAMD");
|
||||
wglEndCLInteropAMD = (PFNWGLENDCLINTEROPAMD) wglGetProcAddress ("wglEndCLInteroperabilityAMD");
|
||||
wglResourceAttachAMD = (PFNWGLRESOURCEATTACHAMD) wglGetProcAddress ("wglResourceAttachAMD");
|
||||
wglResourceAcquireAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceAcquireAMD");
|
||||
wglResourceReleaseAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceReleaseAMD");
|
||||
wglResourceDetachAMD = (PFNWGLRESOURCEDETACHAMD) wglGetProcAddress ("wglResourceDetachAMD");
|
||||
wglGetContextGPUInfoAMD = (PFNWGLGETCONTEXTGPUINFOAMD) wglGetProcAddress ("wglGetContextGPUInfoAMD");
|
||||
|
||||
if (fakeRC) {
|
||||
wglMakeCurrent(NULL, NULL);
|
||||
wglDeleteContext(fakeRC);
|
||||
}
|
||||
}
|
||||
if (!wglBeginCLInteropAMD || !wglEndCLInteropAMD || !wglResourceAttachAMD ||
|
||||
!wglResourceDetachAMD || !wglGetContextGPUInfoAMD) {
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
Device::glCanInterop(void* GLplatformContext, void* GLdeviceContext) const
|
||||
{
|
||||
bool canInteroperate = false;
|
||||
|
||||
#ifdef ATI_OS_WIN
|
||||
LUID glAdapterLuid = {0, 0};
|
||||
UINT glChainBitMask = 0;
|
||||
HGLRC hRC = (HGLRC)GLplatformContext;
|
||||
|
||||
//get GL context's LUID and chainBitMask from UGL
|
||||
if (wglGetContextGPUInfoAMD(hRC, &glAdapterLuid, &glChainBitMask)) {
|
||||
// match the adapter
|
||||
canInteroperate =
|
||||
(properties().osProperties.luidHighPart == glAdapterLuid.HighPart) &&
|
||||
(properties().osProperties.luidLowPart == glAdapterLuid.LowPart) &&
|
||||
((1 << properties().gpuIndex) == glChainBitMask);
|
||||
}
|
||||
#else
|
||||
#ifdef BRAHMA
|
||||
canInteroperate = true;
|
||||
#else
|
||||
GLuint glDeviceId = 0 ;
|
||||
GLuint glChainMask = 0 ;
|
||||
GLXContext ctx = (GLXContext)GLplatformContext;
|
||||
|
||||
if (glXGetContextMVPUInfoAMD(ctx, &glDeviceId, &glChainMask)) {
|
||||
// we allow intoperability only with GL context reside on a single GPU
|
||||
canInteroperate =
|
||||
(properties().deviceId == glDeviceId) &&
|
||||
((1 << properties().gpuIndex) == glChainBitMask);
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
return canInteroperate;
|
||||
}
|
||||
|
||||
bool
|
||||
Device::glAssociate(void* GLplatformContext, void* GLdeviceContext) const
|
||||
{
|
||||
//initialize pointers to the gl extension that supports interoperability
|
||||
if (!initGLInteropPrivateExt(GLplatformContext, GLdeviceContext) ||
|
||||
!glCanInterop(GLplatformContext, GLdeviceContext)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int flags = 0;
|
||||
/*
|
||||
if (m_adp->pAsicInfo->svmFineGrainSystem)
|
||||
{
|
||||
flags = GL_INTEROP_SVM;
|
||||
}
|
||||
*/
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext)GLplatformContext;
|
||||
return (glXBeginCLInteropAMD(ctx, 0)) ? true : false;
|
||||
#else
|
||||
HGLRC hRC = (HGLRC)GLplatformContext;
|
||||
return (wglBeginCLInteropAMD(hRC, flags)) ? true : false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
Device::glDissociate(void* GLplatformContext, void* GLdeviceContext) const
|
||||
{
|
||||
int flags = 0;
|
||||
/*
|
||||
if (m_adp->pAsicInfo->svmFineGrainSystem)
|
||||
{
|
||||
flags = GL_INTEROP_SVM;
|
||||
}
|
||||
*/
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext)GLplatformContext;
|
||||
return (glXEndCLInteropAMD(ctx, 0)) ? true : false;
|
||||
#else
|
||||
HGLRC hRC = (HGLRC)GLplatformContext;
|
||||
return (wglEndCLInteropAMD(hRC, flags)) ? true : false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
Device::resGLAssociate(
|
||||
void* GLContext,
|
||||
uint name,
|
||||
uint type,
|
||||
void** handle,
|
||||
void** mbResHandle,
|
||||
size_t* offset) const
|
||||
{
|
||||
amd::ScopedLock lk(lockPAL());
|
||||
|
||||
GLResource hRes = {};
|
||||
GLResourceData hData = {};
|
||||
|
||||
bool status = false;
|
||||
|
||||
hRes.type = type;
|
||||
hRes.name = name;
|
||||
|
||||
hData.version = GL_RESOURCE_DATA_VERSION;
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext)GLContext;
|
||||
if (glXResourceAttachAMD(ctx, &hRes, &hData)) {
|
||||
attribs.dynamicSharedBufferID = hData->sharedBufferID;
|
||||
status = true;
|
||||
}
|
||||
#else
|
||||
HGLRC hRC = (HGLRC)GLContext;
|
||||
if (wglResourceAttachAMD(hRC, &hRes, &hData)) {
|
||||
status = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!status) {
|
||||
return false;
|
||||
}
|
||||
|
||||
*handle = reinterpret_cast<void*>(hData.handle);
|
||||
*mbResHandle = reinterpret_cast<void*>(hData.mbResHandle);
|
||||
*offset = static_cast<size_t>(hData.offset);
|
||||
|
||||
return status;
|
||||
}
|
||||
|
||||
bool
|
||||
Device::resGLAcquire(void* GLplatformContext, void* mbResHandle, uint type) const
|
||||
{
|
||||
amd::ScopedLock lk(lockPAL());
|
||||
|
||||
GLResource hRes = {};
|
||||
hRes.mbResHandle = (GLuintp)mbResHandle;
|
||||
hRes.type = type;
|
||||
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext) GLplatformContext;
|
||||
return (glxResourceAcquireAMD(ctx, &hRes)) ? true : false;
|
||||
#else
|
||||
HGLRC hRC = wglGetCurrentContext();
|
||||
//! @todo A temporary workaround for MT issue in conformance fence_sync
|
||||
if (0 == hRC) {
|
||||
return true;
|
||||
}
|
||||
return (wglResourceAcquireAMD(hRC, &hRes)) ? true : false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
Device::resGLRelease(void* GLplatformContext, void* mbResHandle, uint type) const
|
||||
{
|
||||
amd::ScopedLock lk(lockPAL());
|
||||
|
||||
GLResource hRes = {};
|
||||
hRes.mbResHandle = (GLuintp)mbResHandle;
|
||||
hRes.type = type;
|
||||
#ifdef ATI_OS_LINUX
|
||||
//TODO : make sure the application GL context is current. if not no
|
||||
// point calling into the GL RT.
|
||||
GLXContext ctx = (GLXContext) GLplatformContext;
|
||||
return (glxResourceReleaseAMD(ctx, &hRes)) ? true : false;
|
||||
#else
|
||||
// Make the call into the GL driver only if the application GL context is current
|
||||
HGLRC hRC = wglGetCurrentContext();
|
||||
//! @todo A temporary workaround for MT issue in conformance fence_sync
|
||||
if (0 == hRC) {
|
||||
return true;
|
||||
}
|
||||
return (wglResourceReleaseAMD(hRC, &hRes)) ? true : false;
|
||||
#endif
|
||||
}
|
||||
|
||||
bool
|
||||
Device::resGLFree(void* GLplatformContext, void* mbResHandle, uint type) const
|
||||
{
|
||||
amd::ScopedLock lk(lockPAL());
|
||||
|
||||
GLResource hRes = {};
|
||||
hRes.mbResHandle = (GLuintp)mbResHandle;
|
||||
hRes.type = type;
|
||||
#ifdef ATI_OS_LINUX
|
||||
GLXContext ctx = (GLXContext)GLplatformContext;
|
||||
return (glXResourceDetachAMD(ctx, &hRes)) ? true : false;
|
||||
#else
|
||||
HGLRC hRC = (HGLRC)GLplatformContext;
|
||||
return (wglResourceDetachAMD(hRC, &hRes)) ? true : false;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // pal
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -0,0 +1,263 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Include guard named after this header (the closing #endif comment already
// says PALKERNEL_HPP_); a guard name copied from another backend's header
// could silently suppress one of the two headers in a shared translation unit
#ifndef PALKERNEL_HPP_
#define PALKERNEL_HPP_
|
||||
|
||||
#include "device/device.hpp"
|
||||
#include "utils/macros.hpp"
|
||||
#include "platform/command.hpp"
|
||||
#include "platform/program.hpp"
|
||||
#include "platform/kernel.hpp"
|
||||
#include "platform/sampler.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "amd_hsa_kernel_code.h"
|
||||
#include "device/pal/palprintf.hpp"
|
||||
#include "device/pal/palwavelimiter.hpp"
|
||||
#include "hsa.h"
|
||||
|
||||
// Forward declaration of the HSA loader symbol type; only pointers to it are
// used in this header
namespace amd { namespace hsa { namespace loader {
class Symbol;
} /* loader */ } /* hsa */ } /* amd */
|
||||
|
||||
//! \namespace pal PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
class VirtualGPU;
|
||||
class Device;
|
||||
class NullDevice;
|
||||
class HSAILProgram;
|
||||
|
||||
//! Helper for fields that store an offset (relative to a base address) in a
//! pointer-typed member
struct HWSHADER_Helper
{
    //! Adds the numeric value of \a offset to the address \a base and returns
    //! the sum converted back to the field type T
    template <typename S, typename T>
    static T Get(S base, T offset)
    {
        const intptr_t baseAddr = reinterpret_cast<intptr_t>(base);
        const size_t   delta    = reinterpret_cast<size_t>(offset);
        return reinterpret_cast<T>(baseAddr + delta);
    }
};

//! Resolves the offset stored in shader->field against the shader's own address
#define HWSHADER_Get(shader, field) \
    HWSHADER_Helper::Get((shader), (shader)->field)
|
||||
|
||||
//! Sets \a dst to the address \a src advanced by structSize * size bytes
template <typename D, typename S>
static void CalcPtr(D& dst, const S src, size_t structSize, size_t size)
{
    const intptr_t start = reinterpret_cast<const intptr_t>(src);
    dst = reinterpret_cast<D>(start + structSize * size);
}
|
||||
|
||||
/*! \addtogroup pal PAL Device Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! Address qualifier of an HSAIL kernel argument
enum HSAIL_ADDRESS_QUALIFIER {
    HSAIL_ADDRESS_ERROR          = 0,
    HSAIL_ADDRESS_GLOBAL         = 1,
    HSAIL_ADDRESS_LOCAL          = 2,
    HSAIL_MAX_ADDRESS_QUALIFIERS = 3    //!< Number of entries above
};
|
||||
|
||||
//! Kind of an HSAIL kernel argument
enum HSAIL_ARG_TYPE {
    HSAIL_ARGTYPE_ERROR    = 0,
    HSAIL_ARGTYPE_POINTER  = 1,
    HSAIL_ARGTYPE_VALUE    = 2,
    HSAIL_ARGTYPE_IMAGE    = 3,
    HSAIL_ARGTYPE_SAMPLER  = 4,
    HSAIL_ARGTYPE_QUEUE    = 5,
    HSAIL_ARGMAX_ARG_TYPES = 6  //!< Number of entries above
};
|
||||
|
||||
//! Data type of an HSAIL kernel argument
enum HSAIL_DATA_TYPE {
    HSAIL_DATATYPE_ERROR     = 0,
    // bit types
    HSAIL_DATATYPE_B1        = 1,
    HSAIL_DATATYPE_B8        = 2,
    HSAIL_DATATYPE_B16       = 3,
    HSAIL_DATATYPE_B32       = 4,
    HSAIL_DATATYPE_B64       = 5,
    // signed integers
    HSAIL_DATATYPE_S8        = 6,
    HSAIL_DATATYPE_S16       = 7,
    HSAIL_DATATYPE_S32       = 8,
    HSAIL_DATATYPE_S64       = 9,
    // unsigned integers
    HSAIL_DATATYPE_U8        = 10,
    HSAIL_DATATYPE_U16       = 11,
    HSAIL_DATATYPE_U32       = 12,
    HSAIL_DATATYPE_U64       = 13,
    // floating point
    HSAIL_DATATYPE_F16       = 14,
    HSAIL_DATATYPE_F32       = 15,
    HSAIL_DATATYPE_F64       = 16,
    // aggregates and opaque objects
    HSAIL_DATATYPE_STRUCT    = 17,
    HSAIL_DATATYPE_OPAQUE    = 18,
    HSAIL_DATATYPE_MAX_TYPES = 19   //!< Number of entries above
};
|
||||
|
||||
//! Access type of an HSAIL kernel argument
enum HSAIL_ACCESS_TYPE {
    HSAIL_ACCESS_TYPE_NONE = 0,
    HSAIL_ACCESS_TYPE_RO   = 1,
    HSAIL_ACCESS_TYPE_WO   = 2,
    HSAIL_ACCESS_TYPE_RW   = 3
};
|
||||
|
||||
class HSAILKernel : public device::Kernel
|
||||
{
|
||||
public:
|
||||
struct Argument
|
||||
{
|
||||
std::string name_; //!< Argument's name
|
||||
std::string typeName_; //!< Argument's type name
|
||||
uint size_; //!< Size in bytes
|
||||
uint offset_; //!< Argument's offset
|
||||
uint alignment_; //!< Argument's alignment
|
||||
HSAIL_ARG_TYPE type_; //!< Type of the argument
|
||||
HSAIL_ADDRESS_QUALIFIER addrQual_; //!< Address qualifier of the argument
|
||||
HSAIL_DATA_TYPE dataType_; //!< The type of data
|
||||
uint numElem_; //!< Number of elements
|
||||
HSAIL_ACCESS_TYPE access_; //!< Access type for the argument
|
||||
};
|
||||
|
||||
// Max number of possible extra (hidden) kernel arguments
|
||||
static const uint MaxExtraArgumentsNum = 6;
|
||||
|
||||
HSAILKernel(std::string name,
|
||||
HSAILProgram* prog,
|
||||
std::string compileOptions,
|
||||
uint extraArgsNum);
|
||||
|
||||
virtual ~HSAILKernel();
|
||||
|
||||
//! Initializes the metadata required for this kernel,
|
||||
//! finalizes the kernel if needed
|
||||
bool init(amd::hsa::loader::Symbol *sym, bool finalize = false);
|
||||
|
||||
//! Returns true if memory is valid for execution
|
||||
virtual bool validateMemory(uint idx, amd::Memory* amdMem) const;
|
||||
|
||||
//! Returns a pointer to the hsail argument
|
||||
const Argument* argument(size_t i) const { return arguments_[i]; }
|
||||
|
||||
//! Returns the number of hsail arguments
|
||||
size_t numArguments() const { return arguments_.size(); }
|
||||
|
||||
//! Returns GPU device object, associated with this kernel
|
||||
const Device& dev() const;
|
||||
|
||||
//! Returns HSA program associated with this kernel
|
||||
const HSAILProgram& prog() const;
|
||||
|
||||
//! Returns LDS size used in this kernel
|
||||
uint32_t ldsSize() const
|
||||
{ return cpuAqlCode_->workgroup_group_segment_byte_size; }
|
||||
|
||||
//! Returns pointer on CPU to AQL code info
|
||||
const void* cpuAqlCode() const { return cpuAqlCode_; }
|
||||
|
||||
//! Returns memory object with AQL code
|
||||
pal::Memory* gpuAqlCode() const { return code_; }
|
||||
|
||||
//! Returns size of AQL code
|
||||
size_t aqlCodeSize() const { return codeSize_; }
|
||||
|
||||
//! Returns the size of argument buffer
|
||||
size_t argsBufferSize() const
|
||||
{ return cpuAqlCode_->kernarg_segment_byte_size; }
|
||||
|
||||
//! Returns spill reg size per workitem
|
||||
int spillSegSize() const
|
||||
{ return cpuAqlCode_->workitem_private_segment_byte_size; }
|
||||
|
||||
//! Returns TRUE if kernel uses dynamic parallelism
|
||||
bool dynamicParallelism() const
|
||||
{ return (flags_.dynamicParallelism_) ? true : false; }
|
||||
|
||||
//! Returns TRUE if kernel is internal kernel
|
||||
bool isInternalKernel() const
|
||||
{ return (flags_.internalKernel_) ? true : false; }
|
||||
|
||||
//! Finds local workgroup size
|
||||
void findLocalWorkSize(
|
||||
size_t workDim, //!< Work dimension
|
||||
const amd::NDRange& gblWorkSize,//!< Global work size
|
||||
amd::NDRange& lclWorkSize //!< Local work size
|
||||
) const;
|
||||
|
||||
//! Returns AQL packet in CPU memory
|
||||
//! if the kerenl arguments were successfully loaded, otherwise NULL
|
||||
hsa_kernel_dispatch_packet_t* loadArguments(
|
||||
VirtualGPU& gpu, //!< Running GPU context
|
||||
const amd::Kernel& kernel, //!< AMD kernel object
|
||||
const amd::NDRangeContainer& sizes, //!< NDrange container
|
||||
const_address parameters, //!< Application arguments for the kernel
|
||||
bool nativeMem, //!< Native memory objectes are passed
|
||||
uint64_t vmDefQueue, //!< GPU VM default queue pointer
|
||||
uint64_t* vmParentWrap, //!< GPU VM parent aql wrap object
|
||||
std::vector<const Memory*>& memList //!< Memory list for GSL/VidMM handles
|
||||
) const;
|
||||
|
||||
//! Returns pritnf info array
|
||||
const std::vector<PrintfInfo>& printfInfo() const { return printf_; }
|
||||
|
||||
//! Returns the kernel index in the program
|
||||
uint index() const { return index_; }
|
||||
|
||||
//! Returns kernel's extra argument count
|
||||
uint extraArgumentsNum() const { return extraArgumentsNum_; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
HSAILKernel(const HSAILKernel&);
|
||||
|
||||
//! Disable operator=
|
||||
HSAILKernel& operator=(const HSAILKernel&);
|
||||
|
||||
//! Creates AQL kernel HW info
|
||||
bool aqlCreateHWInfo(amd::hsa::loader::Symbol *sym);
|
||||
|
||||
//! Initializes arguments_ and the abstraction layer kernel parameters
|
||||
void initArgList(
|
||||
const aclArgData* aclArg //!< List of ACL arguments
|
||||
);
|
||||
|
||||
//! Initializes Hsail Argument metadata and info
|
||||
void initHsailArgs(
|
||||
const aclArgData* aclArg //!< List of ACL arguments
|
||||
);
|
||||
|
||||
//! Initializes Hsail Printf metadata and info
|
||||
void initPrintf(
|
||||
const aclPrintfFmt* aclPrintf //!< List of ACL printfs
|
||||
);
|
||||
|
||||
std::vector<Argument*> arguments_; //!< Vector list of HSAIL Arguments
|
||||
std::string compileOptions_; //!< compile used for finalizing this kernel
|
||||
amd_kernel_code_t* cpuAqlCode_; //!< AQL kernel code on CPU
|
||||
const NullDevice& dev_; //!< GPU device object
|
||||
const HSAILProgram& prog_; //!< Reference to the parent program
|
||||
std::vector<PrintfInfo> printf_; //!< Format strings for GPU printf support
|
||||
uint index_; //!< Kernel index in the program
|
||||
|
||||
pal::Memory* code_; //!< Memory object with ISA code
|
||||
size_t codeSize_; //!< Size of ISA code
|
||||
|
||||
char* hwMetaData_; //!< SI metadata
|
||||
|
||||
uint extraArgumentsNum_; //! Number of extra (hidden) kernel arguments
|
||||
|
||||
union Flags {
|
||||
struct {
|
||||
uint imageEna_: 1; //!< Kernel uses images
|
||||
uint imageWriteEna_: 1; //!< Kernel uses image writes
|
||||
uint dynamicParallelism_: 1; //!< Dynamic parallelism enabled
|
||||
uint internalKernel_: 1; //!< True: internal kernel
|
||||
};
|
||||
uint value_;
|
||||
Flags(): value_(0) {}
|
||||
} flags_;
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALKERNEL_HPP_*/
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -0,0 +1,275 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALMEMORY_HPP_
|
||||
#define PALMEMORY_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "thread/atomic.hpp"
|
||||
#include "device/pal/palresource.hpp"
|
||||
#include <map>
|
||||
|
||||
/*! \addtogroup GPU
|
||||
* @{
|
||||
*/
|
||||
namespace device {
|
||||
class Memory;
|
||||
}
|
||||
|
||||
//! PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
class Device;
|
||||
class Heap;
|
||||
class Resource;
|
||||
class Memory;
|
||||
class VirtualGPU;
|
||||
|
||||
//! GPU memory object.
|
||||
// Wrapper that can contain a heap block or an interop buffer/image.
|
||||
class Memory: public device::Memory, public Resource
|
||||
{
|
||||
public:
|
||||
enum InteropType {
|
||||
InteropNone = 0, //!< None interop memory
|
||||
InteropHwEmulation = 1, //!< Uses HW emulaiton with calMemCopy
|
||||
InteropDirectAccess = 2 //!< Uses direct access to the interop surface
|
||||
};
|
||||
|
||||
//! Constructor (with owner)
|
||||
Memory(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
size_t size //!< Memory size for allocation
|
||||
);
|
||||
|
||||
//! Constructor (nonfat version for local scratch mem use without heap block)
|
||||
Memory(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
size_t size //!< Memory size for allocation
|
||||
);
|
||||
|
||||
//! Constructor memory for images (without global heap allocation)
|
||||
Memory(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
size_t width, //!< Allocated memory width
|
||||
size_t height, //!< Allocated memory height
|
||||
size_t depth, //!< Allocated memory depth
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
);
|
||||
|
||||
//! Constructor memory for images (without global heap allocation)
|
||||
Memory(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
size_t size, //!< Memory object size
|
||||
size_t width, //!< Allocated memory width
|
||||
size_t height, //!< Allocated memory height
|
||||
size_t depth, //!< Allocated memory depth
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
);
|
||||
|
||||
//! Default destructor
|
||||
~Memory();
|
||||
|
||||
//! Creates the interop memory
|
||||
bool createInterop(
|
||||
InteropType type //!< The interop type
|
||||
);
|
||||
|
||||
//! Overloads the resource create method
|
||||
virtual bool create(
|
||||
Resource::MemoryType memType, //!< Memory type
|
||||
Resource::CreateParams* params = NULL //!< Prameters for create
|
||||
);
|
||||
|
||||
//! Allocate memory for API-level maps
|
||||
virtual void* allocMapTarget(
|
||||
const amd::Coord3D& origin, //!< The map location in memory
|
||||
const amd::Coord3D& region, //!< The map region in memory
|
||||
uint mapFlags, //!< Map flags
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
||||
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
||||
);
|
||||
|
||||
//! Pins system memory associated with this memory object
|
||||
virtual bool pinSystemMemory(
|
||||
void* hostPtr, //!< System memory address
|
||||
size_t size //!< Size of allocated system memory
|
||||
);
|
||||
|
||||
//! Releases indirect map surface
|
||||
virtual void releaseIndirectMap() { decIndMapCount(); }
|
||||
|
||||
//! Map the device memory to CPU visible
|
||||
virtual void* cpuMap(
|
||||
device::VirtualDevice& vDev,//!< Virtual device for map operaiton
|
||||
uint flags = 0, //!< flags for the map operation
|
||||
// Optimization for multilayer map/unmap
|
||||
uint startLayer = 0, //!< Start layer for multilayer map
|
||||
uint numLayers = 0, //!< End layer for multilayer map
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the device memory
|
||||
size_t* slicePitch = NULL //!< Slice pitch for the device memory
|
||||
);
|
||||
|
||||
//! Unmap the device memory
|
||||
virtual void cpuUnmap(
|
||||
device::VirtualDevice& vDev //!< Virtual device for unmap operaiton
|
||||
);
|
||||
|
||||
//! Updates device memory from the owner's host allocation
|
||||
void syncCacheFromHost(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
//! Synchronization flags
|
||||
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
|
||||
);
|
||||
|
||||
//! Updates the owner's host allocation from device memory
|
||||
virtual void syncHostFromCache(
|
||||
//! Synchronization flags
|
||||
device::Memory::SyncFlags syncFlags = device::Memory::SyncFlags()
|
||||
);
|
||||
|
||||
//! Creates a view from current resource
|
||||
virtual Memory* createBufferView(
|
||||
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
|
||||
);
|
||||
|
||||
//! Allocates host memory for synchronization with MGPU context
|
||||
void mgpuCacheWriteBack();
|
||||
|
||||
//! Transfers objects data to the destination object
|
||||
bool moveTo(Memory& dst);
|
||||
|
||||
//! Accessors for indirect map memory object
|
||||
Memory* mapMemory() const;
|
||||
|
||||
//! Returns the interop memory for this memory object
|
||||
Memory* interop() const { return interopMemory_; }
|
||||
|
||||
//! Gets interop type for this memory object
|
||||
InteropType interopType() const { return interopType_; }
|
||||
|
||||
//! Sets interop type for this memory object
|
||||
void setInteropType(InteropType type) { interopType_ = type; }
|
||||
|
||||
//! Set the owner
|
||||
void setOwner(amd::Memory* owner) { owner_ = owner; }
|
||||
|
||||
// Decompress GL depth-stencil/MSAA resources for CL access
|
||||
// Invalidates any FBOs the resource may be bound to, otherwise the GL driver may crash.
|
||||
virtual bool processGLResource(GLResourceOP operation);
|
||||
|
||||
//! Returns the interop resource for this memory object
|
||||
const Memory* parent() const { return parent_; }
|
||||
|
||||
//! Returns TRUE if direct map is acceaptable. The method detects
|
||||
//! forced USWC memory on APU and will cause a switch to
|
||||
//! indirect map for allocations with a possibility of host read
|
||||
bool isDirectMap()
|
||||
{
|
||||
return (isCacheable() || !isHostMemDirectAccess() ||
|
||||
(owner()->getMemFlags() &
|
||||
(CL_MEM_ALLOC_HOST_PTR | CL_MEM_HOST_WRITE_ONLY | CL_MEM_READ_ONLY)));
|
||||
}
|
||||
|
||||
protected:
|
||||
//! Decrement map count
|
||||
void decIndMapCount();
|
||||
|
||||
//! Initialize the object members
|
||||
void init();
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Memory(const Memory&);
|
||||
|
||||
//! Disable operator=
|
||||
Memory& operator=(const Memory&);
|
||||
|
||||
InteropType interopType_; //!< Interop type
|
||||
Memory* interopMemory_; //!< interop memory
|
||||
Memory* pinnedMemory_; //!< Memory used as pinned system memory
|
||||
const Memory* parent_; //!< Parent memory object
|
||||
};
|
||||
|
||||
class Buffer: public pal::Memory
|
||||
{
|
||||
public:
|
||||
//! Buffer constructor
|
||||
Buffer(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
size_t size //!< Buffer size
|
||||
)
|
||||
: pal::Memory(gpuDev, owner, size)
|
||||
{}
|
||||
|
||||
//! Creates a view from current resource
|
||||
virtual Memory* createBufferView(
|
||||
amd::Memory& subBufferOwner //!< The abstraction layer subbuf owner
|
||||
) const;
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Buffer(const Buffer&);
|
||||
|
||||
//! Disable operator=
|
||||
Buffer& operator=(const Buffer&);
|
||||
};
|
||||
|
||||
class Image: public pal::Memory
|
||||
{
|
||||
public:
|
||||
//! Image constructor
|
||||
Image(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
amd::Memory& owner, //!< Abstraction layer memory object
|
||||
size_t width, //!< Allocated memory width
|
||||
size_t height, //!< Allocated memory height
|
||||
size_t depth, //!< Allocated memory depth
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
)
|
||||
: pal::Memory(gpuDev, owner, width, height, depth, format, imageType, mipLevels)
|
||||
{}
|
||||
|
||||
//! Image constructor
|
||||
Image(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
size_t size, //!< Memory size
|
||||
size_t width, //!< Allocated memory width
|
||||
size_t height, //!< Allocated memory height
|
||||
size_t depth, //!< Allocated memory depth
|
||||
cl_image_format format, //!< Memory format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels //!< The number of mip levels
|
||||
)
|
||||
: pal::Memory(gpuDev, size, width, height, depth, format, imageType, mipLevels)
|
||||
{}
|
||||
|
||||
//! Allocate memory for API-level maps
|
||||
virtual void* allocMapTarget(
|
||||
const amd::Coord3D& origin, //!< The map location in memory
|
||||
const amd::Coord3D& region, //!< The map region in memory
|
||||
uint mapFlags, //!< Map flags
|
||||
size_t* rowPitch = NULL, //!< Row pitch for the mapped memory
|
||||
size_t* slicePitch = NULL //!< Slice for the mapped memory
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Image(const Image&);
|
||||
|
||||
//! Disable operator=
|
||||
Image& operator=(const Image&);
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALMEMORY_HPP_
|
||||
@@ -0,0 +1,714 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "top.hpp"
|
||||
#include "os/os.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palmemory.hpp"
|
||||
#include "device/pal/palkernel.hpp"
|
||||
#include "device/pal/palprogram.hpp"
|
||||
#include "device/pal/palprintf.hpp"
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include <math.h>
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Constructs the printf debug helper. No GPU resources are created here:
//! the debug buffer pointer starts out empty and the per-workitem size is 0.
//!
//! \param device  Device the helper is bound to
//! \param file    Stream stored as the output target
PrintfDbg::PrintfDbg(Device& device, FILE* file)
    : dbgBuffer_(nullptr)       // no debug buffer yet
    , dbgFile_(file)
    , gpuDevice_(device)
    , wiDbgSize_(0)             // no per-workitem size calculated yet
    , initCntValue_(device, 4)  // presumably 4 bytes for the 32-bit counter - confirm
{
}
|
||||
|
||||
//! Destroys the helper and frees the debug buffer (a null pointer is fine)
PrintfDbg::~PrintfDbg()
{
    delete dbgBuffer_;
}
|
||||
|
||||
bool
|
||||
PrintfDbg::create()
|
||||
{
|
||||
// Create a resource for the init count value
|
||||
if (initCntValue_.create(Resource::Remote)) {
|
||||
uint32_t* value = reinterpret_cast<uint32_t*>(initCntValue_.map(nullptr));
|
||||
// The counter starts from 1
|
||||
if (nullptr != value) {
|
||||
*value = 1;
|
||||
}
|
||||
else {
|
||||
return false;
|
||||
}
|
||||
initCntValue_.unmap(nullptr);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::init(
|
||||
VirtualGPU& gpu,
|
||||
bool printfEnabled,
|
||||
const amd::NDRange& size)
|
||||
{
|
||||
// Set up debug output buffer (if printf active)
|
||||
if (printfEnabled) {
|
||||
if (!allocate()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure that the size isn't bigger than the reported max
|
||||
if (size.product() <= dev().settings().maxWorkGroupSize_) {
|
||||
size_t wiDbgSizeTmp;
|
||||
|
||||
// Calculate the debug buffer size per workitem
|
||||
wiDbgSizeTmp = std::min(dbgBuffer_->size() / size.product(),
|
||||
dev().xferRead().bufSize());
|
||||
|
||||
// Make sure the size is DWORD aligned
|
||||
wiDbgSizeTmp = amd::alignDown(wiDbgSizeTmp, sizeof(uint32_t));
|
||||
|
||||
// If the new size is different, then clear the initial values
|
||||
if (wiDbgSize_ != wiDbgSizeTmp) {
|
||||
wiDbgSize_ = wiDbgSizeTmp;
|
||||
if (!clearWorkitems(gpu, 0, size.product())) {
|
||||
wiDbgSize_ = 0;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::output(
|
||||
VirtualGPU& gpu,
|
||||
bool printfEnabled,
|
||||
const amd::NDRange& size,
|
||||
const std::vector<PrintfInfo>& printfInfo)
|
||||
{
|
||||
// Are we expected to generate debug output?
|
||||
if (printfEnabled && !printfInfo.empty()) {
|
||||
uint32_t* workitemData;
|
||||
size_t i, j, k, z;
|
||||
bool realloc = false;
|
||||
|
||||
// Wait for kernel execution
|
||||
gpu.waitAllEngines();
|
||||
|
||||
size_t zdim = 1;
|
||||
size_t ydim = 1;
|
||||
size_t xdim = 1;
|
||||
|
||||
switch (size.dimensions()) {
|
||||
case 3:
|
||||
zdim = size[2];
|
||||
// Fall through ...
|
||||
case 2:
|
||||
ydim = size[1];
|
||||
// Fall through ...
|
||||
case 1:
|
||||
xdim = size[0];
|
||||
// Fall through ...
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for (k = 0; k < zdim; ++k) {
|
||||
for (j = 0; j < ydim; ++j) {
|
||||
for (i = 0; i < xdim; ++i) {
|
||||
size_t idx = (xdim * (ydim * k + j) + i);
|
||||
workitemData = mapWorkitem(gpu, idx, &realloc);
|
||||
|
||||
if (nullptr != workitemData) {
|
||||
uint32_t wp = workitemData[0]; // write pointer (i.e. first unwritten element)
|
||||
// Walk through each PrintfDbg entry
|
||||
for (z = 1; (z < (wiDbgSize() / sizeof(uint32_t))) && (z < wp); ) {
|
||||
if (printfInfo.size() < workitemData[z]) {
|
||||
LogError("The format string wasn't reported");
|
||||
return false;
|
||||
}
|
||||
// Get the PrintfDbg info
|
||||
const PrintfInfo& info = printfInfo[workitemData[z++]];
|
||||
// There's something in this buffer
|
||||
outputDbgBuffer(info, workitemData, z);
|
||||
}
|
||||
}
|
||||
unmapWorkitem(gpu, workitemData);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reallocate debug buffer if necessary
|
||||
if (!allocate(realloc)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::allocate(bool realloc)
|
||||
{
|
||||
if (nullptr == dbgBuffer_) {
|
||||
dbgBuffer_ = dev().createScratchBuffer(dev().info().printfBufferSize_);
|
||||
}
|
||||
else if (realloc) {
|
||||
LogWarning("Debug buffer reallocation!");
|
||||
// Double the buffer size if it's not big enough
|
||||
size_t size = dbgBuffer_->size();
|
||||
delete dbgBuffer_;
|
||||
dbgBuffer_ = dev().createScratchBuffer(size << 1);
|
||||
}
|
||||
|
||||
return (nullptr != dbgBuffer_) ? true : false;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::checkFloat(const std::string& fmt) const
|
||||
{
|
||||
switch (fmt[fmt.size() - 1]) {
|
||||
case 'e':
|
||||
case 'E':
|
||||
case 'f':
|
||||
case 'g':
|
||||
case 'G':
|
||||
case 'a':
|
||||
return true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::checkString(const std::string& fmt) const
|
||||
{
|
||||
if (fmt[fmt.size() - 1] == 's')
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
int
|
||||
PrintfDbg::checkVectorSpecifier(
|
||||
const std::string& fmt,
|
||||
size_t startPos,
|
||||
size_t& curPos) const
|
||||
{
|
||||
int vectorSize = 0;
|
||||
size_t pos = curPos;
|
||||
size_t size = curPos - startPos;
|
||||
|
||||
if (size >= 3) {
|
||||
size = 0;
|
||||
//no modifiers
|
||||
if (fmt[curPos - 3] == 'v') {
|
||||
size = 2;
|
||||
}
|
||||
//the modifiers are "h" or "l"
|
||||
else if (fmt[curPos - 4] == 'v') {
|
||||
size = 3;
|
||||
}
|
||||
//the modifier is "hh"
|
||||
else if ((curPos >= 5) && (fmt[curPos - 5] == 'v')) {
|
||||
size = 4;
|
||||
}
|
||||
if (size > 0) {
|
||||
curPos = size;
|
||||
pos -= curPos;
|
||||
|
||||
// Get vector size
|
||||
vectorSize = fmt[pos++] - '0';
|
||||
// PrintfDbg supports only 2, 3, 4, 8 and 16 wide vectors
|
||||
switch (vectorSize) {
|
||||
case 1:
|
||||
if ((fmt[pos++] - '0') == 6) {
|
||||
vectorSize = 16;
|
||||
}
|
||||
else {
|
||||
vectorSize = 0;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 3:
|
||||
case 4:
|
||||
case 8:
|
||||
break;
|
||||
default:
|
||||
vectorSize = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return vectorSize;
|
||||
}
|
||||
|
||||
// Pseudo argument size: tells outputArgument() to print the argument as a constant C string
static const size_t ConstStr = 0xffffffff;
|
||||
// Text printed between the elements of a vector argument
static const char Separator[] = ",\0";
|
||||
|
||||
size_t
|
||||
PrintfDbg::outputArgument(
|
||||
const std::string& fmt,
|
||||
bool printFloat,
|
||||
size_t size,
|
||||
const uint32_t* argument) const
|
||||
{
|
||||
// Serialize the output to the screen
|
||||
amd::ScopedLock k(dev().lockAsyncOps());
|
||||
|
||||
size_t copiedBytes = size;
|
||||
// Print the string argument, using standard PrintfDbg()
|
||||
if (checkString(fmt.c_str())) {
|
||||
//copiedBytes should be as number of printed chars
|
||||
copiedBytes = 0;
|
||||
//(null) should be printed
|
||||
if (*argument == 0) {
|
||||
amd::Os::printf(fmt.data(),0);
|
||||
//copiedBytes = strlen("(null)")
|
||||
copiedBytes = 6;
|
||||
}
|
||||
else {
|
||||
const unsigned char* argumentStr = reinterpret_cast<const unsigned char*>(argument);
|
||||
amd::Os::printf(fmt.data(),argumentStr);
|
||||
//copiedBytes = strlen(argumentStr)
|
||||
while (argumentStr[copiedBytes++] != 0);
|
||||
}
|
||||
}
|
||||
|
||||
// Print the argument(except for string ), using standard PrintfDbg()
|
||||
else {
|
||||
bool hlModifier = (strstr(fmt.c_str(),"hl") != nullptr);
|
||||
std::string hlFmt;
|
||||
if (hlModifier) {
|
||||
hlFmt = fmt;
|
||||
hlFmt.erase(hlFmt.find_first_of("hl"),2);
|
||||
}
|
||||
switch (size) {
|
||||
case 0: {
|
||||
const char* str = reinterpret_cast<const char*>(argument);
|
||||
amd::Os::printf(fmt.data(), str);
|
||||
// Find the string length
|
||||
while (str[copiedBytes++] != 0);
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
amd::Os::printf(fmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
|
||||
break;
|
||||
case 2:
|
||||
case 4:
|
||||
if (printFloat) {
|
||||
static const char* fSpecifiers = "eEfgGa";
|
||||
std::string fmtF = fmt;
|
||||
size_t posS = fmtF.find_first_of("%");
|
||||
size_t posE = fmtF.find_first_of(fSpecifiers);
|
||||
if (posS != std::string::npos &&posE != std::string::npos) {
|
||||
fmtF.replace(posS+1,posE-posS,"s");
|
||||
}
|
||||
float fArg = *(reinterpret_cast<const float*>(argument));
|
||||
float fSign = copysign(1.0,fArg);
|
||||
if (isinf(fArg)&&!isnan(fArg)) {
|
||||
if(fSign < 0) {
|
||||
amd::Os::printf(fmtF.data(),"-infinity");
|
||||
}
|
||||
else {
|
||||
amd::Os::printf(fmtF.data(),"infinity");
|
||||
}
|
||||
}
|
||||
else if (isnan(fArg)) {
|
||||
if(fSign < 0) {
|
||||
amd::Os::printf(fmtF.data(),"-nan");
|
||||
}
|
||||
else {
|
||||
amd::Os::printf(fmtF.data(),"nan");
|
||||
}
|
||||
}
|
||||
else if (hlModifier) {
|
||||
amd::Os::printf(hlFmt.data(),fArg);
|
||||
}
|
||||
else {
|
||||
amd::Os::printf(fmt.data(),fArg);
|
||||
}
|
||||
}
|
||||
else {
|
||||
bool hhModifier = (strstr(fmt.c_str(),"hh") != nullptr);
|
||||
if (hhModifier) {
|
||||
//current implementation of printf in gcc 4.5.2 runtime libraries, doesn`t recognize "hh" modifier ==>
|
||||
//argument should be explicitly converted to unsigned char (uchar) before printing and
|
||||
//fmt should be updated not to contain "hh" modifier
|
||||
std::string hhFmt = fmt;
|
||||
hhFmt.erase(hhFmt.find_first_of("h"),2);
|
||||
amd::Os::printf(hhFmt.data(), *(reinterpret_cast<const unsigned char*>(argument)));
|
||||
}
|
||||
else if (hlModifier) {
|
||||
amd::Os::printf(hlFmt.data(), *argument);
|
||||
}
|
||||
else {
|
||||
amd::Os::printf(fmt.data(), *argument);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 8:
|
||||
if (printFloat) {
|
||||
if (hlModifier) {
|
||||
amd::Os::printf(hlFmt.data(), *(reinterpret_cast<const double*>(argument)));
|
||||
}
|
||||
else {
|
||||
amd::Os::printf(fmt.data(), *(reinterpret_cast<const double*>(argument)));
|
||||
}
|
||||
}
|
||||
else {
|
||||
std::string out = fmt;
|
||||
// Use 'll' for 64 bit printf
|
||||
out.insert((out.size() - 1), 1, 'l');
|
||||
amd::Os::printf(out.data(), *(reinterpret_cast<const uint64_t*>(argument)));
|
||||
}
|
||||
break;
|
||||
case ConstStr: {
|
||||
const char* str = reinterpret_cast<const char*>(argument);
|
||||
amd::Os::printf(fmt.data(), str);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
amd::Os::printf("Error: Unsupported data size for PrintfDbg. %d bytes",
|
||||
static_cast<int>(size));
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
fflush(stdout);
|
||||
return copiedBytes;
|
||||
}
|
||||
|
||||
void
|
||||
PrintfDbg::outputDbgBuffer(const PrintfInfo& info, const uint32_t* workitemData, size_t& i) const
|
||||
{
|
||||
static const char* specifiers = "cdieEfgGaosuxXp";
|
||||
static const char* modifiers = "hl";
|
||||
static const char* special = "%n";
|
||||
static const std::string sepStr = "%s";
|
||||
const uint32_t* s = workitemData;
|
||||
size_t pos = 0;
|
||||
|
||||
// Find the format string
|
||||
std::string str = info.fmtString_;
|
||||
std::string fmt;
|
||||
size_t posStart, posEnd;
|
||||
|
||||
// Print all arguments
|
||||
// Note: the following code walks through all arguments, provided by the kernel and
|
||||
// finds the corresponding specifier in the format string.
|
||||
// Then it splits the original string into substrings with a single specifier and
|
||||
// uses standard PrintfDbg() to print each argument
|
||||
for (uint j = 0; j < info.arguments_.size(); ++j) {
|
||||
do {
|
||||
posStart = str.find_first_of("%", pos);
|
||||
if (posStart != std::string::npos) {
|
||||
posStart++;
|
||||
// Erase all spaces after %
|
||||
while (str[posStart] == ' ') {
|
||||
str.erase(posStart, 1);
|
||||
}
|
||||
size_t tmp = str.find_first_of(special, posStart);
|
||||
size_t tmp2 = str.find_first_of(specifiers, posStart);
|
||||
// Special cases. Special symbol is located before any specifier
|
||||
if (tmp < tmp2) {
|
||||
posEnd = posStart + 1;
|
||||
fmt = str.substr(pos, posEnd - pos);
|
||||
fmt.erase(posStart - pos - 1, 1);
|
||||
pos = posStart = posEnd;
|
||||
outputArgument(sepStr, false, ConstStr,
|
||||
reinterpret_cast<const uint32_t*>(fmt.data()));
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
else if (pos < str.length()) {
|
||||
outputArgument(sepStr, false, ConstStr,reinterpret_cast<const uint32_t*>((str.substr(pos)).data()));
|
||||
}
|
||||
}
|
||||
while (posStart != std::string::npos);
|
||||
|
||||
if (posStart != std::string::npos) {
|
||||
bool printFloat = false;
|
||||
int vectorSize = 0;
|
||||
size_t length;
|
||||
size_t idPos = 0;
|
||||
|
||||
// Search for PrintfDbg specifier in the format string.
|
||||
// It will be a split point for the output
|
||||
posEnd = str.find_first_of(specifiers, posStart);
|
||||
if (posEnd == std::string::npos) {
|
||||
pos = posStart = posEnd;
|
||||
break;
|
||||
}
|
||||
posEnd++;
|
||||
|
||||
size_t curPos = posEnd;
|
||||
vectorSize = checkVectorSpecifier(str, posStart, curPos);
|
||||
|
||||
// Get substring from the last position to the current specifier
|
||||
fmt = str.substr(pos, posEnd - pos);
|
||||
|
||||
// Readjust the string pointer if PrintfDbg outputs a vector
|
||||
if (vectorSize != 0) {
|
||||
size_t posVecSpec = fmt.length()-(curPos + 1);
|
||||
size_t posVecMod = fmt.find_first_of(modifiers,posVecSpec + 1);
|
||||
size_t posMod = str.find_first_of(modifiers,posStart);
|
||||
if(posMod < posEnd){
|
||||
fmt = fmt.erase(posVecSpec, posVecMod - posVecSpec);
|
||||
}
|
||||
else{
|
||||
fmt = fmt.erase(posVecSpec, curPos);
|
||||
}
|
||||
idPos = posStart - pos - 1;
|
||||
}
|
||||
pos = posStart = posEnd;
|
||||
|
||||
// Find out if the argument is a float
|
||||
printFloat = checkFloat(fmt);
|
||||
|
||||
// Is it a scalar value?
|
||||
if (vectorSize == 0) {
|
||||
length = outputArgument(fmt, printFloat, info.arguments_[j], &s[i]);
|
||||
if (0 == length) {
|
||||
return;
|
||||
}
|
||||
i += amd::alignUp(length, sizeof(uint32_t)) / sizeof(uint32_t);
|
||||
}
|
||||
else {
|
||||
// 3-component vector's size is defined as 4 * size of each scalar component
|
||||
size_t elemSize = info.arguments_[j] / (vectorSize == 3 ? 4 : vectorSize);
|
||||
size_t k = i * sizeof(uint32_t);
|
||||
std::string elementStr = fmt.substr(idPos, fmt.size());
|
||||
|
||||
// Print first element with full string
|
||||
if (0 == outputArgument(fmt, printFloat, elemSize, &s[i])) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Print other elemnts with separator if available
|
||||
for (int e = 1; e < vectorSize; ++e) {
|
||||
const char* t = reinterpret_cast<const char*>(s);
|
||||
// Output the vector separator
|
||||
outputArgument(sepStr, false, ConstStr,
|
||||
reinterpret_cast<const uint32_t*>(Separator));
|
||||
|
||||
// Output the next element
|
||||
outputArgument(elementStr, printFloat, elemSize,
|
||||
reinterpret_cast<const uint32_t*>(&t[k + e * elemSize]));
|
||||
}
|
||||
i += (amd::alignUp(info.arguments_[j], sizeof(uint32_t)))
|
||||
/ sizeof(uint32_t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (pos != std::string::npos) {
|
||||
fmt = str.substr(pos, str.size() - pos);
|
||||
outputArgument(sepStr, false, ConstStr,
|
||||
reinterpret_cast<const uint32_t*>(fmt.data()));
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbg::clearWorkitems(VirtualGPU& gpu, size_t idxStart, size_t number) const
|
||||
{
|
||||
// Go through all locations for every thread and copy 1
|
||||
for (uint i = idxStart; i < idxStart + number; ++i) {
|
||||
amd::Coord3D dst(i * wiDbgSize(), 0, 0);
|
||||
amd::Coord3D size(sizeof(uint32_t), 0, 0);
|
||||
|
||||
// Copy 1 into the corresponding location in the debug buffer
|
||||
if (!initCntValue_.partialMemCopyTo(
|
||||
gpu, amd::Coord3D(0, 0, 0), dst, size, *dbgBuffer_)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
uint32_t*
|
||||
PrintfDbg::mapWorkitem(VirtualGPU& gpu, size_t idx, bool* realloc)
|
||||
{
|
||||
uint32_t wiSize = 0;
|
||||
amd::Coord3D src(idx * wiDbgSize(), 0, 0);
|
||||
xferBufRead_ = &(dev().xferRead().acquire());
|
||||
|
||||
// Copy workitem size from the corresponding location in the debug buffer
|
||||
if (!dbgBuffer_->partialMemCopyTo(gpu,
|
||||
src, amd::Coord3D(0, 0, 0), amd::Coord3D(sizeof(uint32_t), 0, 0),
|
||||
*xferBufRead_)) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Get memory pointer to the satged buffer
|
||||
uint32_t* workitem = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
|
||||
if (nullptr == workitem) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Copy size value
|
||||
wiSize = *workitem;
|
||||
xferBufRead_->unmap(&gpu);
|
||||
|
||||
// Check if the cuurent workitem almost reached the size limit
|
||||
if ((wiDbgSize() - static_cast<size_t>(wiSize)) < 3) {
|
||||
*realloc = true;
|
||||
}
|
||||
|
||||
// If the current workitem had any output then get the data
|
||||
if ((wiSize > 1) && (wiSize <= wiDbgSize())) {
|
||||
amd::Coord3D size(wiSize * sizeof(uint32_t), 0, 0);
|
||||
|
||||
// Copy the current workitem output data to the staged buffer
|
||||
if (!dbgBuffer_->partialMemCopyTo(
|
||||
gpu, src, amd::Coord3D(0, 0, 0), size, *xferBufRead_) ||
|
||||
// Clear the write pointer back to index 1 for the current workitem
|
||||
!clearWorkitems(gpu, idx, 1)) {
|
||||
LogError("Reading the workitem data failed!");
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Get a pointer to the workitem data
|
||||
uint32_t* workitem = reinterpret_cast<uint32_t*>
|
||||
(xferBufRead_->map(&gpu));
|
||||
|
||||
return workitem;
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void
|
||||
PrintfDbg::unmapWorkitem(VirtualGPU& gpu , const uint32_t* workitemData) const
|
||||
{
|
||||
if (nullptr != workitemData) {
|
||||
xferBufRead_->unmap(&gpu);
|
||||
}
|
||||
|
||||
dev().xferRead().release(gpu, *xferBufRead_);
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbgHSA::init(
|
||||
VirtualGPU& gpu,
|
||||
bool printfEnabled)
|
||||
{
|
||||
// Set up debug output buffer (if printf active)
|
||||
if (printfEnabled) {
|
||||
if (!allocate()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// The first two DWORDs in the printf buffer are as follows:
|
||||
// First DWORD = Offset to where next information is to
|
||||
// be written, initialized to 0
|
||||
// Second DWORD = Number of bytes available for printf data
|
||||
// = buffer size – 2*sizeof(uint32_t)
|
||||
const uint8_t initSize = 2*sizeof(uint32_t);
|
||||
uint8_t sysMem[initSize];
|
||||
memset(sysMem, 0, initSize);
|
||||
uint32_t dbgBufferSize = dbgBuffer_->size() - initSize;
|
||||
memcpy(&sysMem[4], &dbgBufferSize, sizeof(dbgBufferSize));
|
||||
|
||||
// Copy offset and number of bytes available for printf data
|
||||
// into the corresponding location in the debug buffer
|
||||
dbgBuffer_->writeRawData(gpu, initSize, sysMem, true);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
PrintfDbgHSA::output(
|
||||
VirtualGPU& gpu,
|
||||
bool printfEnabled,
|
||||
const std::vector<PrintfInfo>& printfInfo)
|
||||
{
|
||||
if (printfEnabled) {
|
||||
uint32_t offsetSize = 0;
|
||||
xferBufRead_ = &(dev().xferRead().acquire());
|
||||
|
||||
// Copy offset from the first DWORD in the debug buffer
|
||||
if (!dbgBuffer_->partialMemCopyTo(gpu,
|
||||
amd::Coord3D(0, 0, 0), amd::Coord3D(0, 0, 0),
|
||||
amd::Coord3D(sizeof(uint32_t), 0, 0),*xferBufRead_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get memory pointer to the satged buffer
|
||||
uint32_t* dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
|
||||
if (nullptr == dbgBufferPtr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
offsetSize = *dbgBufferPtr;
|
||||
xferBufRead_->unmap(&gpu);
|
||||
|
||||
if (offsetSize == 0) {
|
||||
LogError("\n The printf buffer is empty!");
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t bufSize = dev().xferRead().bufSize();
|
||||
size_t copySize = offsetSize;
|
||||
while (copySize != 0) {
|
||||
// Copy the buffer data (i.e., the printfID followed by the
|
||||
//argument data for each printf call in th kernel) to the staged buffer
|
||||
if (!dbgBuffer_->partialMemCopyTo(gpu,
|
||||
amd::Coord3D(2*sizeof(uint32_t) + offsetSize - copySize, 0, 0),
|
||||
amd::Coord3D(0, 0, 0),
|
||||
std::min(copySize, bufSize), *xferBufRead_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get a pointer to the buffer data
|
||||
dbgBufferPtr = reinterpret_cast<uint32_t*>(xferBufRead_->map(&gpu));
|
||||
if (nullptr == dbgBufferPtr) {
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<uint>::const_iterator ita;
|
||||
uint sb = 0;
|
||||
uint sbt = 0;
|
||||
|
||||
// parse the debug buffer
|
||||
while (sbt < copySize) {
|
||||
assert(((*dbgBufferPtr) < printfInfo.size()) &&
|
||||
"Cound't find the reported PrintfID!");
|
||||
const PrintfInfo& info = printfInfo[(*dbgBufferPtr)];
|
||||
sb += sizeof(uint32_t);
|
||||
for (ita = info.arguments_.begin();
|
||||
ita != info.arguments_.end(); ++ita){
|
||||
sb += *ita;
|
||||
}
|
||||
|
||||
if (sbt + sb > bufSize) {
|
||||
break; // Need new portion of data in staging buffer
|
||||
}
|
||||
|
||||
size_t idx = 1;
|
||||
// There's something in the debug buffer
|
||||
outputDbgBuffer(info, dbgBufferPtr, idx);
|
||||
|
||||
sbt += sb;
|
||||
dbgBufferPtr += sb/sizeof(uint32_t);
|
||||
sb = 0;
|
||||
}
|
||||
|
||||
copySize -= sbt;
|
||||
xferBufRead_->unmap(&gpu);
|
||||
}
|
||||
|
||||
dev().xferRead().release(gpu, *xferBufRead_);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,192 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALPRINTFDBG_HPP_
|
||||
#define PALPRINTFDBG_HPP_
|
||||
|
||||
#include "device/pal/palmemory.hpp"
|
||||
|
||||
/*! \addtogroup GPU GPU Device Implementation
|
||||
* @{
|
||||
*/
|
||||
#ifndef isinf
|
||||
#ifdef _MSC_VER
|
||||
// MSVC fallback: a value is infinite when the CRT reports it as neither finite nor NaN
#define isinf(X) (!_finite(X) && !_isnan(X))
|
||||
#endif //_MSC_VER
|
||||
#endif //isinf
|
||||
|
||||
#ifndef isnan
|
||||
#ifdef _MSC_VER
|
||||
// MSVC fallback: forward to the CRT's _isnan()
#define isnan(X) (_isnan(X))
|
||||
#endif //_MSC_VER
|
||||
#endif //isnan
|
||||
|
||||
#ifndef copysign
|
||||
#ifdef _MSC_VER
|
||||
// MSVC fallback: forward to the CRT's _copysign()
#define copysign(X,Y) (_copysign(X,Y))
|
||||
#endif //_MSC_VER
|
||||
#endif //copysign
|
||||
|
||||
//! GPU Device Implementation
|
||||
namespace pal {
|
||||
|
||||
//! Printf info structure
|
||||
//! Description of one printf() call
struct PrintfInfo {
    //! Format string of the printf() call
    std::string fmtString_;

    //! One entry per argument passed to the printf() call;
    //! the output code uses each entry as the argument's size in bytes
    std::vector<uint> arguments_;
};
|
||||
|
||||
class Kernel;
|
||||
class VirtualGPU;
|
||||
class Memory;
|
||||
|
||||
class PrintfDbg : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Debug buffer size per workitem
|
||||
static const uint WorkitemDebugSize = 4096;
|
||||
|
||||
//! Default constructor
|
||||
PrintfDbg(
|
||||
Device& device,
|
||||
FILE* file = NULL
|
||||
);
|
||||
|
||||
//! Destructor
|
||||
~PrintfDbg();
|
||||
|
||||
//! Creates the PrintfDbg object
|
||||
bool create();
|
||||
|
||||
//! Initializes the debug buffer before kernel's execution
|
||||
bool init(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const amd::NDRange& size //!< Kernel's workload
|
||||
);
|
||||
|
||||
//! Prints the kernel's debug informaiton from the buffer
|
||||
bool output(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const amd::NDRange& size, //!< Kernel's workload
|
||||
const std::vector<PrintfInfo>& printfInfo //!< printf info
|
||||
);
|
||||
|
||||
//! Debug buffer size per workitem
|
||||
size_t wiDbgSize() const { return wiDbgSize_; }
|
||||
|
||||
//! Returns debug buffer object
|
||||
Memory* dbgBuffer() const { return dbgBuffer_; }
|
||||
|
||||
protected:
|
||||
Memory* dbgBuffer_; //!< Buffer to hold debug output
|
||||
FILE* dbgFile_; //!< Debug file
|
||||
Device& gpuDevice_; //!< GPU device object
|
||||
Memory* xferBufRead_; //!< Transfer buffer for the dump read
|
||||
|
||||
//! Gets GPU device object
|
||||
Device& dev() const { return gpuDevice_; }
|
||||
|
||||
//! Allocates the debug buffer
|
||||
bool allocate(
|
||||
bool realloc = false //!< If TRUE then reallocate the debug memory
|
||||
);
|
||||
|
||||
//! Returns TRUE if a float value has to be printed
|
||||
bool checkFloat(
|
||||
const std::string& fmt //!< Format string
|
||||
) const;
|
||||
|
||||
//! Returns TRUE if a string value has to be printed
|
||||
bool checkString(
|
||||
const std::string& fmt //!< Format string
|
||||
) const;
|
||||
|
||||
//! Finds the specifier in the format string
|
||||
int checkVectorSpecifier(
|
||||
const std::string& fmt, //!< Format string
|
||||
size_t startPos, //!< Start position for processing
|
||||
size_t& curPos //!< End position for processing
|
||||
) const;
|
||||
|
||||
//! Outputs an argument
|
||||
size_t outputArgument(
|
||||
const std::string& fmt, //!< Format strint
|
||||
bool printFloat, //!< Argument is a float value
|
||||
size_t size, //!< Argument's size
|
||||
const uint32_t* argument //!< Argument's location
|
||||
) const;
|
||||
|
||||
//! Displays the PrintfDbg
|
||||
void outputDbgBuffer(
|
||||
const PrintfInfo& info, //!< printf info
|
||||
const uint32_t* workitemData, //!< The PrintfDbg dump buffer
|
||||
size_t& i //!< index to the data in the buffer
|
||||
) const;
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
PrintfDbg(const PrintfDbg&);
|
||||
|
||||
//! Disable assignment
|
||||
PrintfDbg& operator=(const PrintfDbg&);
|
||||
|
||||
//! Returns the pointer to the workitem data block
|
||||
bool clearWorkitems(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
size_t idxStart, //!< Workitem global index start
|
||||
size_t number //!< Number of workitems to clear
|
||||
) const;
|
||||
|
||||
//! Returns the pointer to the workitem data block
|
||||
uint32_t* mapWorkitem(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
size_t idx, //!< Workitem global index
|
||||
bool* realloc //!< Returns TRUE if workitem reached the buffer limit
|
||||
);
|
||||
|
||||
//! Unamp the staged buffer
|
||||
void unmapWorkitem(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
const uint32_t* workitemData //!< The PrintfDbg dump buffer
|
||||
) const;
|
||||
|
||||
size_t wiDbgSize_; //!< Workitem debug size
|
||||
Memory initCntValue_; //!< Initialized count value
|
||||
};
|
||||
class PrintfDbgHSA : public PrintfDbg
|
||||
{
|
||||
public:
|
||||
|
||||
//! Default constructor
|
||||
PrintfDbgHSA(
|
||||
Device& device,
|
||||
FILE* file = NULL
|
||||
): PrintfDbg(device, file) { }
|
||||
|
||||
//! Initializes the debug buffer before kernel's execution
|
||||
bool init(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled //!< checks for printf
|
||||
);
|
||||
|
||||
//! Prints the kernel's debug informaiton from the buffer
|
||||
bool output(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
bool printfEnabled, //!< checks for printf
|
||||
const std::vector<PrintfInfo>& printfInfo //!< printf info
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
PrintfDbgHSA(const PrintfDbgHSA&);
|
||||
|
||||
//! Disable assignment
|
||||
PrintfDbgHSA& operator=(const PrintfDbgHSA&);
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALPRINTFDBG_HPP_*/
|
||||
@@ -0,0 +1,925 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "os/os.hpp"
|
||||
#include "utils/flags.hpp"
|
||||
#include "include/aclTypes.h"
|
||||
#include "utils/amdilUtils.hpp"
|
||||
#include "utils/bif_section_labels.hpp"
|
||||
#include "device/pal/palprogram.hpp"
|
||||
#include "device/pal/palblit.hpp"
|
||||
#include "macrodata.h"
|
||||
#include "MDParser/AMDILMDInterface.h"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <cstdio>
|
||||
#include <algorithm>
|
||||
#include "utils/options.hpp"
|
||||
#include "hsa.h"
|
||||
#include "hsa_ext_image.h"
|
||||
#include "amd_hsa_loader.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Constructs a program for a real device (isNull_ is false)
HSAILProgram::HSAILProgram(Device& device)
    : Program(device),
      llvmBinary_(),
      binaryElf_(nullptr),
      rawBinary_(nullptr),
      kernels_(nullptr),
      maxScratchRegs_(0),
      isNull_(false),
      executable_(nullptr),
      loaderContext_(this)
{
    // Start from zeroed binary options and set only the fields used here
    memset(&binOpts_, 0, sizeof(binOpts_));
    binOpts_.struct_size = sizeof(binOpts_);

    // The binary uses the C runtime allocator
    binOpts_.alloc   = &::malloc;
    binOpts_.dealloc = &::free;

    // ELF class chosen by LP64_SWITCH, little-endian data encoding
    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
    binOpts_.bitness  = ELFDATA2LSB;

    // The loader works on top of this program's loader context
    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
}
|
||||
|
||||
//! Constructs a program for a null device (isNull_ is true)
HSAILProgram::HSAILProgram(NullDevice& device)
    : Program(device),
      llvmBinary_(),
      binaryElf_(nullptr),
      rawBinary_(nullptr),
      kernels_(nullptr),
      maxScratchRegs_(0),
      isNull_(true),
      executable_(nullptr),
      loaderContext_(this)
{
    // Start from zeroed binary options and set only the fields used here
    memset(&binOpts_, 0, sizeof(binOpts_));
    binOpts_.struct_size = sizeof(binOpts_);

    // The binary uses the C runtime allocator
    binOpts_.alloc   = &::malloc;
    binOpts_.dealloc = &::free;

    // ELF class chosen by LP64_SWITCH, little-endian data encoding
    binOpts_.elfclass = LP64_SWITCH(ELFCLASS32, ELFCLASS64);
    binOpts_.bitness  = ELFDATA2LSB;

    // The loader works on top of this program's loader context
    loader_ = amd::hsa::loader::Loader::Create(&loaderContext_);
}
|
||||
|
||||
//! Releases everything the program holds: static samplers, the raw binary,
//! the ACL binary, the CL binary, the loaded executable, the kernels and the loader
HSAILProgram::~HSAILProgram()
{
    // Destroy internal static samplers
    for (auto& sampler : staticSamplers_) {
        delete sampler;
    }

    if (nullptr != rawBinary_) {
        free(rawBinary_);
    }

    // Free the elf binary; a failure is only reported
    if (nullptr != binaryElf_) {
        const acl_error status = aclBinaryFini(binaryElf_);
        if (status != ACL_SUCCESS) {
            LogWarning( "Error while destroying the acl binary \n" );
        }
    }

    releaseClBinary();

    // The executable has to go before the loader that created it is destroyed below
    if (nullptr != executable_) {
        loader_->DestroyExecutable(executable_);
    }

    delete kernels_;
    amd::hsa::loader::Loader::Destroy(loader_);
}
|
||||
|
||||
bool
|
||||
HSAILProgram::initBuild(amd::option::Options *options)
|
||||
{
|
||||
if (!device::Program::initBuild(options)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const char* devName = dev().hwInfo()->machineTarget_;
|
||||
options->setPerBuildInfo(
|
||||
(devName && (devName[0] != '\0')) ? devName : "gpu",
|
||||
clBinary()->getEncryptCode(), true);
|
||||
|
||||
// Elf Binary setup
|
||||
std::string outFileName;
|
||||
|
||||
// true means fsail required
|
||||
clBinary()->init(options, true);
|
||||
if (options->isDumpFlagSet(amd::option::DUMP_BIF)) {
|
||||
outFileName = options->getDumpFileName(".bin");
|
||||
}
|
||||
|
||||
if (!clBinary()->setElfOut(LP64_SWITCH(ELFCLASS32, ELFCLASS64),
|
||||
(outFileName.size() > 0) ? outFileName.c_str() : nullptr)) {
|
||||
LogError("Setup elf out for gpu failed");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
HSAILProgram::finiBuild(bool isBuildGood)
|
||||
{
|
||||
clBinary()->resetElfOut();
|
||||
clBinary()->resetElfIn();
|
||||
|
||||
if (!isBuildGood) {
|
||||
// Prevent the encrypted binary form leaking out
|
||||
clBinary()->setBinary(nullptr, 0);
|
||||
}
|
||||
|
||||
return device::Program::finiBuild(isBuildGood);
|
||||
}
|
||||
|
||||
bool
|
||||
HSAILProgram::linkImpl(
|
||||
const std::vector<device::Program *> &inputPrograms,
|
||||
amd::option::Options *options,
|
||||
bool createLibrary)
|
||||
{
|
||||
std::vector<device::Program *>::const_iterator it
|
||||
= inputPrograms.begin();
|
||||
std::vector<device::Program *>::const_iterator itEnd
|
||||
= inputPrograms.end();
|
||||
acl_error errorCode;
|
||||
|
||||
// For each program we need to extract the LLVMIR and create
|
||||
// aclBinary for each
|
||||
std::vector<aclBinary *> binaries_to_link;
|
||||
|
||||
for (size_t i = 0; it != itEnd; ++it, ++i) {
|
||||
HSAILProgram *program = (HSAILProgram *)*it;
|
||||
// Check if the program was created with clCreateProgramWIthBinary
|
||||
binary_t binary = program->binary();
|
||||
if ((binary.first != nullptr) && (binary.second > 0)) {
|
||||
// Binary already exists -- we can also check if there is no
|
||||
// opencl source code
|
||||
// Need to check if LLVMIR exists in the binary
|
||||
// If LLVMIR does not exist then is it valid
|
||||
// We need to pull out all the compiled kernels
|
||||
// We cannot do this at present because we need at least
|
||||
// Hsail text to pull the kernels oout
|
||||
void *mem = const_cast<void *>(binary.first);
|
||||
binaryElf_ = aclReadFromMem(mem, binary.second, &errorCode);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
LogWarning("Error while linking : Could not read from raw binary");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// At this stage each HSAILProgram contains a valid binary_elf
|
||||
// Check if LLVMIR is in the binary
|
||||
// @TODO - Memory leak , cannot free this buffer
|
||||
// need to fix this.. File EPR on compiler library
|
||||
size_t llvmirSize = 0;
|
||||
const void *llvmirText = aclExtractSection(dev().compiler(),
|
||||
binaryElf_, &llvmirSize, aclLLVMIR, &errorCode);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
bool spirv = false;
|
||||
size_t boolSize = sizeof(bool);
|
||||
errorCode = aclQueryInfo(dev().compiler(), binaryElf_,
|
||||
RT_CONTAINS_SPIRV, nullptr, &spirv, &boolSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
spirv = false;
|
||||
}
|
||||
if (spirv) {
|
||||
errorCode = aclCompile(dev().compiler(), binaryElf_,
|
||||
options->origOptionStr.c_str(), ACL_TYPE_SPIRV_BINARY,
|
||||
ACL_TYPE_LLVMIR_BINARY, nullptr);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error while linking: Could not load SPIR-V" ;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
buildLog_ +="Error while linking : \
|
||||
Invalid binary (Missing LLVMIR section)" ;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Create a new aclBinary for each LLVMIR and save it in a list
|
||||
aclBIFVersion ver = aclBinaryVersion(binaryElf_);
|
||||
aclBinary *bin = aclCreateFromBinary(binaryElf_, ver);
|
||||
binaries_to_link.push_back(bin);
|
||||
}
|
||||
|
||||
errorCode = aclLink(dev().compiler(),
|
||||
binaries_to_link[0], binaries_to_link.size() - 1,
|
||||
binaries_to_link.size() > 1 ? &binaries_to_link[1] : NULL,
|
||||
ACL_TYPE_LLVMIR_BINARY, "-create-library", NULL);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
buildLog_ +="Error while linking : aclLink failed" ;
|
||||
return false;
|
||||
}
|
||||
// Store the newly linked aclBinary for this program.
|
||||
binaryElf_ = binaries_to_link[0];
|
||||
// Free all the other aclBinaries
|
||||
for (size_t i = 1; i < binaries_to_link.size(); i++) {
|
||||
aclBinaryFini(binaries_to_link[i]);
|
||||
}
|
||||
if (createLibrary) {
|
||||
size_t size = 0;
|
||||
void *mem = NULL;
|
||||
aclWriteToMem(binaryElf_, &mem, &size);
|
||||
setBinary(static_cast<char*>(mem), size);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
setType(TYPE_LIBRARY);
|
||||
return true;
|
||||
}
|
||||
// Now call linkImpl with the new options
|
||||
return linkImpl(options);
|
||||
}
|
||||
|
||||
//! Inspects binaryElf_ and works out which compilation stages it already contains.
//!
//! Each aclQueryInfo(RT_CONTAINS_*) probe that succeeds and reports "present" advances
//! the current stage; the stage that was current before the advance is appended to
//! \p completeStages, so the vector records the fallback chain in detection order.
//!
//! \param completeStages   [out] cleared first, then filled with the stage preceding
//!                         every detected one
//! \param needOptionsCheck [out] set to true, then cleared when the returned stage does
//!                         not call for comparing the options stored in the binary
//!                         against the current ones
//! \return the most advanced stage detected, or ACL_TYPE_DEFAULT if nothing was detected
//!         or the current compile/link options could not be parsed
aclType
HSAILProgram::getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck)
{
    completeStages.clear();
    needOptionsCheck = true;
    aclType from = ACL_TYPE_DEFAULT;
    // Shared in/out size argument for every boolean query below
    size_t boolSize = sizeof(bool);

    using QueryKind = decltype(RT_CONTAINS_SPIRV);

    // Runs one "does the binary contain X" query; a failed query counts as "absent"
    auto contains = [&](QueryKind what) -> bool {
        bool present = true;
        const acl_error status = aclQueryInfo(dev().compiler(), binaryElf_,
            what, nullptr, &present, &boolSize);
        return (status == ACL_SUCCESS) && present;
    };

    // Remembers the stage reached so far and moves on to the newly detected one
    auto advanceTo = [&](aclType stage) {
        completeStages.push_back(from);
        from = stage;
    };

    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
    if (contains(RT_CONTAINS_SPIRV)) {
        advanceTo(ACL_TYPE_SPIRV_BINARY);
    }
    if (contains(RT_CONTAINS_SPIR)) {
        advanceTo(ACL_TYPE_SPIR_BINARY);
    }

    // LLVMIR in the .llvmir section only counts together with the compile & link
    // options stored in the .comment section
    const bool containsLlvmirText = contains(RT_CONTAINS_LLVMIR);
    const bool containsOpts = contains(RT_CONTAINS_OPTIONS);
    if (containsLlvmirText && containsOpts) {
        advanceTo(ACL_TYPE_LLVMIR_BINARY);
    }

    // HSAIL text in the .cg section and the BRIG sections; BRIG takes precedence
    const bool containsHsailText = contains(RT_CONTAINS_HSAIL);
    const bool containsBrig = contains(RT_CONTAINS_BRIG);
    if (containsBrig) {
        advanceTo(ACL_TYPE_HSAIL_BINARY);
    } else if (containsHsailText) {
        advanceTo(ACL_TYPE_HSAIL_TEXT);
    }

    // Loader Map symbol from the CG section
    const bool containsLoaderMap = contains(RT_CONTAINS_LOADER_MAP);
    if (containsLoaderMap) {
        advanceTo(ACL_TYPE_CG);
    }

    // ISA in the .text section
    if (contains(RT_CONTAINS_ISA)) {
        advanceTo(ACL_TYPE_ISA);
    }

    std::string sCurOptions = compileOptions_ + linkOptions_;
    amd::option::Options curOptions;
    if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
        buildLog_ += curOptions.optionsLog();
        LogError("Parsing compile options failed.");
        return ACL_TYPE_DEFAULT;
    }

    // Options are only worth checking when the earlier stages that a recompilation
    // would restart from are present in the binary and kept by the current options
    const bool llvmirAvailable =
        curOptions.oVariables->BinLLVMIR && containsLlvmirText && containsOpts;
    const bool cgAvailable =
        curOptions.oVariables->BinCG && containsBrig && containsLoaderMap;

    switch (from) {
    // compile from HSAIL text, no matter prev. stages and options
    case ACL_TYPE_HSAIL_TEXT:
        needOptionsCheck = false;
        break;
    case ACL_TYPE_HSAIL_BINARY:
        // do not check options, if LLVMIR is absent or might be absent or options are absent
        if (!llvmirAvailable) {
            needOptionsCheck = false;
        }
        break;
    case ACL_TYPE_CG:
    case ACL_TYPE_ISA:
        // additionally skip the check if BRIG is absent or might be absent
        // or the LoaderMap is absent
        if (!llvmirAvailable || !cgAvailable) {
            needOptionsCheck = false;
        }
        break;
    // recompilation might be needed
    case ACL_TYPE_LLVMIR_BINARY:
    case ACL_TYPE_DEFAULT:
    default:
        break;
    }
    return from;
}
|
||||
|
||||
//! Determines the stage compilation has to continue from for a program that was
//! created from an existing binary.
//!
//! Reads the stored binary into binaryElf_, detects the stages it contains and, when
//! \p options is given and a check is required, compares the compile options stored in
//! the binary with the current ones. If they differ, the result falls back to the
//! latest front-end stage (SPIR-V, SPIR, LLVMIR) recorded for the binary, or to
//! ACL_TYPE_DEFAULT.
//!
//! \param options current build options; when null no options comparison is made
//! \return the stage to continue from, ACL_TYPE_DEFAULT when there is no binary,
//!         it cannot be read, or options cannot be parsed
aclType
HSAILProgram::getNextCompilationStageFromBinary(amd::option::Options* options) {
    binary_t image = this->binary();
    // Nothing to inspect unless the binary already exists
    if ((image.first == nullptr) || !(image.second > 0)) {
        return ACL_TYPE_DEFAULT;
    }

    void* rawImage = const_cast<void*>(image.first);
    acl_error errorCode;
    binaryElf_ = aclReadFromMem(rawImage, image.second, &errorCode);
    if (errorCode != ACL_SUCCESS) {
        buildLog_ += "Error: Reading the binary from memory failed.\n";
        return ACL_TYPE_DEFAULT;
    }

    // Work out the next stage from the sections found in binaryElf_;
    // no validity checks are made here
    std::vector<aclType> completeStages;
    bool needOptionsCheck = true;
    aclType nextStage = getCompilationStagesFromBinary(completeStages, needOptionsCheck);

    // Saving the binary in the interface class also loads
    // the compile & link options from the binary
    setBinary(static_cast<char*>(rawImage), image.second);

    if (!options || !needOptionsCheck) {
        return nextStage;
    }

    //! @todo Should we also check for ACL_TYPE_OPENCL & ACL_TYPE_LLVMIR_TEXT?
    const bool optionsMatter = (nextStage == ACL_TYPE_HSAIL_BINARY) ||
                               (nextStage == ACL_TYPE_CG) ||
                               (nextStage == ACL_TYPE_ISA);
    // If compile options are absent in the binary, do not compare and do not recompile
    if (!optionsMatter || compileOptions_.empty()) {
        return nextStage;
    }

    // Compare the options loaded from the binary with the current ones
    bool recompile = false;
    const oclBIFSymbolStruct* symbol = findBIF30SymStruct(symOpenclCompilerOptions);
    assert(symbol && "symbol not found");
    const std::string symName =
        std::string(symbol->str[bif::PRE]) + std::string(symbol->str[bif::POST]);
    size_t symSize = 0;
    const void* opts = aclExtractSymbol(dev().compiler(),
        binaryElf_, &symSize, aclCOMMENT, symName.c_str(), &errorCode);
    if (errorCode != ACL_SUCCESS) {
        // The stored options cannot be retrieved
        recompile = true;
    } else {
        std::string sBinOptions(static_cast<const char*>(opts), symSize);
        std::string sCurOptions = compileOptions_ + linkOptions_;
        amd::option::Options curOptions, binOptions;
        if (!amd::option::parseAllOptions(sBinOptions, binOptions)) {
            buildLog_ += binOptions.optionsLog();
            LogError("Parsing compile options from binary failed.");
            return ACL_TYPE_DEFAULT;
        }
        if (!amd::option::parseAllOptions(sCurOptions, curOptions)) {
            buildLog_ += curOptions.optionsLog();
            LogError("Parsing compile options failed.");
            return ACL_TYPE_DEFAULT;
        }
        recompile = !curOptions.equals(binOptions);
    }

    if (recompile) {
        // Walk the recorded stages from the latest to the earliest and stop at the
        // first one recompilation can restart from
        for (auto stage = completeStages.rbegin(); stage != completeStages.rend(); ++stage) {
            nextStage = *stage;
            if (nextStage == ACL_TYPE_SPIRV_BINARY ||
                nextStage == ACL_TYPE_LLVMIR_BINARY ||
                nextStage == ACL_TYPE_SPIR_BINARY ||
                nextStage == ACL_TYPE_DEFAULT) {
                break;
            }
        }
    }
    return nextStage;
}
|
||||
|
||||
//! Splits a NUL-terminated string into its whitespace-separated tokens.
//!
//! Tokens are extracted with the stream extraction operator, so any run of whitespace
//! (spaces, tabs, newlines) separates tokens and leading/trailing whitespace is ignored.
//!
//! \param str NUL-terminated input; a null pointer yields an empty result
//! \return the tokens in order of appearance
inline static std::vector<std::string>
splitSpaceSeparatedString(const char *str)
{
    // Constructing std::string/streams from a null pointer is undefined behaviour
    if (str == nullptr) {
        return std::vector<std::string>();
    }
    std::istringstream words(str);
    std::istream_iterator<std::string> first(words), last;
    return std::vector<std::string>(first, last);
}
|
||||
|
||||
bool
|
||||
HSAILProgram::linkImpl(amd::option::Options* options)
|
||||
{
|
||||
acl_error errorCode;
|
||||
aclType continueCompileFrom = ACL_TYPE_LLVMIR_BINARY;
|
||||
bool finalize = true;
|
||||
bool hsaLoad = true;
|
||||
// If !binaryElf_ then program must have been created using clCreateProgramWithBinary
|
||||
if (!binaryElf_) {
|
||||
continueCompileFrom = getNextCompilationStageFromBinary(options);
|
||||
}
|
||||
switch (continueCompileFrom) {
|
||||
case ACL_TYPE_SPIRV_BINARY:
|
||||
case ACL_TYPE_SPIR_BINARY:
|
||||
// Compilation from ACL_TYPE_LLVMIR_BINARY to ACL_TYPE_CG in cases:
|
||||
// 1. if the program is not created with binary;
|
||||
// 2. if the program is created with binary and contains only .llvmir & .comment
|
||||
// 3. if the program is created with binary, contains .llvmir, .comment, brig sections,
|
||||
// but the binary's compile & link options differ from current ones (recompilation);
|
||||
case ACL_TYPE_LLVMIR_BINARY:
|
||||
// Compilation from ACL_TYPE_HSAIL_BINARY to ACL_TYPE_CG in cases:
|
||||
// 1. if the program is created with binary and contains only brig sections
|
||||
case ACL_TYPE_HSAIL_BINARY:
|
||||
// Compilation from ACL_TYPE_HSAIL_TEXT to ACL_TYPE_CG in cases:
|
||||
// 1. if the program is created with binary and contains only hsail text
|
||||
case ACL_TYPE_HSAIL_TEXT: {
|
||||
std::string curOptions = options->origOptionStr + hsailOptions();
|
||||
errorCode = aclCompile(dev().compiler(), binaryElf_,
|
||||
curOptions.c_str(), continueCompileFrom, ACL_TYPE_CG, nullptr);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: BRIG code generation failed.\n";
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ACL_TYPE_CG:
|
||||
break;
|
||||
case ACL_TYPE_ISA:
|
||||
finalize = false;
|
||||
break;
|
||||
default:
|
||||
buildLog_ += "Error: The binary is incorrect or incomplete. Finalization to ISA couldn't be performed.\n";
|
||||
return false;
|
||||
}
|
||||
if (finalize) {
|
||||
std::string fin_options(options->origOptionStr + hsailOptions());
|
||||
// Append an option so that we can selectively enable a SCOption on CZ
|
||||
// whenever IOMMUv2 is enabled.
|
||||
if (dev().settings().svmFineGrainSystem_) {
|
||||
fin_options.append(" -sc-xnack-iommu");
|
||||
}
|
||||
errorCode = aclCompile(dev().compiler(), binaryElf_,
|
||||
fin_options.c_str(), ACL_TYPE_CG, ACL_TYPE_ISA, nullptr);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: BRIG finalization to ISA failed.\n";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// ACL_TYPE_CG stage is not performed for offline compilation
|
||||
hsa_agent_t agent;
|
||||
agent.handle = 1;
|
||||
if (!isNull() && hsaLoad) {
|
||||
executable_ = loader_->CreateExecutable(HSA_PROFILE_BASE, nullptr);
|
||||
if (executable_ == nullptr) {
|
||||
buildLog_ += "Error: Executable for AMD HSA Code Object isn't created.\n";
|
||||
return false;
|
||||
}
|
||||
size_t size = 0;
|
||||
hsa_code_object_t code_object;
|
||||
code_object.handle = reinterpret_cast<uint64_t>(aclExtractSection(dev().compiler(), binaryElf_, &size, aclTEXT, &errorCode));
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Extracting AMD HSA Code Object from binary failed.\n";
|
||||
return false;
|
||||
}
|
||||
hsa_status_t status = executable_->LoadCodeObject(agent, code_object, nullptr);
|
||||
if (status != HSA_STATUS_SUCCESS) {
|
||||
buildLog_ += "Error: AMD HSA Code Object loading failed.\n";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
size_t kernelNamesSize = 0;
|
||||
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, nullptr, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names size from the binary failed.\n";
|
||||
return false;
|
||||
}
|
||||
if (!isNull() && kernelNamesSize > 0) {
|
||||
char* kernelNames = new char[kernelNamesSize];
|
||||
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_KERNEL_NAMES, nullptr, kernelNames, &kernelNamesSize);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel names from the binary failed.\n";
|
||||
delete kernelNames;
|
||||
return false;
|
||||
}
|
||||
std::vector<std::string> vKernels = splitSpaceSeparatedString(kernelNames);
|
||||
delete kernelNames;
|
||||
std::vector<std::string>::iterator it = vKernels.begin();
|
||||
bool dynamicParallelism = false;
|
||||
aclMetadata md;
|
||||
md.numHiddenKernelArgs = 0;
|
||||
size_t sizeOfnumHiddenKernelArgs = sizeof(md.numHiddenKernelArgs);
|
||||
for (it; it != vKernels.end(); ++it) {
|
||||
std::string kernelName(*it);
|
||||
std::string openclKernelName = device::Kernel::openclMangledName(kernelName);
|
||||
errorCode = aclQueryInfo(dev().compiler(), binaryElf_, RT_NUM_KERNEL_HIDDEN_ARGS,
|
||||
openclKernelName.c_str(), &md.numHiddenKernelArgs, &sizeOfnumHiddenKernelArgs);
|
||||
if (errorCode != ACL_SUCCESS) {
|
||||
buildLog_ += "Error: Querying of kernel '" + openclKernelName +
|
||||
"' extra arguments count from AMD HSA Code Object failed. Kernel initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
HSAILKernel *aKernel = new HSAILKernel(kernelName, this, options->origOptionStr + hsailOptions(),
|
||||
md.numHiddenKernelArgs);
|
||||
kernels()[kernelName] = aKernel;
|
||||
amd::hsa::loader::Symbol *sym = executable_->GetSymbol("", openclKernelName.c_str(), agent, 0);
|
||||
if (!sym) {
|
||||
buildLog_ += "Error: Getting kernel ISA code symbol '" + openclKernelName +
|
||||
"' from AMD HSA Code Object failed. Kernel initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
if (!aKernel->init(sym, false)) {
|
||||
buildLog_ += "Error: Kernel '" + openclKernelName + "' initialization failed.\n";
|
||||
return false;
|
||||
}
|
||||
buildLog_ += aKernel->buildLog();
|
||||
aKernel->setUniformWorkGroupSize(options->oVariables->UniformWorkGroupSize);
|
||||
dynamicParallelism |= aKernel->dynamicParallelism();
|
||||
// Find max scratch regs used in the program. It's used for scratch buffer preallocation
|
||||
// with dynamic parallelism, since runtime doesn't know which child kernel will be called
|
||||
maxScratchRegs_ = std::max(static_cast<uint>(aKernel->workGroupInfo()->scratchRegs_), maxScratchRegs_);
|
||||
}
|
||||
// Allocate kernel table for device enqueuing
|
||||
if (!isNull() && dynamicParallelism && !allocKernelTable()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Save the binary in the interface class
|
||||
size_t size = 0;
|
||||
void *mem = nullptr;
|
||||
aclWriteToMem(binaryElf_, &mem, &size);
|
||||
setBinary(static_cast<char*>(mem), size);
|
||||
buildLog_ += aclGetCompilerLog(dev().compiler());
|
||||
setType(TYPE_EXECUTABLE);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
HSAILProgram::createBinary(amd::option::Options *options)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
HSAILProgram::initClBinary()
|
||||
{
|
||||
if (clBinary_ == nullptr) {
|
||||
clBinary_ = new ClBinaryHsa(static_cast<const Device &>(device()));
|
||||
if (clBinary_ == nullptr) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
HSAILProgram::releaseClBinary()
|
||||
{
|
||||
if (clBinary_ != nullptr) {
|
||||
delete clBinary_;
|
||||
clBinary_ = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
HSAILProgram::hsailOptions()
|
||||
{
|
||||
std::string hsailOptions;
|
||||
// Set options for the standard device specific options
|
||||
// All our devices support these options now
|
||||
if (dev().settings().reportFMAF_) {
|
||||
hsailOptions.append(" -DFP_FAST_FMAF=1");
|
||||
}
|
||||
if (dev().settings().reportFMA_) {
|
||||
hsailOptions.append(" -DFP_FAST_FMA=1");
|
||||
}
|
||||
if (!dev().settings().singleFpDenorm_) {
|
||||
hsailOptions.append(" -cl-denorms-are-zero");
|
||||
}
|
||||
|
||||
// Check if the host is 64 bit or 32 bit
|
||||
LP64_ONLY(hsailOptions.append(" -m64"));
|
||||
|
||||
// Append each extension supported by the device
|
||||
std::string token;
|
||||
std::istringstream iss("");
|
||||
iss.str(device().info().extensions_);
|
||||
while (getline(iss, token, ' ')) {
|
||||
if (!token.empty()) {
|
||||
hsailOptions.append(" -D");
|
||||
hsailOptions.append(token);
|
||||
hsailOptions.append("=1");
|
||||
}
|
||||
}
|
||||
return hsailOptions;
|
||||
}
|
||||
|
||||
bool
|
||||
HSAILProgram::allocKernelTable()
|
||||
{
|
||||
uint size = kernels().size() * sizeof(size_t);
|
||||
|
||||
kernels_ = new pal::Memory(dev(), size);
|
||||
// Initialize kernel table
|
||||
if ((kernels_ == nullptr) || !kernels_->create(Resource::RemoteUSWC)) {
|
||||
delete kernels_;
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
size_t* table = reinterpret_cast<size_t*>(
|
||||
kernels_->map(nullptr, pal::Resource::WriteOnly));
|
||||
for (auto& it : kernels()) {
|
||||
HSAILKernel* kernel = static_cast<HSAILKernel*>(it.second);
|
||||
table[kernel->index()] = static_cast<size_t>(
|
||||
kernel->gpuAqlCode()->vmAddress());
|
||||
}
|
||||
kernels_->unmap(nullptr);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
HSAILProgram::fillResListWithKernels(
|
||||
std::vector<const Memory*>& memList) const
|
||||
{
|
||||
for (auto& it : kernels()) {
|
||||
memList.push_back(
|
||||
static_cast<HSAILKernel*>(it.second)->gpuAqlCode());
|
||||
}
|
||||
}
|
||||
|
||||
//! Queries the compiler library for the target description and caches it in info_.
//!
//! The architecture is "hsail64" when the device uses 64-bit pointers and "hsail"
//! otherwise. An empty \p str selects the device's own target name.
//!
//! \param str target (chip) name, or an empty string for the current device
//! \return reference to the cached target info; a failed query only logs a warning
const aclTargetInfo &
HSAILProgram::info(const char * str) {
    const std::string arch = dev().settings().use64BitPtr_ ? "hsail64" : "hsail";

    // NOTE(review): a null str is forwarded to aclGetTargetInfo unchanged; only the
    // empty string is replaced by the device target name - confirm this is intended
    const char* target = str;
    if (str && str[0] == '\0') {
        target = dev().hwInfo()->targetName_;
    }

    acl_error err;
    info_ = aclGetTargetInfo(arch.c_str(), target, &err);
    if (err != ACL_SUCCESS) {
        LogWarning("aclGetTargetInfo failed");
    }
    return info_;
}
|
||||
|
||||
//! Translates an ISA name into the loader's ISA handle.
//!
//! \param name ISA name to look up; compared with strcmp, so it must not be null
//! \return handle holding the matching gfx_handle value, or a zero handle when the
//!         name is not one of the known ISAs
hsa_isa_t ORCAHSALoaderContext::IsaFromName(const char *name) {
    // Known ISA names and the handle value each one maps to
    const struct {
        const char* isaName;
        gfx_handle  handle;
    } knownIsas[] = {
        { Gfx700, gfx700 },
        { Gfx701, gfx701 },
        { Gfx800, gfx800 },
        { Gfx801, gfx801 },
        { Gfx804, gfx804 },
        { Gfx810, gfx810 },
        { Gfx900, gfx900 },
        { Gfx901, gfx901 },
    };

    hsa_isa_t isa = {0};
    for (const auto& candidate : knownIsas) {
        if (strcmp(candidate.isaName, name) == 0) {
            isa.handle = candidate.handle;
            break;
        }
    }
    return isa;
}
|
||||
|
||||
//! Tells whether code built for \p isa can be loaded on the program's device.
//!
//! The decision is based on the device's gfxip version and, for gfx8/gfx9, on the
//! ASIC revision reported by the device properties.
//!
//! \param agent unused; the device is taken from the owning program
//! \param isa   ISA handle as produced by IsaFromName()
//! \return true if the ISA is accepted for the device, false otherwise
bool ORCAHSALoaderContext::IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) {
    const auto requested = isa.handle;

    switch (program_->dev().hwInfo()->gfxipVersion_) {
    case gfx700:
    case gfx701:
    case gfx702:
        // gfx701 only differs from gfx700 by faster fp operations and can be loaded on either device.
        return (requested == gfx700) || (requested == gfx701);

    case gfx800:
        switch (program_->dev().properties().revision) {
        case Pal::AsicRevision::Iceland:
        case Pal::AsicRevision::Tonga:
            return requested == gfx800;
        case Pal::AsicRevision::Carrizo:
            return requested == gfx801;
        case Pal::AsicRevision::Fiji:
        case Pal::AsicRevision::Ellesmere:
        case Pal::AsicRevision::Baffin:
            // gfx800 ISA has only sgrps limited and can be loaded.
            // gfx801 ISA has XNACK limitations and can be loaded.
            return (requested == gfx800) || (requested == gfx801) || (requested == gfx804);
        case Pal::AsicRevision::Stoney:
            return requested == gfx810;
        default:
            assert(0);
            return false;
        }

    case gfx900:
        // No gfx9 revision is accepted yet; the Greenland case is still disabled
        switch (program_->dev().properties().revision) {
        case 0:
        /* case Pal::AsicRevision::Greenland:
            return isa.handle == gfx900 || isa.handle == gfx901;*/
        default:
            assert(0);
            return false;
        }

    default:
        LogError("Unsupported gfxip version");
        return false;
    }
}
|
||||
|
||||
//! Allocates backing storage for a code object segment.
//!
//! Global program, global agent and read-only agent segments go through
//! AgentGlobalAlloc(); the code segment goes through KernelCodeAlloc().
//!
//! \param segment segment kind
//! \param agent   forwarded to the allocation helper
//! \param size    size in bytes, asserted to be non-zero
//! \param align   alignment in bytes, asserted to be non-zero
//! \param zero    true to zero-fill the allocation
//! \return the helper's result, or null for an unknown segment kind
void* ORCAHSALoaderContext::SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
    hsa_agent_t agent, size_t size, size_t align, bool zero) {
    assert(size);
    assert(align);

    const bool isGlobalSegment = (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
                                 (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
                                 (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
    if (isGlobalSegment) {
        return AgentGlobalAlloc(agent, size, align, zero);
    }
    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        return KernelCodeAlloc(agent, size, align, zero);
    }

    // Unknown segment kind
    assert(false);
    return nullptr;
}
|
||||
|
||||
//! Copies \p size bytes from \p src into a segment at the given offset.
//!
//! Global program, global agent and read-only agent segments go through
//! AgentGlobalCopy(); the code segment goes through KernelCodeCopy().
//!
//! \param segment segment kind
//! \param agent   unused
//! \param dst     segment object as returned by SegmentAlloc()
//! \param offset  byte offset inside the segment
//! \param src     source data
//! \param size    number of bytes to copy
//! \return the helper's result, or false for an unknown segment kind
bool ORCAHSALoaderContext::SegmentCopy(amdgpu_hsa_elf_segment_t segment,
    hsa_agent_t agent, void* dst, size_t offset, const void* src, size_t size) {
    const bool isGlobalSegment = (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
                                 (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
                                 (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
    if (isGlobalSegment) {
        return AgentGlobalCopy(dst, offset, src, size);
    }
    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        return KernelCodeCopy(dst, offset, src, size);
    }

    // Unknown segment kind
    assert(false);
    return false;
}
|
||||
|
||||
//! Releases the storage of a segment obtained from SegmentAlloc().
//!
//! Global program, global agent and read-only agent segments go through
//! AgentGlobalFree(); the code segment goes through KernelCodeFree().
//!
//! \param segment segment kind
//! \param agent   unused
//! \param seg     segment object to release
//! \param size    forwarded to the free helper
void ORCAHSALoaderContext::SegmentFree(amdgpu_hsa_elf_segment_t segment,
    hsa_agent_t agent, void* seg, size_t size) {
    const bool isGlobalSegment = (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
                                 (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
                                 (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
    if (isGlobalSegment) {
        AgentGlobalFree(seg, size);
        return;
    }
    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        KernelCodeFree(seg, size);
        return;
    }

    // Unknown segment kind
    assert(false);
}
|
||||
|
||||
//! Returns the address of a location inside a segment.
//!
//! For global program, global agent and read-only agent segments \p seg is treated as
//! a pal::Memory object and the result is its vmAddress() plus \p offset. For the code
//! segment \p seg is treated as a plain byte pointer.
//!
//! \param segment segment kind
//! \param agent   unused
//! \param seg     segment object, asserted to be non-null
//! \param offset  byte offset inside the segment
//! \return the computed address, or null for an unknown segment kind
void* ORCAHSALoaderContext::SegmentAddress(amdgpu_hsa_elf_segment_t segment,
    hsa_agent_t agent, void* seg, size_t offset) {
    assert(seg);

    if (segment == AMDGPU_HSA_SEGMENT_CODE_AGENT) {
        return (char*) seg + offset;
    }

    const bool isGlobalSegment = (segment == AMDGPU_HSA_SEGMENT_GLOBAL_PROGRAM) ||
                                 (segment == AMDGPU_HSA_SEGMENT_GLOBAL_AGENT) ||
                                 (segment == AMDGPU_HSA_SEGMENT_READONLY_AGENT);
    if (isGlobalSegment) {
        pal::Memory* gpuMem = reinterpret_cast<pal::Memory*>(seg);
        return reinterpret_cast<void*>(gpuMem->vmAddress() + offset);
    }

    // Unknown segment kind
    assert(false);
    return nullptr;
}
|
||||
|
||||
//! Creates a device sampler from an HSA sampler descriptor.
//!
//! The descriptor's coordinate, filter and addressing modes are translated into
//! amd::Sampler state bits, a pal::Sampler is created from them and registered with
//! the owning program, and its hardware SRD is returned as the handle.
//!
//! \param agent              agent the sampler is created for; its handle must be non-zero
//! \param sampler_descriptor description of the sampler; must not be null
//! \param sampler_handle     [out] receives the sampler's hwSrd(); must not be null
//! \return HSA_STATUS_SUCCESS on success,
//!         HSA_STATUS_ERROR_INVALID_AGENT for a zero agent handle,
//!         HSA_STATUS_ERROR_INVALID_ARGUMENT for null pointers or unknown modes,
//!         HSA_STATUS_ERROR if the sampler could not be created
hsa_status_t ORCAHSALoaderContext::SamplerCreate(
    hsa_agent_t agent,
    const hsa_ext_sampler_descriptor_t *sampler_descriptor,
    hsa_ext_sampler_t *sampler_handle) {
    if (!agent.handle) {
        return HSA_STATUS_ERROR_INVALID_AGENT;
    }
    if (!sampler_descriptor || !sampler_handle) {
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    uint32_t state = 0;

    // Coordinate normalization
    switch (sampler_descriptor->coordinate_mode) {
    case HSA_EXT_SAMPLER_COORDINATE_MODE_UNNORMALIZED:
        state = amd::Sampler::StateNormalizedCoordsFalse;
        break;
    case HSA_EXT_SAMPLER_COORDINATE_MODE_NORMALIZED:
        state = amd::Sampler::StateNormalizedCoordsTrue;
        break;
    default:
        assert(false);
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    // Filtering
    switch (sampler_descriptor->filter_mode) {
    case HSA_EXT_SAMPLER_FILTER_MODE_NEAREST:
        state |= amd::Sampler::StateFilterNearest;
        break;
    case HSA_EXT_SAMPLER_FILTER_MODE_LINEAR:
        state |= amd::Sampler::StateFilterLinear;
        break;
    default:
        assert(false);
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    // Addressing
    switch (sampler_descriptor->address_mode) {
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_EDGE:
        state |= amd::Sampler::StateAddressClampToEdge;
        break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_CLAMP_TO_BORDER:
        state |= amd::Sampler::StateAddressClamp;
        break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_REPEAT:
        state |= amd::Sampler::StateAddressRepeat;
        break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_MIRRORED_REPEAT:
        state |= amd::Sampler::StateAddressMirroredRepeat;
        break;
    case HSA_EXT_SAMPLER_ADDRESSING_MODE_UNDEFINED:
        state |= amd::Sampler::StateAddressNone;
        break;
    default:
        assert(false);
        return HSA_STATUS_ERROR_INVALID_ARGUMENT;
    }

    assert(!program_->dev().settings().hsailDirectSRD_);

    pal::Sampler* sampler = new pal::Sampler(program_->dev());
    const bool created = (sampler != nullptr) && sampler->create(state);
    if (!created) {
        delete sampler;
        return HSA_STATUS_ERROR;
    }

    // The program keeps the sampler from here on
    program_->addSampler(sampler);
    sampler_handle->handle = sampler->hwSrd();
    return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
//! Validates the arguments of a sampler destruction request.
//!
//! Nothing is released here: samplers created by SamplerCreate() are registered
//! with the program via addSampler().
//!
//! \param agent          agent the sampler belongs to; its handle must be non-zero
//! \param sampler_handle sampler to destroy; its handle must be non-zero
//! \return HSA_STATUS_ERROR_INVALID_AGENT, HSA_STATUS_ERROR_INVALID_ARGUMENT or
//!         HSA_STATUS_SUCCESS
hsa_status_t ORCAHSALoaderContext::SamplerDestroy(
    hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) {
    if (agent.handle == 0) {
        return HSA_STATUS_ERROR_INVALID_AGENT;
    }
    return (sampler_handle.handle == 0) ? HSA_STATUS_ERROR_INVALID_ARGUMENT
                                        : HSA_STATUS_SUCCESS;
}
|
||||
|
||||
void* ORCAHSALoaderContext::CpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
assert(size);
|
||||
assert(align);
|
||||
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
|
||||
void* ptr = amd::Os::alignedMalloc(size, align);
|
||||
if (zero) {
|
||||
memset(ptr, 0, size);
|
||||
}
|
||||
return ptr;
|
||||
}
|
||||
|
||||
//! Copies \p size bytes from \p src to host memory at \p dst + \p offset.
//!
//! \param dst    destination base pointer
//! \param offset byte offset added to \p dst
//! \param src    source pointer
//! \param size   number of bytes to copy; zero is accepted and copies nothing
//! \return false if either pointer is null or both are the same, true otherwise
bool ORCAHSALoaderContext::CpuMemCopy(void *dst, size_t offset, const void* src, size_t size) {
    const bool validPointers = (dst != nullptr) && (src != nullptr) && (dst != src);
    if (!validPointers) {
        return false;
    }
    if (size != 0) {
        amd::Os::fastMemcpy((char*)dst + offset, src, size);
    }
    return true;
}
|
||||
|
||||
void* ORCAHSALoaderContext::GpuMemAlloc(size_t size, size_t align, bool zero) {
|
||||
assert(size);
|
||||
assert(align);
|
||||
assert(sizeof(void*) == 8 || sizeof(void*) == 4);
|
||||
pal::Memory* mem = new pal::Memory(program_->dev(), amd::alignUp(size, align));
|
||||
if (!mem || !mem->create(pal::Resource::Local)) {
|
||||
delete mem;
|
||||
return nullptr;
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
if (zero) {
|
||||
char pattern = 0;
|
||||
program_->dev().xferMgr().fillBuffer(*mem, &pattern, sizeof(pattern), amd::Coord3D(0), amd::Coord3D(size));
|
||||
}
|
||||
program_->addGlobalStore(mem);
|
||||
program_->setGlobalVariableTotalSize(program_->globalVariableTotalSize() + size);
|
||||
return mem;
|
||||
}
|
||||
|
||||
bool ORCAHSALoaderContext::GpuMemCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
if (!dst || !src || dst == src) {
|
||||
return false;
|
||||
}
|
||||
if (0 == size) {
|
||||
return true;
|
||||
}
|
||||
assert(program_->dev().xferQueue());
|
||||
pal::Memory* mem = reinterpret_cast<pal::Memory*>(dst);
|
||||
return program_->dev().xferMgr().writeBuffer(src, *mem, amd::Coord3D(offset), amd::Coord3D(size), true);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,292 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALPROGRAM_HPP_
|
||||
#define PALPROGRAM_HPP_
|
||||
|
||||
#include "device/pal/palkernel.hpp"
|
||||
#include "device/pal/palbinary.hpp"
|
||||
#include "amd_hsa_loader.hpp"
|
||||
|
||||
namespace amd {
|
||||
namespace option {
|
||||
class Options;
|
||||
} // option
|
||||
namespace hsa {
|
||||
namespace loader {
|
||||
class Loader;
|
||||
class Executable;
|
||||
class Context;
|
||||
} // loader
|
||||
} // hsa
|
||||
} // amd
|
||||
|
||||
//! \namespace pal PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
/*! \addtogroup pal PAL Device Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
using namespace amd::hsa::loader;
|
||||
class HSAILProgram;
|
||||
class ClBinaryHsa;
|
||||
|
||||
class ORCAHSALoaderContext final: public Context {
|
||||
public:
|
||||
ORCAHSALoaderContext(HSAILProgram* program): program_(program) {}
|
||||
|
||||
virtual ~ORCAHSALoaderContext() {}
|
||||
|
||||
hsa_isa_t IsaFromName(const char *name) override;
|
||||
|
||||
bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) override;
|
||||
|
||||
void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, size_t size, size_t align, bool zero) override;
|
||||
|
||||
bool SegmentCopy(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* dst, size_t offset,
|
||||
const void* src, size_t size) override;
|
||||
|
||||
void SegmentFree(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t size = 0) override;
|
||||
|
||||
void* SegmentAddress(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t offset) override;
|
||||
|
||||
bool SegmentFreeze(amdgpu_hsa_elf_segment_t segment,
|
||||
hsa_agent_t agent, void* seg, size_t size) override { return false; }
|
||||
|
||||
bool ImageExtensionSupported() override { return false; }
|
||||
|
||||
hsa_status_t ImageCreate(
|
||||
hsa_agent_t agent,
|
||||
hsa_access_permission_t image_permission,
|
||||
const hsa_ext_image_descriptor_t *image_descriptor,
|
||||
const void *image_data,
|
||||
hsa_ext_image_t *image_handle) override {
|
||||
// not supported
|
||||
assert(false);
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
hsa_status_t ImageDestroy(
|
||||
hsa_agent_t agent, hsa_ext_image_t image_handle) override {
|
||||
// not supported
|
||||
assert(false);
|
||||
return HSA_STATUS_ERROR;
|
||||
}
|
||||
|
||||
hsa_status_t SamplerCreate(
|
||||
hsa_agent_t agent,
|
||||
const hsa_ext_sampler_descriptor_t *sampler_descriptor,
|
||||
hsa_ext_sampler_t *sampler_handle) override;
|
||||
|
||||
//! All samplers are owned by HSAILProgram and are deleted in its destructor.
|
||||
hsa_status_t SamplerDestroy(
|
||||
hsa_agent_t agent, hsa_ext_sampler_t sampler_handle) override;
|
||||
|
||||
private:
|
||||
|
||||
void* AgentGlobalAlloc(
|
||||
hsa_agent_t agent, size_t size, size_t align, bool zero) {
|
||||
return GpuMemAlloc(size, align, zero);
|
||||
}
|
||||
|
||||
bool AgentGlobalCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
return GpuMemCopy(dst, offset, src, size);
|
||||
}
|
||||
|
||||
void AgentGlobalFree(void *ptr, size_t size) {
|
||||
GpuMemFree(ptr, size);
|
||||
}
|
||||
|
||||
void* KernelCodeAlloc(
|
||||
hsa_agent_t agent, size_t size, size_t align, bool zero) {
|
||||
return CpuMemAlloc(size, align, zero);
|
||||
}
|
||||
|
||||
bool KernelCodeCopy(void *dst, size_t offset, const void *src, size_t size) {
|
||||
return CpuMemCopy(dst, offset, src, size);
|
||||
}
|
||||
|
||||
void KernelCodeFree(void *ptr, size_t size) {
|
||||
CpuMemFree(ptr, size);
|
||||
}
|
||||
|
||||
void* CpuMemAlloc(size_t size, size_t align, bool zero);
|
||||
|
||||
bool CpuMemCopy(void *dst, size_t offset, const void* src, size_t size);
|
||||
|
||||
void CpuMemFree(void *ptr, size_t size) {
|
||||
amd::Os::alignedFree(ptr);
|
||||
}
|
||||
|
||||
void* GpuMemAlloc(size_t size, size_t align, bool zero);
|
||||
|
||||
bool GpuMemCopy(void *dst, size_t offset, const void *src, size_t size);
|
||||
|
||||
void GpuMemFree(void *ptr, size_t size = 0) {
|
||||
delete reinterpret_cast<pal::Memory*>(ptr);
|
||||
}
|
||||
|
||||
//! Private copy constructor (declaration only)
ORCAHSALoaderContext(const ORCAHSALoaderContext &c);

//! Private copy assignment (declaration only)
ORCAHSALoaderContext& operator=(const ORCAHSALoaderContext &c);
|
||||
|
||||
//! GFX handles; each enumerator's value equals the number in its name.
enum gfx_handle {
    gfx700 = 700,
    gfx701 = 701,
    gfx702 = 702,
    gfx800 = 800,
    gfx801 = 801,
    gfx804 = 804,
    gfx810 = 810,
    gfx900 = 900,
    gfx901 = 901
};
|
||||
|
||||
pal::HSAILProgram* program_;
|
||||
};
|
||||
|
||||
//! \class HSAIL program
|
||||
class HSAILProgram : public device::Program
|
||||
{
|
||||
friend class ClBinary;
|
||||
public:
|
||||
//! Default constructor
|
||||
HSAILProgram(Device& device);
|
||||
HSAILProgram(NullDevice& device);
|
||||
//! Default destructor
|
||||
~HSAILProgram();
|
||||
|
||||
//! Returns the aclBinary associated with the progrm
|
||||
aclBinary* binaryElf() const {
|
||||
return static_cast<aclBinary*>(binaryElf_); }
|
||||
|
||||
void addGlobalStore(Memory* mem) { globalStores_.push_back(mem); }
|
||||
|
||||
const std::vector<Memory*>& globalStores() const { return globalStores_; }
|
||||
|
||||
//! Return a typecasted GPU device
|
||||
pal::Device& dev()
|
||||
{ return const_cast<pal::Device&>(
|
||||
static_cast<const pal::Device&>(device())); }
|
||||
|
||||
//! Returns GPU kernel table
|
||||
const Memory* kernelTable() const { return kernels_; }
|
||||
|
||||
//! Adds all kernels to the mem handle lists
|
||||
void fillResListWithKernels(std::vector<const Memory*>& memList) const;
|
||||
|
||||
//! Returns the maximum number of scratch regs used in the program
|
||||
uint maxScratchRegs() const { return maxScratchRegs_; }
|
||||
|
||||
//! Add internal static sampler
|
||||
void addSampler(Sampler* sampler) { staticSamplers_.push_back(sampler); }
|
||||
|
||||
//! Returns TRUE if the program just compiled
|
||||
bool isNull() const { return isNull_; }
|
||||
|
||||
protected:
|
||||
//! pre-compile setup for GPU
|
||||
virtual bool initBuild(amd::option::Options* options);
|
||||
|
||||
//! post-compile setup for GPU
|
||||
virtual bool finiBuild(bool isBuildGood);
|
||||
|
||||
/*! \brief Compiles GPU CL program to LLVM binary (compiler frontend)
|
||||
*
|
||||
* \return True if we successefully compiled a GPU program
|
||||
*/
|
||||
virtual bool compileImpl(
|
||||
const std::string& sourceCode, //!< the program's source code
|
||||
const std::vector<const std::string*>& headers,
|
||||
const char** headerIncludeNames,
|
||||
amd::option::Options* options //!< compile options's object
|
||||
);
|
||||
|
||||
/* \brief Returns the next stage to compile from, based on sections in binary,
|
||||
* also returns completeStages in a vector, which contains at least ACL_TYPE_DEFAULT,
|
||||
* sets needOptionsCheck to true if options check is needed to decide whether or not to recompile
|
||||
*/
|
||||
aclType getCompilationStagesFromBinary(std::vector<aclType>& completeStages, bool& needOptionsCheck);
|
||||
|
||||
/* \brief Returns the next stage to compile from, based on sections and options in binary
|
||||
*/
|
||||
aclType getNextCompilationStageFromBinary(amd::option::Options* options);
|
||||
|
||||
/*! \brief Compiles LLVM binary to FSAIL code (compiler backend: link+opt+codegen)
|
||||
*
|
||||
* \return The build error code
|
||||
*/
|
||||
int compileBinaryToFSAIL(
|
||||
amd::option::Options* options //!< options for compilation
|
||||
);
|
||||
|
||||
virtual bool linkImpl(amd::option::Options* options);
|
||||
|
||||
//! Link the device programs.
|
||||
virtual bool linkImpl (const std::vector<device::Program*>& inputPrograms,
|
||||
amd::option::Options* options,
|
||||
bool createLibrary);
|
||||
|
||||
virtual bool createBinary(amd::option::Options* options);
|
||||
|
||||
//! Initialize Binary
|
||||
virtual bool initClBinary();
|
||||
|
||||
//! Release the Binary
|
||||
virtual void releaseClBinary();
|
||||
|
||||
virtual const aclTargetInfo & info(const char * str = "");
|
||||
|
||||
virtual bool isElf(const char* bin) const {
|
||||
return amd::isElfMagic(bin);
|
||||
//return false;
|
||||
}
|
||||
|
||||
//! Returns the binary
|
||||
// This should ensure that the binary is updated with all the kernels
|
||||
// ClBinary& clBinary() { return binary_; }
|
||||
ClBinaryHsa* clBinary() {
|
||||
return static_cast<ClBinaryHsa*>(device::Program::clBinary());
|
||||
}
|
||||
const ClBinaryHsa* clBinary() const {
|
||||
return static_cast<const ClBinaryHsa*>(device::Program::clBinary());
|
||||
}
|
||||
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
HSAILProgram(const HSAILProgram&);
|
||||
|
||||
//! Disable operator=
|
||||
HSAILProgram& operator=(const HSAILProgram&);
|
||||
|
||||
//! Returns all the options to be appended while passing to the
|
||||
//compiler library
|
||||
std::string hsailOptions();
|
||||
|
||||
//! Allocate kernel table
|
||||
bool allocKernelTable();
|
||||
|
||||
std::string openCLSource_; //!< Original OpenCL source
|
||||
std::string HSAILProgram_; //!< FSAIL program after compilation
|
||||
std::string llvmBinary_; //!< LLVM IR binary code
|
||||
aclBinary* binaryElf_; //!< Binary for the new compiler library
|
||||
void* rawBinary_; //!< Pointer to the raw binary
|
||||
aclBinaryOptions binOpts_; //!< Binary options to create aclBinary
|
||||
std::vector<Memory*> globalStores_; //!< Global memory for the program
|
||||
Memory* kernels_; //!< Table with kernel object pointers
|
||||
uint maxScratchRegs_; //!< Maximum number of scratch regs used in the program by individual kernel
|
||||
std::list<Sampler*> staticSamplers_; //!< List od internal static samplers
|
||||
bool isNull_; //!< Null program no memory allocations
|
||||
amd::hsa::loader::Loader* loader_; //!< Loader object
|
||||
amd::hsa::loader::Executable* executable_; //!< Executable for HSA Loader
|
||||
ORCAHSALoaderContext loaderContext_; //!< Context for HSA Loader
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALPROGRAM_HPP_*/
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -0,0 +1,508 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALRESOURCE_HPP_
|
||||
#define PALRESOURCE_HPP_
|
||||
|
||||
#include "platform/command.hpp"
|
||||
#include "platform/program.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
|
||||
//! \namespace pal PAL Resource Implementation
|
||||
namespace pal {
|
||||
|
||||
class Device;
|
||||
class VirtualGPU;
|
||||
|
||||
/*! \addtogroup PAL PAL Resource Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
class GpuMemoryReference : public amd::ReferenceCountedObject
|
||||
{
|
||||
public:
|
||||
static GpuMemoryReference* Create(
|
||||
const Device& dev,
|
||||
const Pal::GpuMemoryCreateInfo& createInfo);
|
||||
|
||||
static GpuMemoryReference* Create(
|
||||
const Device& dev,
|
||||
const void* sysMem,
|
||||
size_t memSize);
|
||||
|
||||
static GpuMemoryReference* Create(
|
||||
const Device& dev,
|
||||
const Pal::ExternalResourceOpenInfo& openInfo);
|
||||
|
||||
static GpuMemoryReference* Create(
|
||||
const Device& dev,
|
||||
const Pal::ExternalImageOpenInfo& openInfo,
|
||||
Pal::ImageCreateInfo* imgCreateInfo,
|
||||
Pal::IImage** image);
|
||||
|
||||
//! Default constructor
|
||||
GpuMemoryReference();
|
||||
|
||||
//! Get PAL memory object
|
||||
Pal::IGpuMemory* iMem() const { return gpuMem_; }
|
||||
|
||||
Pal::IGpuMemory* gpuMem_; //!< PAL GPU memory object
|
||||
void* cpuAddress_; //!< CPU address of this memory
|
||||
|
||||
protected:
|
||||
//! Default destructor
|
||||
~GpuMemoryReference();
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
GpuMemoryReference(const GpuMemoryReference&);
|
||||
|
||||
//! Disable operator=
|
||||
GpuMemoryReference& operator=(const GpuMemoryReference&);
|
||||
};
|
||||
|
||||
//! GPU resource
|
||||
class Resource : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
enum InteropType {
|
||||
InteropTypeless = 0,
|
||||
InteropVertexBuffer,
|
||||
InteropIndexBuffer,
|
||||
InteropRenderBuffer,
|
||||
InteropTexture,
|
||||
InteropTextureViewLevel,
|
||||
InteropTextureViewCube,
|
||||
InteropSurface
|
||||
};
|
||||
|
||||
struct CreateParams : public amd::StackObject {
|
||||
amd::Memory* owner_; //!< Resource's owner
|
||||
VirtualGPU* gpu_; //!< Resource won't be shared between multiple queues
|
||||
CreateParams(): owner_(NULL), gpu_(NULL) {}
|
||||
};
|
||||
|
||||
struct PinnedParams : public CreateParams {
|
||||
const amd::HostMemoryReference* hostMemRef_;//!< System memory pointer for pinning
|
||||
size_t size_; //!< System memory size
|
||||
};
|
||||
|
||||
struct ViewParams : public CreateParams {
|
||||
size_t offset_; //!< Alias resource offset
|
||||
size_t size_; //!< Alias resource size
|
||||
const Resource* resource_; //!< Parent resource for the view creation
|
||||
const void* memory_;
|
||||
};
|
||||
|
||||
struct ImageViewParams : public CreateParams {
|
||||
size_t level_; //!< Image mip level for a new view
|
||||
size_t layer_; //!< Image layer for a new view
|
||||
const Resource* resource_; //!< Parent resource for the view creation
|
||||
const void* memory_;
|
||||
};
|
||||
|
||||
struct ImageBufferParams : public CreateParams {
|
||||
const Resource* resource_; //!< Parent resource for the image creation
|
||||
const void* memory_;
|
||||
};
|
||||
|
||||
struct OGLInteropParams : public CreateParams {
|
||||
InteropType type_; //!< OGL resource type
|
||||
uint handle_; //!< OGL resource handle
|
||||
uint mipLevel_; //!< Texture mip level
|
||||
uint layer_; //!< Texture layer
|
||||
void* glPlatformContext_;
|
||||
void* glDeviceContext_;
|
||||
uint flags_;
|
||||
};
|
||||
|
||||
#ifdef _WIN32
|
||||
struct D3DInteropParams : public CreateParams {
|
||||
InteropType type_; //!< D3D resource type
|
||||
void* iDirect3D_; //!< D3D resource interface object
|
||||
void* handle_; //!< D3D resource handle
|
||||
uint mipLevel_; //!< Texture mip level
|
||||
int layer_; //!< Texture layer
|
||||
uint misc; //!< miscellaneous cases
|
||||
};
|
||||
#endif // _WIN32
|
||||
|
||||
//! Resource memory
|
||||
enum MemoryType
|
||||
{
|
||||
Empty = 0x0, //!< resource is empty
|
||||
Local, //!< resource in local memory
|
||||
Persistent, //!< resource in persistent memory
|
||||
Remote, //!< resource in nonlocal memory
|
||||
RemoteUSWC, //!< resource in nonlocal memory
|
||||
Pinned, //!< resource in pinned system memory
|
||||
View, //!< resource is an alias
|
||||
OGLInterop, //!< resource is an OGL memory object
|
||||
D3D10Interop, //!< resource is a D3D10 memory object
|
||||
D3D11Interop, //!< resource is a D3D11 memory object
|
||||
ImageView, //!< resource is a view to some image
|
||||
ImageBuffer, //!< resource is an image view of a buffer
|
||||
BusAddressable, //!< resource is a bus addressable memory
|
||||
ExternalPhysical, //!< resource is an external physical memory
|
||||
D3D9Interop, //!< resource is a D3D9 memory object
|
||||
Scratch, //!< resource is scratch memory
|
||||
Shader, //!< resource is a shader
|
||||
};
|
||||
|
||||
//! Resource map flags
|
||||
enum MapFlags
|
||||
{
|
||||
Discard = 0x00000001, //!< discard lock
|
||||
NoOverwrite = 0x00000002, //!< lock with no overwrite
|
||||
ReadOnly = 0x00000004, //!< lock for read only operation
|
||||
WriteOnly = 0x00000008, //!< lock for write only operation
|
||||
NoWait = 0x00000010, //!< lock with no wait
|
||||
};
|
||||
|
||||
//! Resource descriptor
|
||||
struct Descriptor : public amd::HeapObject
|
||||
{
|
||||
MemoryType type_; //!< Memory type
|
||||
size_t width_; //!< Resource width
|
||||
size_t height_; //!< Resource height
|
||||
size_t depth_; //!< Resource depth
|
||||
uint mipLevels_; //!< Number of mip levels
|
||||
uint flags_; //!< Resource flags, used in creation
|
||||
size_t pitch_; //!< Resource pitch, valid if locked
|
||||
size_t slice_; //!< Resource slice, valid if locked
|
||||
cl_image_format format_; //!< CL image format
|
||||
cl_mem_object_type topology_;//!< CL mem object type
|
||||
union {
|
||||
struct {
|
||||
uint dimSize_ : 2; //!< Dimension size
|
||||
uint cardMemory_ : 1; //!< GSL resource is in video memory
|
||||
uint imageArray_ : 1; //!< GSL resource is an array of images
|
||||
uint buffer_ : 1; //!< GSL resource is a buffer
|
||||
uint tiled_ : 1; //!< GSL resource is tiled
|
||||
uint SVMRes_ : 1; //!< SVM flag to the cal resource
|
||||
uint scratch_ : 1; //!< Scratch buffer
|
||||
uint isAllocExecute_ : 1; //!< SVM resource allocation attribute for shader\cmdbuf
|
||||
};
|
||||
uint state_;
|
||||
};
|
||||
};
|
||||
|
||||
//! Constructor of 1D Resource object
|
||||
Resource(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
size_t size //!< Resource size
|
||||
);
|
||||
|
||||
//! Constructor of Image Resource object
|
||||
Resource(
|
||||
const Device& gpuDev, //!< GPU device object
|
||||
size_t width, //!< resource width
|
||||
size_t height, //!< resource height
|
||||
size_t depth, //!< resource depth
|
||||
cl_image_format format, //!< resource format
|
||||
cl_mem_object_type imageType, //!< CL image type
|
||||
uint mipLevels = 1 //!< Number of mip levels
|
||||
);
|
||||
|
||||
//! Destructor of the resource
|
||||
virtual ~Resource();
|
||||
|
||||
/*! \brief Creates a CAL object, associated with the resource
|
||||
*
|
||||
* \return True if we succesfully created a CAL resource
|
||||
*/
|
||||
virtual bool create(
|
||||
MemoryType memType, //!< memory type
|
||||
CreateParams* params = 0 //!< special parameters for resource allocation
|
||||
);
|
||||
|
||||
/*! \brief Copies a subregion of memory from one resource to another
|
||||
*
|
||||
* This is a general copy from anything to anything (as long as it fits).
|
||||
* All positions and sizes are given in bytes. Note, however, that only
|
||||
* a subset of this general interface is currently implemented.
|
||||
*
|
||||
* \return true if successful
|
||||
*/
|
||||
bool partialMemCopyTo(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
const amd::Coord3D& srcOrigin, //!< Origin of the source region
|
||||
const amd::Coord3D& dstOrigin, //!< Origin of the destination region
|
||||
const amd::Coord3D& size, //!< Size of the region to copy
|
||||
Resource& dstResource, //!< Destination resource
|
||||
bool enableRectCopy = false, //!< Rectangular DMA support
|
||||
bool flushDMA = false, //!< Flush DMA if requested
|
||||
uint bytesPerElement = 1 //!< Bytes Per Element
|
||||
) const;
|
||||
|
||||
/*! \brief Copies size/4 DWORD of memory to a surface
|
||||
*
|
||||
* This is a raw copy to any surface using a CP packet.
|
||||
* Size needs to be atleast a DWORD or multiple
|
||||
*
|
||||
*/
|
||||
void writeRawData(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
size_t size, //!< Size in bytes of data to be copied(multiple of DWORDS)
|
||||
const void* data, //!< Data to be copied
|
||||
bool waitForEvent //!< Wait for event complete
|
||||
) const;
|
||||
|
||||
//! Returns the offset in GPU memory for aliases
|
||||
size_t offset() const { return offset_; }
|
||||
|
||||
//! Returns the pinned memory offset
|
||||
uint64_t pinOffset() const { return pinOffset_; }
|
||||
|
||||
//! Returns the GPU device that owns this resource
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
//! Returns the descriptor for resource
|
||||
const Descriptor& desc() const { return desc_; }
|
||||
|
||||
//! Returns the PAL memory object
|
||||
Pal::IGpuMemory* iMem() const { return memRef_->iMem(); }
|
||||
|
||||
//! Returns global memory offset
|
||||
uint64_t vmAddress() const { return iMem()->Desc().gpuVirtAddr + offset_; }
|
||||
|
||||
//! Returns global memory offset
|
||||
uint64_t vmSize() const { return iMem()->Desc().size - offset_; }
|
||||
|
||||
//! Returns global memory offset
|
||||
bool mipMapped() const { return (desc().mipLevels_ > 1) ? true : false; }
|
||||
|
||||
//! Checks if persistent memory can have a direct map
|
||||
bool isPersistentDirectMap() const;
|
||||
|
||||
/*! \brief Locks the resource and returns a physical pointer
|
||||
*
|
||||
* \note This operation stalls HW pipeline!
|
||||
*
|
||||
* \return Pointer to the physical memory
|
||||
*/
|
||||
void* map(
|
||||
VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
uint flags = 0, //!< flags for the map operation
|
||||
// Optimization for multilayer map/unmap
|
||||
uint startLayer = 0, //!< Start layer for multilayer map
|
||||
uint numLayers = 0 //!< End layer for multilayer map
|
||||
);
|
||||
|
||||
//! Unlocks the resource if it was locked
|
||||
void unmap(
|
||||
VirtualGPU* gpu //!< Virtual GPU device object
|
||||
);
|
||||
|
||||
//! Marks the resource as busy
|
||||
void setBusy(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
GpuEvent calEvent //!< CAL event
|
||||
) const;
|
||||
|
||||
//! Wait for the resource
|
||||
void wait(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
bool waitOnBusyEngine = false//!< Wait only if engine has changed
|
||||
) const;
|
||||
|
||||
//! Performs host write to the resource GPU memory
|
||||
bool hostWrite(
|
||||
VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
const void* hostPtr, //!< Host pointer to the SRC data
|
||||
const amd::Coord3D& origin, //!< Offsets for the update
|
||||
const amd::Coord3D& size, //!< The number of bytes to write
|
||||
uint flags = 0, //!< Map flags
|
||||
size_t rowPitch = 0, //!< Raw data row pitch
|
||||
size_t slicePitch = 0 //!< Raw data slice pitch
|
||||
);
|
||||
|
||||
//! Performs host read from the resource GPU memory
|
||||
bool hostRead(
|
||||
VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
void* hostPtr, //!< Host pointer to the DST data
|
||||
const amd::Coord3D& origin, //!< Offsets for the update
|
||||
const amd::Coord3D& size, //!< The number of bytes to write
|
||||
size_t rowPitch = 0, //!< Raw data row pitch
|
||||
size_t slicePitch = 0 //!< Raw data slice pitch
|
||||
);
|
||||
|
||||
//! Warms up the rename list for this resource
|
||||
void warmUpRenames(VirtualGPU& gpu);
|
||||
|
||||
//! Gets the resource element size
|
||||
uint elementSize() const { return elementSize_; }
|
||||
|
||||
//! Get the mapped address of this resource
|
||||
address data() const { return reinterpret_cast<address>(address_); }
|
||||
|
||||
//! Frees all allocated CAL memories and resources,
|
||||
//! associated with this objects. And also destroys all rename structures
|
||||
//! Note: doesn't destroy the object itself
|
||||
void free();
|
||||
|
||||
//! Return memory type
|
||||
MemoryType memoryType() const { return desc().type_; }
|
||||
|
||||
//! Retunrs true if memory type matches specified
|
||||
bool isMemoryType(MemoryType memType) const;
|
||||
|
||||
//! Returns TRUE if resource was allocated as cacheable
|
||||
bool isCacheable() const
|
||||
{ return (isMemoryType(Remote) || isMemoryType(Pinned)) ? true : false; }
|
||||
|
||||
bool gslGLAcquire() ;
|
||||
bool gslGLRelease() ;
|
||||
|
||||
//! Returns HW state for the resource (used for images only)
|
||||
const void* hwState() const { return hwState_; }
|
||||
|
||||
//! Returns CPU HW SRD for the resource (used for images only)
|
||||
uint64_t hwSrd() const { return hwSrd_; }
|
||||
|
||||
uint numComponents() const {
|
||||
return Pal::Formats::NumComponents(image_->GetImageCreateInfo().format.chFmt); }
|
||||
|
||||
protected:
|
||||
uint elementSize_; //!< Size of a single element in bytes
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Resource(const Resource&);
|
||||
|
||||
//! Disable operator=
|
||||
Resource& operator=(const Resource&);
|
||||
|
||||
typedef std::vector<GpuMemoryReference*> RenameList;
|
||||
|
||||
//! Rename current resource
|
||||
bool rename(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
bool force = false //!< Force renaming
|
||||
);
|
||||
|
||||
//! Sets the rename as active
|
||||
void setActiveRename(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
GpuMemoryReference* rename //!< new active rename
|
||||
);
|
||||
|
||||
//! Gets the active rename
|
||||
bool getActiveRename(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
GpuMemoryReference** rename //!< Saved active rename
|
||||
);
|
||||
|
||||
/*! \brief Locks the resource with layers and returns a physical pointer
|
||||
*
|
||||
* \return Pointer to the physical memory
|
||||
*/
|
||||
void* mapLayers(
|
||||
VirtualGPU* gpu, //!< Virtual GPU device object
|
||||
uint flags = 0 //!< flags for the map operation
|
||||
);
|
||||
|
||||
//! Unlocks the resource with layers if it was locked
|
||||
void unmapLayers(
|
||||
VirtualGPU* gpu //!< Virtual GPU device object
|
||||
);
|
||||
|
||||
//! Calls GSL to map a resource
|
||||
void* gpuMemoryMap(
|
||||
size_t* pitch, //!< Pitch value for the image
|
||||
uint flags, //!< Map flags
|
||||
Pal::IGpuMemory* resource //!< GSL memory object
|
||||
) const;
|
||||
|
||||
//! Uses GSL to unmap a resource
|
||||
void gpuMemoryUnmap(
|
||||
Pal::IGpuMemory* resource //!< GSL memory object
|
||||
) const;
|
||||
|
||||
//! Fress all GSL resources associated with OCL resource
|
||||
void gslFree() const;
|
||||
|
||||
//! Converts Resource memory type to the PAL heaps
|
||||
void memTypeToHeap(
|
||||
Pal::GpuMemoryCreateInfo* createInfo //!< Memory create info
|
||||
);
|
||||
|
||||
const Device& gpuDevice_; //!< GPU device
|
||||
Descriptor desc_; //!< Descriptor for this resource
|
||||
amd::Atomic<int> mapCount_; //!< Total number of maps
|
||||
void* address_; //!< Physical address of this resource
|
||||
size_t offset_; //!< Resource offset
|
||||
size_t curRename_; //!< Current active rename in the list
|
||||
RenameList renames_; //!< Rename resource list
|
||||
GpuMemoryReference* memRef_; //!< GSL resource reference
|
||||
const Resource* viewOwner_; //!< GPU resource, which owns this view
|
||||
uint64_t pinOffset_; //!< Pinned memory offset
|
||||
void* glInteropMbRes_;//!< Mb Res handle
|
||||
uint32_t glType_; //!< GL interop type
|
||||
void* glPlatformContext_;
|
||||
void* glDeviceContext_;
|
||||
|
||||
// Optimization for multilayer map/unmap
|
||||
uint startLayer_; //!< Start layer for map/unmapLayer
|
||||
uint numLayers_; //!< Number of layers for map/unmapLayer
|
||||
uint mapFlags_; //!< Map flags for map/umapLayer
|
||||
|
||||
//! @note: This field is necessary for the thread safe release only
|
||||
VirtualGPU* gpu_; //!< Resource will be used only on this queue
|
||||
Pal::IImage* image_; //!< PAL image object
|
||||
|
||||
uint32_t* hwState_; //!< HW state for image object
|
||||
uint64_t hwSrd_; //!< GPU pointer to HW SRD
|
||||
};
|
||||
|
||||
class ResourceCache : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Default constructor
|
||||
ResourceCache(size_t cacheSizeLimit)
|
||||
: lockCacheOps_("PAL resource cache", true)
|
||||
, cacheSize_(0)
|
||||
, cacheSizeLimit_(cacheSizeLimit)
|
||||
{}
|
||||
|
||||
//! Default destructor
|
||||
~ResourceCache();
|
||||
|
||||
//! Adds a CAL resource to the cache
|
||||
bool addGpuMemory(
|
||||
Resource::Descriptor* desc, //!< Resource descriptor - cache key
|
||||
GpuMemoryReference* ref //!< Resource reference
|
||||
);
|
||||
|
||||
//! Finds a CAL resource from the cache
|
||||
GpuMemoryReference* findGpuMemory(
|
||||
Resource::Descriptor* desc, //!< Resource descriptor - cache key
|
||||
Pal::gpusize size,
|
||||
Pal::gpusize alignment
|
||||
);
|
||||
|
||||
//! Destroys cache
|
||||
bool free(size_t minCacheEntries = 0);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
ResourceCache(const ResourceCache&);
|
||||
|
||||
//! Disable operator=
|
||||
ResourceCache& operator=(const ResourceCache&);
|
||||
|
||||
//! Removes one last entry from the cache
|
||||
void removeLast();
|
||||
|
||||
amd::Monitor lockCacheOps_; //!< Lock to serialise cache access
|
||||
|
||||
size_t cacheSize_; //!< Current cache size in bytes
|
||||
size_t cacheSizeLimit_; //!< Cache size limit in bytes
|
||||
|
||||
//! CAL resource cache
|
||||
std::list<std::pair<Resource::Descriptor*, GpuMemoryReference*> > resCache_;
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALRESOURCE_HPP_*/
|
||||
@@ -0,0 +1,78 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALSCHED_HPP_
|
||||
#define PALSCHED_HPP_
|
||||
|
||||
#include "hsa.h"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! AmdAqlWrap slot state
enum AqlWrapState {
    AQL_WRAP_FREE = 0,
    AQL_WRAP_RESERVED,
    AQL_WRAP_READY,
    AQL_WRAP_MARKER,
    AQL_WRAP_BUSY,
    AQL_WRAP_DONE
};
|
||||
|
||||
//! Header of the virtual queue used for device enqueue
struct AmdVQueueHeader {
    uint32_t    aql_slot_num;       //!< [LRO/SRO] The total number of the AQL slots (multiple of 64).
    uint32_t    event_slot_num;     //!< [LRO] The number of kernel events in the events buffer
    uint64_t    event_slot_mask;    //!< [LRO] A pointer to the allocation bitmask array for the events
    uint64_t    event_slots;        //!< [LRO] Pointer to a buffer for the events.
                                    // Array of event_slot_num entries of AmdEvent
    uint64_t    aql_slot_mask;      //!< [LRO/SRO] A pointer to the allocation bitmask for aql_wrap slots
    uint32_t    command_counter;    //!< [LRW] The global counter for the submitted commands into the queue
    uint32_t    wait_size;          //!< [LRO] The wait list size (in clk_event_t)
    uint32_t    arg_size;           //!< [LRO] The size of argument buffer (in bytes)
    uint32_t    mask_groups;        //!< Processed mask groups by one thread
    uint64_t    kernel_table;       //!< [LRO] Pointer to an array with all kernel objects (ulong for each entry)
    uint32_t    reserved[2];        //!< For the future usage
};
|
||||
|
||||
struct AmdAqlWrap {
|
||||
uint32_t state; //!< [LRW/SRW] The current state of the AQL wrapper: FREE, RESERVED, READY,
|
||||
// MARKER, BUSY and DONE. The block could be returned back to a free state.
|
||||
uint32_t enqueue_flags; //!< [LWO/SRO] Contains the flags for the kernel execution start
|
||||
uint32_t command_id; //!< [LWO/SRO] The unique command ID
|
||||
uint32_t child_counter; //!< [LRW/SRW] Counter that determine the launches of child kernels.
|
||||
// It’s incremented on the
|
||||
// start and decremented on the finish. The parent kernel can be considered as
|
||||
// done when the value is 0 and the state is DONE
|
||||
uint64_t completion; //!< [LWO/SRO] CL event for the current execution (clk_event_t)
|
||||
uint64_t parent_wrap; //!< [LWO/SRO] Pointer to the parent AQL wrapper (AmdAqlWrap*)
|
||||
uint64_t wait_list; //!< [LRO/SRO] Pointer to an array of clk_event_t objects (64 bytes default)
|
||||
uint32_t wait_num; //!< [LWO/SRO] The number of cl_event_wait objects
|
||||
uint32_t reserved[5]; //!< For the future usage
|
||||
hsa_kernel_dispatch_packet_t aql; //!< [LWO/SRO] AQL packet – 64 bytes AQL packet
|
||||
};
|
||||
|
||||
//! Device-side event record
struct AmdEvent {
    uint32_t    state;          //!< [LRO/SRW] Event state: START, END, COMPLETE
    uint32_t    counter;        //!< [LRW] Event retain/release counter. 0 means the event is free
    uint64_t    timer[3];       //!< [LRO/SWO] Timer values for profiling for each state
    uint64_t    captureInfo;    //!< [LRW/SRO] Profiling capture info for CLK_PROFILING_COMMAND_EXEC_TIME
};
|
||||
|
||||
//! Parameter block passed to the scheduler
struct SchedulerParam {
    uint32_t    signal;         //!< Signal to stop the child queue(address must be 16 bytes aligned)
    uint32_t    eng_clk;        //!< Engine clock in Mhz
    uint64_t    hw_queue;       //!< Address to HW queue
    uint64_t    hsa_queue;      //!< Address to HSA dummy queue
    uint32_t    useATC;         //!< GPU access to shader program by ATC.
    uint32_t    scratchSize;    //!< Scratch buffer size
    uint64_t    scratch;        //!< GPU address to the scratch buffer
    uint32_t    numMaxWaves;    //!< The max number of possible waves
    uint32_t    releaseHostCP;  //!< Releases CP on the host queue
    uint64_t    parentAQL;      //!< Host parent AmdAqlWrap packet
    uint32_t    dedicatedQueue; //!< Scheduler uses a dedicated queue
    uint32_t    scratchOffset;  //!< Scratch buffer offset
    uint32_t    reserved[2];    //!< Reserved
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,23 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
namespace pal {

// Stringifies its arguments, so the kernel below can be written as code
// instead of a quoted literal. The bare \n tokens become newline escapes
// in the resulting string.
#define SCHEDULER_KERNEL(...) #__VA_ARGS__

//! OpenCL source of the scheduler kernel: a thin wrapper that forwards its
//! three arguments to the external __amd_scheduler() function.
const char* SchedulerSourceCode = SCHEDULER_KERNEL(
\n
extern void __amd_scheduler(__global void *, __global void *, uint);
\n
__kernel void
scheduler(
    __global void * queue,
    __global void * params,
    uint paramIdx)
{
    __amd_scheduler(queue, params, paramIdx);
}
\n
);

} // namespace pal
|
||||
@@ -0,0 +1,433 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "top.hpp"
|
||||
#include "os/os.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palsettings.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace pal {
|
||||
|
||||
/*! \brief information for adjusting maximum workload time
 *
 *  This structure contains the time and OS minor version for max workload time
 *  adjustment for Windows 7 or 8.
 */
struct ModifyMaxWorkload
{
    uint32_t    time;           //!< max work load time (10x ms)
    uint32_t    minorVersion;   //!< OS minor version
};
|
||||
|
||||
|
||||
//! Sets every device setting to its default; several defaults are taken
//! from runtime flags (GPU_*, REMOTE_ALLOC, CL_KHR_FP64).
Settings::Settings()
{
    // Initialize the GPU device default settings
    oclVersion_         = OpenCL12;
    debugFlags_         = 0;
    singleHeap_         = false;
    syncObject_         = GPU_USE_SYNC_OBJECTS;
    remoteAlloc_        = REMOTE_ALLOC;

    stagedXferRead_     = true;
    stagedXferWrite_    = true;
    stagedXferSize_     = GPU_STAGING_BUFFER_SIZE * Ki;

    // We will enable staged read/write if we use local memory
    disablePersistent_  = false;

    // Staging of persistent writes follows the GPU_STAGING_WRITE_PERSISTENT flag
    stagingWritePersistent_ = GPU_STAGING_WRITE_PERSISTENT;

    maxRenames_         = 4;
    maxRenameSize_      = 4 * Mi;

    imageSupport_       = false;
    hwLDSSize_          = 0;

    // Set this to true when we drop the flag
    doublePrecision_    = ::CL_KHR_FP64;

    // Fill workgroup info size
    // @todo: revisit the 256 limitation on workgroup size
    maxWorkGroupSize_   = 256;

    hostMemDirectAccess_ = HostMemDisable;

    libSelector_        = amd::LibraryUndefined;

    // Enable workload split by default (for 24 bit arithmetic or timeout)
    workloadSplitSize_  = 1 << GPU_WORKLOAD_SPLIT;

    // By default use host blit
    blitEngine_         = BlitEngineHost;

    // The pinned transfer size flag is clamped to 32 before scaling by Mi
    const static size_t MaxPinnedXferSize = 32;
    pinnedXferSize_     = std::min(GPU_PINNED_XFER_SIZE, MaxPinnedXferSize) * Mi;
    // The minimum pinned transfer size never exceeds the pinned transfer size
    pinnedMinXferSize_  = std::min(GPU_PINNED_MIN_XFER_SIZE * Ki, pinnedXferSize_);

    // Disable FP_FAST_FMA defines by default
    reportFMAF_         = false;
    reportFMA_          = false;

    // GPU device by default
    apuSystem_          = false;

    // Disable 64 bit pointers support by default
    use64BitPtr_        = false;

    // Max alloc size is 16GB
    maxAllocSize_       = 16 * static_cast<uint64_t>(Gi);

    // Disable memory dependency tracking by default
    numMemDependencies_ = 0;

    // By default cache isn't present
    cacheLineSize_      = 0;
    cacheSize_          = 0;

    // Initialize transfer buffer size to 1MB by default
    xferBufSize_        = 1024 * Ki;

    // Use image DMA if requested
    imageDMA_           = GPU_IMAGE_DMA;

    // Disable ASIC specific features by default
    ciPlus_             = false;
    viPlus_             = false;
    aiPlus_             = false;

    // Number of compute rings.
    numComputeRings_    = 0;

    minWorkloadTime_    = 1;        // 0.1 ms
    maxWorkloadTime_    = 5000;     // 500 ms

    // Controls tiled images in persistent
    //!@note IOL for Linux doesn't setup tiling aperture in CMM/QS
    linearPersistentImage_ = false;

    useSingleScratch_   = GPU_USE_SINGLE_SCRATCH;

    // Device enqueuing settings
    numDeviceEvents_    = 1024;
    numWaitEvents_      = 8;

    // Disable HSAIL by default
    hsail_              = false;

    // Don't support platform atomics by default.
    svmAtomics_         = false;

    // Direct SRD usage follows the GPU_DIRECT_SRD flag
    hsailDirectSRD_     = GPU_DIRECT_SRD;

    // Device queue usage for device enqueuing follows the GPU_USE_DEVICE_QUEUE flag
    useDeviceQueue_     = GPU_USE_DEVICE_QUEUE;

    // Don't support Denormals for single precision by default
    singleFpDenorm_     = false;
}
|
||||
|
||||
bool
|
||||
Settings::create(
|
||||
const Pal::DeviceProperties& palProp,
|
||||
const Pal::GpuMemoryHeapProperties* heaps,
|
||||
bool reportAsOCL12Device
|
||||
)
|
||||
{
|
||||
// uint target = calAttr.target;
|
||||
uint32_t osVer = 0x0;
|
||||
|
||||
// Disable thread trace by default for all devices
|
||||
threadTraceEnable_ = false;
|
||||
bool doublePrecision = true;
|
||||
|
||||
if (doublePrecision) {
|
||||
// Report FP_FAST_FMA define if double precision HW
|
||||
reportFMA_ = true;
|
||||
// FMA is 1/4 speed on Pitcairn, Cape Verde, Devastator and Scrapper
|
||||
// Bonaire, Kalindi, Spectre and Spooky so disable
|
||||
// FP_FMA_FMAF for those parts in switch below
|
||||
reportFMAF_ = true;
|
||||
}
|
||||
|
||||
// Update GPU specific settings and info structure if we have any
|
||||
ModifyMaxWorkload modifyMaxWorkload = {0};
|
||||
|
||||
switch (palProp.revision) {
|
||||
/* case Pal::AsicRevision:::
|
||||
case CAL_TARGET_GREENLAND:
|
||||
//TODO: specific codes for AI
|
||||
aiPlus_ = true;*/
|
||||
// Fall through to VI ...
|
||||
case Pal::AsicRevision::Carrizo:
|
||||
case Pal::AsicRevision::Stoney:
|
||||
if (!aiPlus_) {
|
||||
// APU systems for VI
|
||||
apuSystem_ = true;
|
||||
}
|
||||
case Pal::AsicRevision::Iceland:
|
||||
case Pal::AsicRevision::Tonga:
|
||||
case Pal::AsicRevision::Fiji:
|
||||
case Pal::AsicRevision::Ellesmere:
|
||||
case Pal::AsicRevision::Baffin:
|
||||
// Disable tiling aperture on VI+
|
||||
linearPersistentImage_ = true;
|
||||
// Keep this false even though we have support
|
||||
// singleFpDenorm_ = true;
|
||||
viPlus_ = true;
|
||||
// Fall through to CI ...
|
||||
case Pal::AsicRevision::Kalindi:
|
||||
case Pal::AsicRevision::Spectre:
|
||||
if (!viPlus_) {
|
||||
// APU systems for CI
|
||||
apuSystem_ = true;
|
||||
// Fix BSOD/TDR issues observed on Kaveri Win7 (EPR#416903)
|
||||
modifyMaxWorkload.time = 2500; // 250ms
|
||||
modifyMaxWorkload.minorVersion = 1; // Win 7
|
||||
}
|
||||
// Fall through ...
|
||||
case Pal::AsicRevision::Bonaire:
|
||||
case Pal::AsicRevision::Hawaii:
|
||||
ciPlus_ = true;
|
||||
hsail_ = true;
|
||||
threadTraceEnable_ = AMD_THREAD_TRACE_ENABLE;
|
||||
reportFMAF_ = false;
|
||||
if (palProp.revision == Pal::AsicRevision::Hawaii) {
|
||||
reportFMAF_ = true;
|
||||
}
|
||||
// Cache line size is 64 bytes
|
||||
cacheLineSize_ = 64;
|
||||
// L1 cache size is 16KB
|
||||
cacheSize_ = 16 * Ki;
|
||||
|
||||
if (ciPlus_) {
|
||||
libSelector_ = amd::GPU_Library_CI;
|
||||
if (LP64_SWITCH(WINDOWS_SWITCH(viPlus_, false), true)) {
|
||||
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
|
||||
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
|
||||
}
|
||||
if (GPU_FORCE_OCL20_32BIT) {
|
||||
force32BitOcl20_ = true;
|
||||
oclVersion_ = !reportAsOCL12Device /*&& calAttr.isOpenCL200Device*/ ?
|
||||
XCONCAT(OpenCL, XCONCAT(OPENCL_MAJOR, OPENCL_MINOR)) : OpenCL12;
|
||||
}
|
||||
if (OPENCL_VERSION < 200) {
|
||||
oclVersion_ = OpenCL12;
|
||||
}
|
||||
numComputeRings_ = 8;
|
||||
}
|
||||
else {
|
||||
numComputeRings_ = 2;
|
||||
libSelector_ = amd::GPU_Library_SI;
|
||||
}
|
||||
|
||||
// This needs to be cleaned once 64bit addressing is stable
|
||||
if (oclVersion_ < OpenCL20) {
|
||||
use64BitPtr_ = flagIsDefault(GPU_FORCE_64BIT_PTR) ? LP64_SWITCH(false,
|
||||
/*calAttr.isWorkstation ||*/ hsail_) : GPU_FORCE_64BIT_PTR;
|
||||
}
|
||||
else {
|
||||
if (GPU_FORCE_64BIT_PTR || LP64_SWITCH(false, (hsail_
|
||||
|| (oclVersion_ >= OpenCL20)))) {
|
||||
use64BitPtr_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (oclVersion_ >= OpenCL20) {
|
||||
supportDepthsRGB_ = true;
|
||||
}
|
||||
if (use64BitPtr_) {
|
||||
if (GPU_ENABLE_LARGE_ALLOCATION /*&& calAttr.isWorkstation*/) {
|
||||
maxAllocSize_ = 64ULL * Gi;
|
||||
}
|
||||
else {
|
||||
maxAllocSize_ = 4048 * Mi;
|
||||
}
|
||||
}
|
||||
else {
|
||||
maxAllocSize_ = 3ULL * Gi;
|
||||
}
|
||||
|
||||
supportRA_ = false;
|
||||
partialDispatch_ = GPU_PARTIAL_DISPATCH;
|
||||
numMemDependencies_ = GPU_NUM_MEM_DEPENDENCY;
|
||||
break;
|
||||
default:
|
||||
assert(0 && "Unknown ASIC type!");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Enable atomics support
|
||||
enableExtension(ClKhrInt64BaseAtomics);
|
||||
enableExtension(ClKhrInt64ExtendedAtomics);
|
||||
enableExtension(ClKhrGlobalInt32BaseAtomics);
|
||||
enableExtension(ClKhrGlobalInt32ExtendedAtomics);
|
||||
enableExtension(ClKhrLocalInt32BaseAtomics);
|
||||
enableExtension(ClKhrLocalInt32ExtendedAtomics);
|
||||
enableExtension(ClKhrByteAddressableStore);
|
||||
enableExtension(ClKhrGlSharing);
|
||||
enableExtension(ClKhrGlEvent);
|
||||
enableExtension(ClAmdMediaOps);
|
||||
enableExtension(ClAmdMediaOps2);
|
||||
enableExtension(ClAmdPopcnt);
|
||||
enableExtension(ClKhr3DImageWrites);
|
||||
enableExtension(ClAmdVec3);
|
||||
enableExtension(ClAmdPrintf);
|
||||
enableExtension(ClKhrImage2dFromBuffer);
|
||||
|
||||
hwLDSSize_ = 32 * Ki;
|
||||
|
||||
imageSupport_ = true;
|
||||
singleHeap_ = true;
|
||||
|
||||
// Use kernels for blit if appropriate
|
||||
blitEngine_ = BlitEngineKernel;
|
||||
|
||||
hostMemDirectAccess_ |= HostMemBuffer;
|
||||
// HW doesn't support untiled image writes
|
||||
// hostMemDirectAccess_ |= HostMemImage;
|
||||
|
||||
// Make sure device actually supports double precision
|
||||
doublePrecision_ = (doublePrecision) ? doublePrecision_ : false;
|
||||
if (doublePrecision_) {
|
||||
// Enable KHR double precision extension
|
||||
enableExtension(ClKhrFp64);
|
||||
}
|
||||
|
||||
if (doublePrecision) {
|
||||
// Enable AMD double precision extension
|
||||
doublePrecision_ = true;
|
||||
enableExtension(ClAmdFp64);
|
||||
}
|
||||
|
||||
//! @todo
|
||||
/*
|
||||
if (calAttr.totalSDIHeap > 0) {
|
||||
//Enable bus addressable memory extension
|
||||
enableExtension(ClAMDBusAddressableMemory);
|
||||
}
|
||||
|
||||
if (calAttr.longIdleDetect) {
|
||||
// KMD is unable to detect if we map the visible memory for CPU access, so
|
||||
// accessing persistent staged buffer may fail if LongIdleDetct is enabled.
|
||||
disablePersistent_ = true;
|
||||
}
|
||||
|
||||
svmFineGrainSystem_ = calAttr.isSVMFineGrainSystem;
|
||||
|
||||
svmAtomics_ = (calAttr.svmAtomics || calAttr.isSVMFineGrainSystem) ? true : false;
|
||||
*/
|
||||
// Enable some platform extensions
|
||||
enableExtension(ClAmdDeviceAttributeQuery);
|
||||
|
||||
enableExtension(ClKhrSpir);
|
||||
|
||||
// SVM is not currently supported for DX Interop
|
||||
#if defined(_WIN32)
|
||||
enableExtension(ClKhrD3d9Sharing);
|
||||
enableExtension(ClKhrD3d10Sharing);
|
||||
enableExtension(ClKhrD3d11Sharing);
|
||||
#endif // _WIN32
|
||||
|
||||
// Enable some OpenCL 2.0 extensions
|
||||
if (oclVersion_ >= OpenCL20) {
|
||||
enableExtension(ClKhrGLDepthImages);
|
||||
enableExtension(ClKhrSubGroups);
|
||||
enableExtension(ClKhrDepthImages);
|
||||
|
||||
if (GPU_MIPMAP) {
|
||||
enableExtension(ClKhrMipMapImage);
|
||||
enableExtension(ClKhrMipMapImageWrites);
|
||||
}
|
||||
|
||||
// Enable HW debug
|
||||
if (GPU_ENABLE_HW_DEBUG) {
|
||||
enableHwDebug_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (apuSystem_ &&
|
||||
((heaps[Pal::GpuHeapLocal].heapSize + heaps[Pal::GpuHeapInvisible].heapSize) < (150*Mi))) {
|
||||
remoteAlloc_ = true;
|
||||
}
|
||||
|
||||
// Save resource cache size
|
||||
#ifdef ATI_OS_LINUX
|
||||
// Due to EPR#406216, set the default value for Linux for now
|
||||
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
|
||||
#else
|
||||
if (remoteAlloc_) {
|
||||
resourceCacheSize_ = std::max((heaps[Pal::GpuHeapGartUswc].heapSize / 8),
|
||||
GPU_RESOURCE_CACHE_SIZE * Mi);
|
||||
}
|
||||
else {
|
||||
resourceCacheSize_ = std::max(((heaps[Pal::GpuHeapLocal].heapSize +
|
||||
heaps[Pal::GpuHeapInvisible].heapSize) / 8),
|
||||
GPU_RESOURCE_CACHE_SIZE * Mi);
|
||||
}
|
||||
resourceCacheSize_ = std::min(resourceCacheSize_, 512 * Mi);
|
||||
#endif
|
||||
|
||||
// Override current device settings
|
||||
override();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void
|
||||
Settings::override()
|
||||
{
|
||||
// Limit reported workgroup size
|
||||
if (GPU_MAX_WORKGROUP_SIZE != 0) {
|
||||
maxWorkGroupSize_ = GPU_MAX_WORKGROUP_SIZE;
|
||||
}
|
||||
|
||||
// Override blit engine type
|
||||
if (GPU_BLIT_ENGINE_TYPE != BlitEngineDefault) {
|
||||
blitEngine_ = GPU_BLIT_ENGINE_TYPE;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
|
||||
debugFlags_ = DEBUG_GPU_FLAGS;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(DEBUG_GPU_FLAGS)) {
|
||||
debugFlags_ = DEBUG_GPU_FLAGS;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_XFER_BUFFER_SIZE)) {
|
||||
xferBufSize_ = GPU_XFER_BUFFER_SIZE * Ki;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_USE_SYNC_OBJECTS)) {
|
||||
syncObject_ = GPU_USE_SYNC_OBJECTS;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_NUM_COMPUTE_RINGS)) {
|
||||
numComputeRings_ = GPU_NUM_COMPUTE_RINGS;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(GPU_RESOURCE_CACHE_SIZE)) {
|
||||
resourceCacheSize_ = GPU_RESOURCE_CACHE_SIZE * Mi;
|
||||
}
|
||||
|
||||
if (!flagIsDefault(AMD_GPU_FORCE_SINGLE_FP_DENORM)) {
|
||||
switch (AMD_GPU_FORCE_SINGLE_FP_DENORM) {
|
||||
case 0:
|
||||
singleFpDenorm_ = false;
|
||||
break;
|
||||
case 1:
|
||||
singleFpDenorm_ = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,128 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALSETTINGS_HPP_
|
||||
#define PALSETTINGS_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "library.hpp"
|
||||
// Forward slashes: a backslash inside an #include path is not portable
#include "inc/core/palDevice.h"
|
||||
|
||||
/*! \addtogroup pal PAL Resource Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
//! Device settings
|
||||
class Settings : public device::Settings
|
||||
{
|
||||
public:
|
||||
//! Debug GPU flags
|
||||
enum DebugGpuFlags
|
||||
{
|
||||
CheckForILSource = 0x00000001,
|
||||
StubCLPrograms = 0x00000002, //!< Enables OpenCL programs stubbing
|
||||
LockGlobalMemory = 0x00000004,
|
||||
};
|
||||
|
||||
enum BlitEngineType
|
||||
{
|
||||
BlitEngineDefault = 0x00000000,
|
||||
BlitEngineHost = 0x00000001,
|
||||
BlitEngineCAL = 0x00000002,
|
||||
BlitEngineKernel = 0x00000003,
|
||||
};
|
||||
|
||||
enum HostMemFlags
|
||||
{
|
||||
HostMemDisable = 0x00000000,
|
||||
HostMemBuffer = 0x00000001,
|
||||
HostMemImage = 0x00000002,
|
||||
};
|
||||
|
||||
union {
|
||||
struct {
|
||||
uint singleHeap_: 1; //!< Device will use a preallocated heap
|
||||
uint remoteAlloc_: 1; //!< Allocate remote memory for the heap
|
||||
uint stagedXferRead_: 1; //!< Uses a staged buffer read
|
||||
uint stagedXferWrite_: 1; //!< Uses a staged buffer write
|
||||
uint disablePersistent_: 1; //!< Disables using persistent memory for staging
|
||||
uint imageSupport_: 1; //!< Report images support
|
||||
uint doublePrecision_: 1; //!< Enables double precision support
|
||||
uint reportFMAF_: 1; //!< Report FP_FAST_FMAF define in CL program
|
||||
uint reportFMA_: 1; //!< Report FP_FAST_FMA define in CL program
|
||||
uint use64BitPtr_: 1; //!< Use 64bit pointers on GPU
|
||||
uint force32BitOcl20_: 1; //!< Force 32bit apps to take CLANG/HSAIL path on GPU
|
||||
uint imageDMA_: 1; //!< Enable direct image DMA transfers
|
||||
uint syncObject_: 1; //!< Enable syncobject
|
||||
uint ciPlus_: 1; //!< CI and post CI features
|
||||
uint viPlus_: 1; //!< VI and post VI features
|
||||
uint aiPlus_: 1; //!< AI and post AI features
|
||||
uint threadTraceEnable_: 1; //!< Thread trace enable
|
||||
uint linearPersistentImage_: 1; //!< Allocates linear images in persistent
|
||||
uint useSingleScratch_: 1; //!< Allocates single scratch per device
|
||||
uint hsail_: 1; //!< Enables HSAIL compilation
|
||||
uint stagingWritePersistent_: 1; //!< Enables persistent writes
|
||||
uint svmAtomics_: 1; //!< SVM device atomics
|
||||
uint svmFineGrainSystem_: 1; //!< SVM fine grain system support
|
||||
uint apuSystem_: 1; //!< Device is APU system with shared memory
|
||||
uint hsailDirectSRD_: 1; //!< Controls direct SRD for HSAIL
|
||||
uint useDeviceQueue_: 1; //!< Submit to separate device queue
|
||||
uint singleFpDenorm_: 1; //!< Support Single FP Denorm
|
||||
uint reserved_: 5;
|
||||
};
|
||||
uint value_;
|
||||
};
|
||||
|
||||
uint oclVersion_; //!< Reported OpenCL version support
|
||||
uint debugFlags_; //!< Debug GPU flags
|
||||
size_t stagedXferSize_; //!< Staged buffer size
|
||||
uint maxRenames_; //!< Maximum number of possible renames
|
||||
uint maxRenameSize_; //!< Maximum size for all renames
|
||||
uint hwLDSSize_; //!< HW local data store size
|
||||
uint maxWorkGroupSize_; //!< Requested workgroup size for this device
|
||||
uint hostMemDirectAccess_; //!< Enables direct access to the host memory
|
||||
amd::LibrarySelector libSelector_; //!< Select linking libraries for compiler
|
||||
uint workloadSplitSize_; //!< Workload split size
|
||||
uint minWorkloadTime_; //!< Minimal workload time in 0.1 ms
|
||||
uint maxWorkloadTime_; //!< Maximum workload time in 0.1 ms
|
||||
uint blitEngine_; //!< Blit engine type
|
||||
size_t pinnedXferSize_; //!< Pinned buffer size for transfer
|
||||
size_t pinnedMinXferSize_; //!< Minimal buffer size for pinned transfer
|
||||
size_t resourceCacheSize_; //!< Resource cache size in MB
|
||||
uint64_t maxAllocSize_; //!< Maximum single allocation size
|
||||
size_t numMemDependencies_;//!< The array size for memory dependencies tracking
|
||||
uint cacheLineSize_; //!< Cache line size in bytes
|
||||
uint cacheSize_; //!< L1 cache size in bytes
|
||||
size_t xferBufSize_; //!< Transfer buffer size for image copy optimization
|
||||
uint numComputeRings_; //!< 0 - disabled, 1 , 2,.. - the number of compute rings
|
||||
uint numDeviceEvents_; //!< The number of device events
|
||||
uint numWaitEvents_; //!< The number of wait events for device enqueue
|
||||
|
||||
|
||||
//! Default constructor
|
||||
Settings();
|
||||
|
||||
//! Creates settings
|
||||
bool create(
|
||||
const Pal::DeviceProperties& palProp, //!< PAL device properties
|
||||
const Pal::GpuMemoryHeapProperties* heaps, //!< PAL heap settings
|
||||
bool reportAsOCL12Device = false //!< Report As OpenCL1.2 Device
|
||||
);
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
Settings(const Settings&);
|
||||
|
||||
//! Disable assignment
|
||||
Settings& operator=(const Settings&);
|
||||
|
||||
//! Overrides current settings based on registry/environment
|
||||
void override();
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALSETTINGS_HPP_*/
|
||||
@@ -0,0 +1,67 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "device/pal/palthreadtrace.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Destroys the reference while holding the execution lock of its queue.
CalThreadTraceReference::~CalThreadTraceReference()
{
    // A thread trace object belongs to exactly one queue, hence only
    // that queue has to be locked
    amd::ScopedLock lock(gpu_.execution());

    if (threadTrace_ != 0) {
        // Nothing to release yet. The original GSL call was:
        //gpu().cs()->destroyQuery(gslThreadTrace());
    }
}
|
||||
|
||||
//! Releases the resources of the thread trace object.
//!
//! The buffer layout array is allocated with new[] in the constructor, so it
//! has to be freed here even if create() was never called for this object.
ThreadTrace::~ThreadTrace()
{
    if (calRef_ != nullptr) {
        Unimplemented();
        for (uint i = 0; i < amdThreadTraceMemObjsNum_; ++i) {
            // threadTraceBufferObjs_[i]->attachMemObject(gpu().cs(), nullptr, 0, 0, 0, i);
            // gpu().cs()->destroyShaderTraceBuffer(threadTraceBufferObjs_[i]);
        }

        // Release the thread trace reference object
        //calRef_->release();
    }

    // Previously leaked: nothing ever freed the array from the constructor
    delete [] threadTraceBufferObjs_;
    threadTraceBufferObjs_ = nullptr;
}
|
||||
|
||||
bool
|
||||
ThreadTrace::create(CalThreadTraceReference* calRef)
|
||||
{
|
||||
assert(&gpu() == &calRef->gpu());
|
||||
|
||||
calRef_ = calRef;
|
||||
threadTrace_ = calRef->gslThreadTrace();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
ThreadTrace::info(uint infoType, uint* info, uint infoSize) const
|
||||
{
|
||||
switch (infoType) {
|
||||
case CL_THREAD_TRACE_BUFFERS_SIZE: {
|
||||
if (infoSize < amdThreadTraceMemObjsNum_) {
|
||||
LogError("The amount of buffers should be equal to the amount of Shader Engines");
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
Unimplemented();
|
||||
//gslThreadTrace()->GetResultAll(gpu().cs(), info);
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
LogError("Wrong ThreadTrace::getInfo parameter");
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,136 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
// Guard name matches this header (and the comment on the closing #endif).
// NOTE(review): the previous GPUTHREADTRACE_HPP_ name presumably came from the
// GPU backend header and would clash with it if both were included - confirm.
#ifndef PALTHREADTRACE_HPP_
#define PALTHREADTRACE_HPP_
|
||||
|
||||
#include "top.hpp"
|
||||
#include "device/device.hpp"
|
||||
#include "device/pal/paldevice.hpp"
|
||||
#include "palPerfExperiment.h"
|
||||
|
||||
#include <vector>
|
||||
namespace pal {
|
||||
|
||||
class VirtualGPU;
|
||||
|
||||
class CalThreadTraceReference : public amd::ReferenceCountedObject
|
||||
{
|
||||
public:
|
||||
//! Default constructor
|
||||
CalThreadTraceReference(
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
Pal::IPerfExperiment* gslThreadTrace) //!< GSL query thread trace object
|
||||
: gpu_(gpu)
|
||||
, threadTrace_(gslThreadTrace){}
|
||||
|
||||
//! Get GSL thread race object
|
||||
Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
|
||||
|
||||
//! Returns the virtual GPU device
|
||||
const VirtualGPU& gpu() const { return gpu_; }
|
||||
|
||||
protected:
|
||||
//! Default destructor
|
||||
~CalThreadTraceReference();
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
CalThreadTraceReference(const CalThreadTraceReference&);
|
||||
|
||||
//! Disable operator=
|
||||
CalThreadTraceReference& operator=(const CalThreadTraceReference&);
|
||||
|
||||
VirtualGPU& gpu_; //!< The virtual GPU device object
|
||||
Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object
|
||||
};
|
||||
|
||||
//! ThreadTrace implementation on GPU
|
||||
class ThreadTrace : public device::ThreadTrace
|
||||
{
|
||||
public:
|
||||
|
||||
//! Destructor for the GPU ThreadTrace object
|
||||
virtual ~ThreadTrace();
|
||||
|
||||
//! Creates the current object
|
||||
bool create(
|
||||
CalThreadTraceReference* calRef //!< Reference ThreadTrace
|
||||
);
|
||||
|
||||
//! Returns the GPU device, associated with the current object
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
//! Returns the virtual GPU device
|
||||
const VirtualGPU& gpu() const { return gpu_; }
|
||||
|
||||
//! Constructor for the GPU ThreadTrace object
|
||||
ThreadTrace(
|
||||
Device& device, //!< A GPU device object
|
||||
VirtualGPU& gpu, //!< Virtual GPU device object
|
||||
uint amdThreadTraceMemObjsNum)
|
||||
: gpuDevice_(device)
|
||||
, gpu_(gpu)
|
||||
, calRef_(NULL)
|
||||
, index_(0)
|
||||
, amdThreadTraceMemObjsNum_(amdThreadTraceMemObjsNum)
|
||||
{
|
||||
threadTraceBufferObjs_ = new Pal::ThreadTraceLayout[amdThreadTraceMemObjsNum];
|
||||
Unimplemented();
|
||||
for (uint i = 0; i < amdThreadTraceMemObjsNum;++i) {
|
||||
//threadTraceBufferObjs_[i] = gpu.cs()->createShaderTraceBuffer();
|
||||
}
|
||||
}
|
||||
|
||||
//! Returns the specific information about the thread trace object
|
||||
bool info(
|
||||
uint infoType, //!< The type of returned information
|
||||
uint* info, //!< The returned information
|
||||
uint infoSize //!< The size of returned information
|
||||
) const;
|
||||
|
||||
//! Set the ThreadTrace memory buffer size
|
||||
void setMemBufferSizeTT(uint memBufferSizeTT) { memBufferSizeTT_ = memBufferSizeTT;}
|
||||
|
||||
//! Set isNewBufferBinded_ to true/false if new buffer was binded/unbinded respectively
|
||||
void setNewBufferBinded(bool isNewBufferBinded) { isNewBufferBinded_ = isNewBufferBinded; }
|
||||
|
||||
//! Attach Pal::IGpuMemory to the TreadTrace buffer
|
||||
void attachMemToThreadTraceBuffer();
|
||||
|
||||
void setMemObj(size_t memObjSize,std::vector<amd::Memory*> memObj)
|
||||
{
|
||||
memObj_ = memObj;
|
||||
memBufferSizeTT_ = memObjSize;
|
||||
}
|
||||
//! Get GSL thread trace object
|
||||
Pal::IPerfExperiment* gslThreadTrace() const { return threadTrace_; }
|
||||
|
||||
//! Get GSL Thread Trace Buffer objects
|
||||
Pal::ThreadTraceLayout* getThreadTraceBufferObjects() {return threadTraceBufferObjs_;}
|
||||
private:
|
||||
//! Disable default copy constructor
|
||||
ThreadTrace(const ThreadTrace&);
|
||||
|
||||
//! Disable default operator=
|
||||
ThreadTrace& operator=(const ThreadTrace&);
|
||||
|
||||
const Device& gpuDevice_; //!< The backend device
|
||||
|
||||
VirtualGPU& gpu_; //!< The virtual GPU device object
|
||||
|
||||
CalThreadTraceReference* calRef_; //!< Reference ThreadTrace
|
||||
Pal::ThreadTraceLayout* threadTraceBufferObjs_; //!< The buffer object for Thread Trace recording
|
||||
uint index_; //!< ThreadTrace index in the CAL container
|
||||
uint memBufferSizeTT_; //!< ThreadTrace memory buffer size
|
||||
std::vector<amd::Memory*> memObj_; //!< ThreadTrace memory object
|
||||
Pal::IPerfExperiment* threadTrace_; //!< GSL thread trace query object
|
||||
uint amdThreadTraceMemObjsNum_; //!< ThreadTrace memory object`s number (should be equal to the SE number)
|
||||
bool isNewBufferBinded_; //!< The indicator if new buffer was binded to the ThreadTrace object
|
||||
bool isBufferOnSubmit_; //!< The indicator if "new buffer on submit" mode is used
|
||||
};
|
||||
|
||||
} // namespace pal
|
||||
|
||||
#endif // PALTHREADTRACE_HPP_
|
||||
|
||||
@@ -0,0 +1,123 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "os/os.hpp"
|
||||
#include "platform/perfctr.hpp"
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/paltimestamp.hpp"
|
||||
#include "device/pal/palvirtual.hpp"
|
||||
#include "device/pal/palcounters.hpp"
|
||||
|
||||
namespace pal {
|
||||
|
||||
//! Constructs a timestamp for the slot at \p memOffset bytes inside the timer
//! buffer \p iMem. \p cpuAddr is the CPU pointer to the start of that buffer.
TimeStamp::TimeStamp(
    const VirtualGPU&   gpu,
    Pal::IGpuMemory*    iMem,
    uint                memOffset,
    address             cpuAddr)
    : gpu_(gpu)
    , iMem_(iMem)
    , memOffset_(memOffset)
    , values_(reinterpret_cast<volatile uint64_t*>(cpuAddr + memOffset))
{
}
|
||||
|
||||
//! Nothing to release: the destructor body is empty
TimeStamp::~TimeStamp() {}
|
||||
|
||||
void
|
||||
TimeStamp::begin(bool sdma)
|
||||
{
|
||||
if (!flags_.beginIssued_) {
|
||||
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeTop, *iMem_,
|
||||
memOffset_ + CommandStartTime * sizeof(uint64_t));
|
||||
flags_.beginIssued_ = true;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
TimeStamp::end(bool sdma)
|
||||
{
|
||||
CondLog(!flags_.beginIssued_, "We didn't issue a begin operation!");
|
||||
gpu().iCmd()->CmdWriteTimestamp(Pal::HwPipePoint::HwPipeBottom, *iMem_,
|
||||
memOffset_ + CommandEndTime * sizeof(uint64_t));
|
||||
flags_.endIssued_ = true;
|
||||
flags_.sdma_ = sdma;
|
||||
}
|
||||
|
||||
//! Scales \p val by \p nanos and stores the result in \p time.
//! The fractional part of the product is dropped.
inline void
SetValue(uint64_t* time, uint64_t val, double nanos)
{
    const double scaled = static_cast<double>(val) * nanos;
    *time = static_cast<uint64_t>(scaled);
}
|
||||
|
||||
void
|
||||
TimeStamp::value(uint64_t* startTime, uint64_t* endTime)
|
||||
{
|
||||
CondLog(!flags_.endIssued_, "We didn't send the counter end operation!");
|
||||
//! @todo optimize!
|
||||
const double NanoSecondsPerTick = 1000000000.0 / (gpu_.dev().properties().timestampFrequency);
|
||||
|
||||
SetValue(startTime, values_[CommandStartTime], NanoSecondsPerTick);
|
||||
SetValue(endTime, values_[CommandEndTime], NanoSecondsPerTick);
|
||||
}
|
||||
|
||||
//! Deletes the cached TimeStamp objects, then unmaps, unreferences (on both
//! engines) and deletes every timer buffer.
TimeStampCache::~TimeStampCache()
{
    // Time stamp objects that were returned to the cache
    for (TimeStamp* ts : freedTS_) {
        delete ts;
    }
    freedTS_.clear();

    // Memory objects with the timer values
    for (Memory* buf : tsBuf_) {
        buf->unmap(&gpu_);
        gpu_.queue(MainEngine).removeMemRef(buf->iMem());
        gpu_.queue(SdmaEngine).removeMemRef(buf->iMem());
        delete buf;
    }
    tsBuf_.clear();
}
|
||||
|
||||
//! Returns a TimeStamp object with cleared states.
//!
//! A previously freed object is reused if there is one. Otherwise a new object
//! is created for the next free slot of the current timer buffer; a new timer
//! buffer is allocated when there is no buffer yet or the current one is full.
//! \return the time stamp object, or nullptr if an allocation or the CPU
//!         mapping of a new timer buffer failed
TimeStamp*
TimeStampCache::allocTimeStamp()
{
    TimeStamp* ts = nullptr;
    if (!freedTS_.empty()) {
        ts = freedTS_.back();
        freedTS_.pop_back();
    }

    if (nullptr == ts) {
        if ((tsBufCpu_ == nullptr) || ((tsOffset_ + TimerSlotSize) > TimerBufSize)) {
            Memory* buf = new Memory(gpu_.dev(), TimerBufSize);
            if (buf == nullptr || !buf->create(Resource::Remote)) {
                // Don't leak the wrapper object if the resource creation failed
                delete buf;
                return nullptr;
            }
            gpu_.queue(MainEngine).addMemRef(buf->iMem());
            gpu_.queue(SdmaEngine).addMemRef(buf->iMem());

            address cpuAddr = reinterpret_cast<address>(buf->map(&gpu_));
            if (cpuAddr == nullptr) {
                // Mapping failed: undo the references and keep the cache state
                // untouched instead of clearing memory through a null pointer
                gpu_.queue(MainEngine).removeMemRef(buf->iMem());
                gpu_.queue(SdmaEngine).removeMemRef(buf->iMem());
                delete buf;
                return nullptr;
            }
            memset(cpuAddr, 0, TimerBufSize);

            // The new buffer becomes the current one only after full success
            tsBufCpu_ = cpuAddr;
            tsOffset_ = 0;
            tsBuf_.push_back(buf);
        }

        // Allocate a TimeStamp object for the next slot of the current buffer
        ts = new TimeStamp(gpu_, tsBuf_.back()->iMem(), tsOffset_, tsBufCpu_);
        if (ts == nullptr) {
            return nullptr;
        }
        tsOffset_ += TimerSlotSize;
    }

    // Reset the flags and the timer values before the object is handed out
    ts->clearStates();

    return ts;
}
|
||||
|
||||
} // namespace pal
|
||||
@@ -0,0 +1,132 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#ifndef PALTIMESTAMP_HPP_
|
||||
#define PALTIMESTAMP_HPP_
|
||||
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palresource.hpp"
|
||||
|
||||
/*! \addtogroup pal PAL Resource Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
class Device;
|
||||
class VirtualGPU;
|
||||
class Memory;
|
||||
|
||||
class TimeStamp : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Enums for the timestamp information
|
||||
//! \note *4 is the limitaiton of SDMA HW
|
||||
//! (address has to be aligned by 256 bit)
|
||||
enum TimeStampValue {
|
||||
CommandStartTime = 0,
|
||||
CommandEndTime = 4,
|
||||
CommandTotal = 8
|
||||
};
|
||||
|
||||
//! The TimeStamp object flags
|
||||
union Flags
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint32_t beginIssued_ : 1;
|
||||
uint32_t endIssued_ : 1;
|
||||
uint32_t sdma_ : 1;
|
||||
};
|
||||
uint32_t value_;
|
||||
Flags(): value_(0) {}
|
||||
};
|
||||
|
||||
//! Default constructor
|
||||
TimeStamp(
|
||||
const VirtualGPU& gpu, //!< Virtual GPU
|
||||
Pal::IGpuMemory* iMem, //!< Buffer with the timer values
|
||||
uint memOffset, //!< Offset in the buffer for the current TS
|
||||
address cpuAddr //!< CPU pointer for the values in memory
|
||||
);
|
||||
|
||||
//! Default destructor
|
||||
~TimeStamp();
|
||||
|
||||
//! Starts the timestamp
|
||||
void begin(bool sdma = false);
|
||||
|
||||
//! Ends the timestamp
|
||||
void end(bool sdma = false);
|
||||
|
||||
//! Returns the timestamp result in nano seconds
|
||||
void value(uint64_t* startTime, uint64_t* endTime);
|
||||
|
||||
//! Clear all TimeStamp states
|
||||
void clearStates()
|
||||
{ flags_.value_ = 0;
|
||||
values_[CommandStartTime] = 0;
|
||||
values_[CommandEndTime] = 0;
|
||||
}
|
||||
|
||||
//! Timer commands were submitted to HW
|
||||
bool isValid() const { return (flags_.endIssued_) ? true : false; }
|
||||
|
||||
private:
|
||||
//! Disable copy constructor
|
||||
TimeStamp(const TimeStamp&);
|
||||
|
||||
//! Disable operator=
|
||||
TimeStamp& operator=(const TimeStamp&);
|
||||
|
||||
//! Returns the GPU device object
|
||||
const VirtualGPU& gpu() const { return gpu_; }
|
||||
|
||||
const VirtualGPU& gpu_; //!< Virtual GPU
|
||||
Flags flags_; //!< The time stamp state
|
||||
Pal::IGpuMemory* iMem_; //!< Buffer with the timer values
|
||||
uint memOffset_; //!< Offset in the buffer for the current timer
|
||||
volatile uint64_t* values_; //!< CPU pointer to the timer values
|
||||
};
|
||||
|
||||
class TimeStampCache : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
//! Default constructor
|
||||
TimeStampCache(
|
||||
VirtualGPU& gpu //!< Virtual GPU object
|
||||
)
|
||||
: gpu_(gpu)
|
||||
, tsBufCpu_(NULL)
|
||||
, tsOffset_(0) {}
|
||||
|
||||
//! Default destructor
|
||||
~TimeStampCache();
|
||||
|
||||
//! Gets a time stamp object. It will find a freed object or allocate a new one
|
||||
TimeStamp* allocTimeStamp();
|
||||
|
||||
//! Frees a time stamp object
|
||||
void freeTimeStamp(TimeStamp* ts) { freedTS_.push_back(ts); }
|
||||
|
||||
private:
|
||||
static const uint TimerSlotSize = TimeStamp::CommandTotal * sizeof(uint64_t);
|
||||
static const uint TimerBufSize = TimerSlotSize * 4096;
|
||||
|
||||
//! Disable copy constructor
|
||||
TimeStampCache(const TimeStampCache&);
|
||||
|
||||
//! Disable operator=
|
||||
TimeStampCache& operator=(const TimeStampCache&);
|
||||
|
||||
std::vector<TimeStamp*> freedTS_; //!< Array of freed time stamp objects
|
||||
VirtualGPU& gpu_; //!< Virtual GPU
|
||||
std::vector<Memory*> tsBuf_; //!< Array of memory objects with the timer value
|
||||
address tsBufCpu_; //!< CPU pointer for current TS memory
|
||||
uint tsOffset_; //!< Active offset in the current mem object
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALTIMESTAMP_HPP_*/
|
||||
@@ -0,0 +1,187 @@
|
||||
/*******************************************************************************
|
||||
* The source of the runtime trap handler, "runtimetraphandler.sp3".
|
||||
* The binary is created by the SP3 tool with the following command:
|
||||
*
|
||||
* sp3.exe runtimetraphandler.sp3 -hex runtimeTrapCode.hex
|
||||
*
|
||||
*******************************************************************************
|
||||
|
||||
shader main
|
||||
asic(TAHITI) // for SI/CI or asic(VI) for VI
|
||||
type(CS)
|
||||
|
||||
// clear wave exception state
|
||||
v_clrexcp
|
||||
s_waitcnt 0
|
||||
//==========================================================================
|
||||
// Handle the workaround for HW bug that causes the incorrect TMA value.
|
||||
// Retrieve the TMA values, which are stored at TBA buffer at location
|
||||
// 256 (0x100).
|
||||
|
||||
// Construct the memory descriptor with TBA as the start address
|
||||
// we are using the registers ttmp[8:11] for that.
|
||||
s_mov_b32 ttmp8, tba_lo
|
||||
s_and_b32 ttmp9, tba_hi, 0xffff
|
||||
|
||||
// 0x100=256 bytes, which is the size of the buffer to
|
||||
// store all the level 2 trap handler info
|
||||
s_or_b32 ttmp9, ttmp9, 0x01000000
|
||||
s_mov_b32 ttmp10, 0x00002000
|
||||
s_mov_b32 ttmp11, 0x00024fac
|
||||
|
||||
// TMA is stored 256 (0x100) bytes before the TBA value
|
||||
s_sub_u32 ttmp8, ttmp8, 0x100
|
||||
|
||||
// Backup the s0 since ttmp registers cannot be target of
|
||||
// buffer read instruction
|
||||
s_mov_b32 ttmp7, s0
|
||||
s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes)
|
||||
s_waitcnt 0
|
||||
s_mov_b32 tma_lo, s0
|
||||
s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes)
|
||||
s_waitcnt 0
|
||||
s_mov_b32 tma_hi, s0
|
||||
s_mov_b32 s0, ttmp7
|
||||
|
||||
//===================================================
|
||||
// setup the memory descriptor for TMA
|
||||
s_mov_b32 ttmp6, 0x18
|
||||
s_add_u32 ttmp8, tma_lo, ttmp6
|
||||
s_and_b32 ttmp9, tma_hi, 0xffff
|
||||
//0x68=104 bytes, which is the size of the buffer to
|
||||
//store all the level2 trap handler info
|
||||
s_or_b32 ttmp9, ttmp9, 0x00680000
|
||||
s_mov_b32 ttmp10, 0x00002000
|
||||
s_mov_b32 ttmp11, 0x00024fac
|
||||
|
||||
//===================================================
|
||||
// backup the TMA values to be restored later
|
||||
// level-one TMA saved in the ttmp6,ttmp7
|
||||
s_mov_b32 ttmp6, tma_lo
|
||||
s_mov_b32 ttmp7, tma_hi
|
||||
|
||||
//===================================================
|
||||
// setup the TMA for the level-two trap handler
|
||||
// level-two TMA saved in tma_hi, tma_lo
|
||||
s_mov_b32 ttmp3, s0
|
||||
s_buffer_load_dword s0, ttmp8, 0x2 // VI: offset=0x8 (bytes)
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 tma_lo, s0
|
||||
|
||||
s_buffer_load_dword s0, ttmp8, 0x3 // VI: offset=0xc (bytes)
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 tma_hi, s0
|
||||
|
||||
//===================================================
|
||||
// setup the TBA for the level-two trap handler
|
||||
// level-two TBA saved in ttmp9, ttmp8
|
||||
s_buffer_load_dword s0, ttmp8, 0x0 // VI: offset=0x0 (bytes)
|
||||
s_waitcnt 0x0000
|
||||
s_mov_b32 ttmp2, s0
|
||||
|
||||
s_buffer_load_dword s0, ttmp8, 0x1 // VI: offset=0x4 (bytes)
|
||||
s_waitcnt 0x0000
|
||||
|
||||
//swap the values of s0 and ttmp3 without using other registers
|
||||
s_xor_b32 ttmp3, s0, ttmp3
|
||||
s_xor_b32 s0, s0, ttmp3
|
||||
s_xor_b32 ttmp3, s0, ttmp3
|
||||
|
||||
//store the debug trap handler start address in ttmp8,9
|
||||
s_mov_b32 ttmp8, ttmp2
|
||||
s_mov_b32 ttmp9, ttmp3
|
||||
|
||||
//===================================================
|
||||
// get the pc value to resume execution
|
||||
s_getpc_b64 [ttmp2, ttmp3]
|
||||
s_add_u32 ttmp2, ttmp2, 0x8
|
||||
|
||||
//===================================================
|
||||
//set the pc value to jump to the debug trap handler
|
||||
s_setpc_b64 [ttmp8, ttmp9]
|
||||
|
||||
//===================================================
|
||||
// restore the TMA values
|
||||
s_mov_b32 tma_hi, ttmp7
|
||||
s_mov_b32 tma_lo, ttmp6
|
||||
|
||||
label_return:
|
||||
//===================================================
|
||||
// return from the trap handler to the saved PC
|
||||
s_and_b32 ttmp1, ttmp1, 0xffff
|
||||
s_rfe_b64 [ttmp0,ttmp1]
|
||||
|
||||
end
|
||||
|
||||
*******************************************************************************/
|
||||
|
||||
/// Pre-assembled runtime trap handler built from the SP3 listing above with
/// "asic(TAHITI)" (the listing notes this variant is for SI/CI).
/// The words are kept in exactly the order emitted by the assembler.
static const uint32_t RuntimeTrapCode[] = {
    0x7e008200, 0xbf8c0000, 0xbef8036c, 0x8779ff6d,
    0x0000ffff, 0x8879ff79, 0x01000000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x00024fac, 0x80f8ff78,
    0x00000100, 0xbef70300, 0xc2007900, 0xbf8c0000,
    0xbeee0300, 0xc2007901, 0xbf8c0000, 0xbeef0300,
    0xbe800377, 0xbef60398, 0x8078766e, 0x8779ff6f,
    0x0000ffff, 0x8879ff79, 0x00680000, 0xbefa03ff,
    0x00002000, 0xbefb03ff, 0x00024fac, 0xbef6036e,
    0xbef7036f, 0xbef30300, 0xc2007902, 0xbf8c0000,
    0xbeee0300, 0xc2007903, 0xbf8c0000, 0xbeef0300,
    0xc2007900, 0xbf8c0000, 0xbef20300, 0xc2007901,
    0xbf8c0000, 0x89737300, 0x89007300, 0x89737300,
    0xbef80372, 0xbef90373, 0xbef21f00, 0x80728872,
    0xbe802078, 0xbeef0377, 0xbeee0376, 0x8771ff71,
    0x0000ffff, 0xbe802270
};
|
||||
|
||||
|
||||
/// Pre-assembled runtime trap handler built from the same SP3 listing with
/// "asic(VI)". The words are kept in exactly the order emitted by the assembler.
static const uint32_t RuntimeTrapCodeVi[] = {
    0x7e006a00, 0xbf8c0000, 0xbef8006c, 0x8679ff6d,
    0x0000ffff, 0x8779ff79, 0x01000000, 0xbefa00ff,
    0x00002000, 0xbefb00ff, 0x00024fac, 0x80f8ff78,
    0x00000100, 0xbef70000, 0xc022003c, 0x00000000,
    0xbf8c0000, 0xbeee0000, 0xc022003c, 0x00000004,
    0xbf8c0000, 0xbeef0000, 0xbe800077, 0xbef60098,
    0x8078766e, 0x8679ff6f, 0x0000ffff, 0x8779ff79,
    0x00680000, 0xbefa00ff, 0x00002000, 0xbefb00ff,
    0x00024fac, 0xbef6006e, 0xbef7006f, 0xbef30000,
    0xc022003c, 0x00000008, 0xbf8c0000, 0xbeee0000,
    0xc022003c, 0x0000000c, 0xbf8c0000, 0xbeef0000,
    0xc022003c, 0x00000000, 0xbf8c0000, 0xbef20000,
    0xc022003c, 0x00000004, 0xbf8c0000, 0x88737300,
    0x88007300, 0x88737300, 0xbef80072, 0xbef90073,
    0xbef21c00, 0x80728872, 0xbe801d78, 0xbeef0077,
    0xbeee0076, 0x8671ff71, 0x0000ffff, 0xbe801f70
};
|
||||
|
||||
檔案差異因為檔案過大而無法顯示
載入差異
@@ -0,0 +1,576 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef PALVIRTUAL_HPP_
|
||||
#define PALVIRTUAL_HPP_
|
||||
|
||||
#include "device/pal/paldefs.hpp"
|
||||
#include "device/pal/palconstbuf.hpp"
|
||||
#include "device/pal/palprintf.hpp"
|
||||
#include "device/pal/paltimestamp.hpp"
|
||||
#include "device/pal/palsched.hpp"
|
||||
#include "device/pal/paldebugger.hpp"
|
||||
#include "device/blit.hpp"
|
||||
#include "palCmdBuffer.h"
|
||||
#include "palCmdAllocator.h"
|
||||
#include "palQueue.h"
|
||||
|
||||
/*! \addtogroup PAL PAL Resource Implementation
|
||||
* @{
|
||||
*/
|
||||
|
||||
//! PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
class Device;
|
||||
class Kernel;
|
||||
class Memory;
|
||||
class CalCounterReference;
|
||||
class VirtualGPU;
|
||||
class Program;
|
||||
class BlitManager;
|
||||
class ThreadTrace;
|
||||
class HSAILKernel;
|
||||
|
||||
//! Virtual GPU
|
||||
class VirtualGPU : public device::VirtualDevice
|
||||
{
|
||||
public:
|
||||
class Queue : public amd::HeapObject
|
||||
{
|
||||
public:
|
||||
static const uint MaxCmdBuffers = 8;
|
||||
static const uint MaxCommands = 512;
|
||||
static const uint StartCmdBufIdx = 1;
|
||||
static const uint FirstMemoryReference = 0x80000000;
|
||||
static Queue* Create(
|
||||
Pal::IDevice* palDev, //!< PAL device object
|
||||
Pal::QueueType queueType, //!< PAL queue type
|
||||
uint engineIdx, //!< Select particular engine index
|
||||
Pal::ICmdAllocator* cmdAlloc//!< PAL CMD buffer allocator
|
||||
);
|
||||
|
||||
Queue(Pal::IDevice* palDev)
|
||||
: iDev_(palDev), iQueue_(NULL),
|
||||
cmdBufIdSlot_(StartCmdBufIdx), cmdBufIdCurrent_(StartCmdBufIdx),
|
||||
cmbBufIdRetired_(0), cmdCnt_(0)
|
||||
{
|
||||
for (uint i = 0; i < MaxCmdBuffers; ++i) {
|
||||
iCmdBuffs_[i] = NULL;
|
||||
iCmdFences_[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
~Queue();
|
||||
|
||||
void addCmdMemRef(Pal::IGpuMemory* iMem);
|
||||
void removeCmdMemRef(Pal::IGpuMemory* iMem);
|
||||
|
||||
void addMemRef(Pal::IGpuMemory* iMem) const
|
||||
{
|
||||
iDev_->AddGpuMemoryReferences(1, &iMem, NULL);
|
||||
}
|
||||
void removeMemRef(Pal::IGpuMemory* iMem) const
|
||||
{
|
||||
iDev_->RemoveGpuMemoryReferences(1, &iMem, NULL);
|
||||
}
|
||||
|
||||
//! Flushes the current command buffer to HW
|
||||
//! Returns ID associated with the submission
|
||||
uint submit();
|
||||
|
||||
bool flush();
|
||||
|
||||
bool waitForEvent(uint id);
|
||||
|
||||
bool isDone(uint id);
|
||||
|
||||
Pal::ICmdBuffer* iCmd() const { return iCmdBuffs_[cmdBufIdSlot_]; }
|
||||
|
||||
Pal::IQueue* iQueue_; //!< PAL queue object
|
||||
Pal::ICmdBuffer* iCmdBuffs_[MaxCmdBuffers]; //!< PAL command buffers
|
||||
Pal::IFence* iCmdFences_[MaxCmdBuffers]; //!< PAL fences, associated with CMD
|
||||
|
||||
private:
|
||||
Pal::IDevice* iDev_; //!< PAL device
|
||||
uint cmdBufIdSlot_; //!< Command buffer ID slot for submissions
|
||||
uint cmdBufIdCurrent_; //!< Current global command buffer ID
|
||||
uint cmbBufIdRetired_; //!< The last retired command buffer ID
|
||||
uint cmdCnt_; //!< Counter of commands
|
||||
std::map<Pal::IGpuMemory*, uint> memReferences_;
|
||||
};
|
||||
|
||||
struct CommandBatch : public amd::HeapObject
|
||||
{
|
||||
amd::Command* head_; //!< Command batch head
|
||||
GpuEvent events_[AllEngines]; //!< Last known GPU events
|
||||
TimeStamp* lastTS_; //!< TS associated with command batch
|
||||
|
||||
//! Constructor
|
||||
CommandBatch(
|
||||
amd::Command* head, //!< Command batch head
|
||||
const GpuEvent* events, //!< HW events on all engines
|
||||
TimeStamp* lastTS //!< Last TS in command batch
|
||||
): head_(head), lastTS_(lastTS)
|
||||
{
|
||||
memcpy(&events_, events, AllEngines * sizeof(GpuEvent));
|
||||
}
|
||||
};
|
||||
|
||||
//! The virtual GPU states
|
||||
union State
|
||||
{
|
||||
struct
|
||||
{
|
||||
uint boundGlobal_ : 1; //!< Global buffer was bound
|
||||
uint profiling_ : 1; //!< Profiling is enabled
|
||||
uint forceWait_ : 1; //!< Forces wait in flush()
|
||||
uint boundCb_ : 1; //!< Constant buffer was bound
|
||||
uint boundPrintf_ : 1; //!< Printf buffer was bound
|
||||
uint profileEnabled_: 1; //!< Profiling is enabled for WaveLimiter
|
||||
};
|
||||
uint value_;
|
||||
State(): value_(0) {}
|
||||
};
|
||||
|
||||
//! CAL descriptor for the GPU virtual device
|
||||
struct CalVirtualDesc : public amd::EmbeddedObject
|
||||
{
|
||||
GpuEvent events_[AllEngines]; //!< Last known GPU events
|
||||
uint iterations_; //!< Number of iterations for the execution
|
||||
TimeStamp* lastTS_; //!< Last timestamp executed on Virtual GPU
|
||||
};
|
||||
|
||||
typedef std::vector<ConstBuffer*> constbufs_t;
|
||||
|
||||
class MemoryDependency : public amd::EmbeddedObject
|
||||
{
|
||||
public:
|
||||
//! Default constructor
|
||||
MemoryDependency()
|
||||
: memObjectsInQueue_(NULL)
|
||||
, numMemObjectsInQueue_(0)
|
||||
, maxMemObjectsInQueue_(0) {}
|
||||
|
||||
~MemoryDependency() { delete [] memObjectsInQueue_; }
|
||||
|
||||
//! Creates memory dependecy structure
|
||||
bool create(size_t numMemObj);
|
||||
|
||||
//! Notify the tracker about new kernel
|
||||
void newKernel() { endMemObjectsInQueue_ = numMemObjectsInQueue_; }
|
||||
|
||||
//! Validates memory object on dependency
|
||||
void validate(VirtualGPU& gpu, const Memory* memory, bool readOnly);
|
||||
|
||||
//! Clear memory dependency
|
||||
void clear(bool all = true);
|
||||
|
||||
private:
|
||||
struct MemoryState {
|
||||
uint64_t start_; //! Busy memory start address
|
||||
uint64_t end_; //! Busy memory end address
|
||||
bool readOnly_; //! Current GPU state in the queue
|
||||
};
|
||||
|
||||
MemoryState* memObjectsInQueue_; //!< Memory object state in the queue
|
||||
size_t endMemObjectsInQueue_; //!< End of mem objects in the queue
|
||||
size_t numMemObjectsInQueue_; //!< Number of mem objects in the queue
|
||||
size_t maxMemObjectsInQueue_; //!< Maximum number of mem objects in the queue
|
||||
};
|
||||
|
||||
|
||||
class DmaFlushMgmt : public amd::EmbeddedObject
|
||||
{
|
||||
public:
|
||||
DmaFlushMgmt(const Device& dev);
|
||||
|
||||
// Resets DMA command buffer workload
|
||||
void resetCbWorkload(const Device& dev);
|
||||
|
||||
// Finds split size for the current dispatch
|
||||
void findSplitSize(
|
||||
const Device& dev, //!< GPU device object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
|
||||
// Returns TRUE if DMA command buffer is ready for a flush
|
||||
bool isCbReady(
|
||||
VirtualGPU& gpu, //!< Virtual GPU object
|
||||
uint64_t threads, //!< Total number of execution threads
|
||||
uint instructions //!< Number of ALU instructions
|
||||
);
|
||||
|
||||
// Returns dispatch split size
|
||||
uint dispatchSplitSize() const { return dispatchSplitSize_; }
|
||||
|
||||
private:
|
||||
uint64_t maxDispatchWorkload_; //!< Maximum number of operations for a single dispatch
|
||||
uint64_t maxCbWorkload_; //!< Maximum number of operations for DMA command buffer
|
||||
uint64_t cbWorkload_; //!< Current number of operations in DMA command buffer
|
||||
uint aluCnt_; //!< All ALUs on the chip
|
||||
uint dispatchSplitSize_; //!< Dispath split size in elements
|
||||
};
|
||||
|
||||
public:
|
||||
VirtualGPU(Device& device);
|
||||
//! Creates virtual gpu object
|
||||
bool create(
|
||||
bool profiling, //!< Enables profilng on the queue
|
||||
uint deviceQueueSize = 0 //!< Device queue size, 0 if host queue
|
||||
);
|
||||
~VirtualGPU();
|
||||
|
||||
void submitReadMemory(amd::ReadMemoryCommand& vcmd);
|
||||
void submitWriteMemory(amd::WriteMemoryCommand& vcmd);
|
||||
void submitCopyMemory(amd::CopyMemoryCommand& vcmd);
|
||||
void submitMapMemory(amd::MapMemoryCommand& vcmd);
|
||||
void submitUnmapMemory(amd::UnmapMemoryCommand& vcmd);
|
||||
void submitKernel(amd::NDRangeKernelCommand& vcmd);
|
||||
bool submitKernelInternal(
|
||||
const amd::NDRangeContainer& sizes, //!< Workload sizes
|
||||
const amd::Kernel& kernel, //!< Kernel for execution
|
||||
const_address parameters, //!< Parameters for the kernel
|
||||
bool nativeMem = true, //!< Native memory objects
|
||||
amd::Event* enqueueEvent = NULL //!< Event provided in the enqueue kernel command
|
||||
);
|
||||
void submitNativeFn(amd::NativeFnCommand& vcmd);
|
||||
void submitFillMemory(amd::FillMemoryCommand& vcmd);
|
||||
void submitMigrateMemObjects(amd::MigrateMemObjectsCommand& cmd);
|
||||
void submitMarker(amd::Marker& vcmd);
|
||||
void submitAcquireExtObjects(amd::AcquireExtObjectsCommand& vcmd);
|
||||
void submitReleaseExtObjects(amd::ReleaseExtObjectsCommand& vcmd);
|
||||
void submitPerfCounter(amd::PerfCounterCommand& vcmd);
|
||||
void submitThreadTraceMemObjects(amd::ThreadTraceMemObjectsCommand& cmd);
|
||||
void submitThreadTrace(amd::ThreadTraceCommand& vcmd);
|
||||
void submitSignal(amd::SignalCommand & vcmd);
|
||||
void submitMakeBuffersResident(amd::MakeBuffersResidentCommand & vcmd);
|
||||
virtual void submitSvmFreeMemory(amd::SvmFreeMemoryCommand& cmd);
|
||||
virtual void submitSvmCopyMemory(amd::SvmCopyMemoryCommand& cmd);
|
||||
virtual void submitSvmFillMemory(amd::SvmFillMemoryCommand& cmd);
|
||||
virtual void submitSvmMapMemory(amd::SvmMapMemoryCommand& cmd);
|
||||
virtual void submitSvmUnmapMemory(amd::SvmUnmapMemoryCommand& cmd);
|
||||
|
||||
void releaseMemory(Pal::IGpuMemory* iMem, bool wait = true);
|
||||
|
||||
void flush(amd::Command* list = NULL, bool wait = false);
|
||||
bool terminate() { return true; }
|
||||
|
||||
//! Returns GPU device object associated with this kernel
|
||||
const Device& dev() const { return gpuDevice_; }
|
||||
|
||||
//! Returns CAL descriptor of the virtual device
|
||||
const CalVirtualDesc* cal() const { return &cal_; }
|
||||
|
||||
//! Returns a GPU event, associated with GPU memory
|
||||
GpuEvent* getGpuEvent(
|
||||
Pal::IGpuMemory* iMem //!< PAL mem object
|
||||
);
|
||||
|
||||
//! Assigns a GPU event, associated with GPU memory
|
||||
void assignGpuEvent(
|
||||
Pal::IGpuMemory* iMem, //!< PAL mem object
|
||||
GpuEvent gpuEvent
|
||||
);
|
||||
|
||||
//! Set the last known GPU event
|
||||
void setGpuEvent(
|
||||
GpuEvent gpuEvent, //!< GPU event for tracking
|
||||
bool flush = false //!< TRUE if flush is required
|
||||
);
|
||||
|
||||
//! Flush DMA buffer on the specified engine
|
||||
void flushDMA(
|
||||
uint engineID //!< Engine ID for DMA flush
|
||||
);
|
||||
|
||||
//! Wait for all engines on this Virtual GPU
|
||||
//! Returns TRUE if CPU didn't wait for GPU
|
||||
bool waitAllEngines(
|
||||
CommandBatch* cb = NULL //!< Command batch
|
||||
);
|
||||
|
||||
//! Waits for the latest GPU event with a lock to prevent multiple entries
|
||||
void waitEventLock(
|
||||
CommandBatch* cb //!< Command batch
|
||||
);
|
||||
|
||||
//! Returns a resource associated with the constant buffer
|
||||
const ConstBuffer* cb(uint idx) const { return constBufs_[idx]; }
|
||||
|
||||
//! Adds CAL objects into the constant buffer vector
|
||||
void addConstBuffer(ConstBuffer* cb) { constBufs_.push_back(cb); }
|
||||
|
||||
constbufs_t constBufs_; //!< constant buffers
|
||||
|
||||
//! Start the command profiling
|
||||
void profilingBegin(
|
||||
amd::Command& command, //!< Command queue object
|
||||
bool drmProfiling = false //!< Measure DRM time
|
||||
);
|
||||
|
||||
//! End the command profiling
|
||||
void profilingEnd(amd::Command& command);
|
||||
|
||||
//! Collect the profiling results
|
||||
bool profilingCollectResults(
|
||||
CommandBatch* cb, //!< Command batch
|
||||
const amd::Event* waitingEvent //!< Waiting event
|
||||
);
|
||||
|
||||
//! Adds a memory handle into the GSL memory array for Virtual Heap
|
||||
bool addVmMemory(
|
||||
const Memory* memory //!< GPU memory object
|
||||
);
|
||||
|
||||
//! Adds a stage write buffer into a list
|
||||
void addXferWrite(Memory& memory);
|
||||
|
||||
//! Adds a pinned memory object into a map
|
||||
void addPinnedMem(amd::Memory* mem);
|
||||
|
||||
//! Release pinned memory objects
|
||||
void releasePinnedMem();
|
||||
|
||||
//! Finds if pinned memory is cached
|
||||
amd::Memory* findPinnedMem(void* addr, size_t size);
|
||||
|
||||
//! Returns the monitor object for execution access by VirtualGPU
|
||||
amd::Monitor& execution() { return execution_; }
|
||||
|
||||
//! Returns the virtual gpu unique index
|
||||
uint index() const { return index_; }
|
||||
|
||||
//! Get the PrintfDbg object
|
||||
PrintfDbg& printfDbg() const { return *printfDbg_; }
|
||||
|
||||
//! Get the PrintfDbgHSA object
|
||||
PrintfDbgHSA& printfDbgHSA() const { return *printfDbgHSA_; }
|
||||
|
||||
//! Enables synchronized transfers
|
||||
void enableSyncedBlit() const;
|
||||
|
||||
//! Checks if profiling is enabled
|
||||
bool profiling() const { return state_.profiling_; }
|
||||
|
||||
//! Returns memory dependency class
|
||||
MemoryDependency& memoryDependency() { return memoryDependency_; }
|
||||
|
||||
//! Returns hsaQueueMem_
|
||||
const Memory* hsaQueueMem() const { return hsaQueueMem_;}
|
||||
|
||||
//! Returns DMA flush management structure
|
||||
const DmaFlushMgmt& dmaFlushMgmt() const { return dmaFlushMgmt_; }
|
||||
|
||||
//! Releases GSL memory objects allocated on this queue
|
||||
void releaseMemObjects(bool scratch = true);
|
||||
|
||||
//! Returns the HW ring used on this virtual device
|
||||
uint hwRing() const { return hwRing_; }
|
||||
|
||||
//! Returns current timestamp object for profiling
|
||||
TimeStamp* currTs() const { return cal_.lastTS_; }
|
||||
|
||||
//! Returns virtual queue object for device enqueuing
|
||||
Memory* vQueue() const { return virtualQueue_; }
|
||||
|
||||
//! Update virtual queue header
|
||||
void writeVQueueHeader(VirtualGPU& hostQ, uint64_t kernelTable);
|
||||
|
||||
//! Returns TRUE if virtual queue was successfully allocated
|
||||
bool createVirtualQueue(
|
||||
uint deviceQueueSize //!< Device queue size
|
||||
);
|
||||
|
||||
EngineType engineID_; //!< Engine ID for this VirtualGPU
|
||||
State state_; //!< virtual GPU current state
|
||||
CalVirtualDesc cal_; //!< CAL virtual device descriptor
|
||||
|
||||
void flushCuCaches(HwDbgGpuCacheMask cache_mask); //!< flush/invalidate SQ cache
|
||||
|
||||
//! Returns PAL command buffer interface
|
||||
Pal::ICmdBuffer* iCmd() const {
|
||||
Queue* queue = queues_[engineID_];
|
||||
return queue->iCmd();
|
||||
}
|
||||
|
||||
//! Returns queue, associated with VirtualGPU
|
||||
Queue& queue(EngineType id) const { return *queues_[id]; }
|
||||
|
||||
void flushCUCaches() const
|
||||
{
|
||||
Pal::BarrierInfo barrier = {};
|
||||
barrier.pipePointWaitCount = 1;
|
||||
Pal::HwPipePoint point = Pal::HwPipePostCs;
|
||||
barrier.pPipePoints = &point;
|
||||
barrier.transitionCount = 1;
|
||||
Pal::BarrierTransition trans = {Pal::CoherShader, Pal::CoherShader,
|
||||
{nullptr, { {Pal::ImageAspect::Color, 0, 0}, 0, 0 }, Pal::LayoutShaderRead, Pal::LayoutShaderRead}};
|
||||
barrier.pTransitions = &trans;
|
||||
barrier.waitPoint = Pal::HwPipePreCs;
|
||||
iCmd()->CmdBarrier(barrier);
|
||||
}
|
||||
|
||||
//! Marks the start of an event on the given engine
void eventBegin(EngineType engId) const {
    // 'true' selects the "begin" flavour of profileEvent()
    profileEvent(engId, true);
}
|
||||
|
||||
//! Marks the end of an event on the given engine, submits that engine's
//! queue and stores the submission ID and engine in \a event
void eventEnd(EngineType engId, GpuEvent& event) const {
    // 'false' selects the "end" flavour of profileEvent()
    profileEvent(engId, false);

    Queue* engineQueue = queues_[engId];
    event.id = engineQueue->submit();
    event.engineId_ = engId;
}
|
||||
|
||||
//! Waits on the owning engine's queue for a valid event and then invalidates
//! it; an event that is already invalid is left untouched
void waitForEvent(GpuEvent* event) const {
    if (!event->isValid()) {
        return;
    }

    assert(event->engineId_ < AllEngines);
    Queue* engineQueue = queues_[event->engineId_];
    engineQueue->waitForEvent(event->id);
    event->invalidate();
}
|
||||
|
||||
//! Returns TRUE when the event is invalid or its queue reports it as done.
//! A finished event is invalidated before returning.
bool isDone(GpuEvent* event) {
    if (!event->isValid()) {
        return true;
    }

    assert(event->engineId_ < AllEngines);
    const bool finished = queues_[event->engineId_]->isDone(event->id);
    if (finished) {
        event->invalidate();
    }
    return finished;
}
|
||||
|
||||
//! Returns TRUE if SDMA requires overlap synchronization
|
||||
bool validateSdmaOverlap(
|
||||
const Resource& src, //!< Source resource for SDMA transfer
|
||||
const Resource& dst //!< Destination resource for SDMA transfer
|
||||
);
|
||||
protected:
|
||||
void profileEvent(EngineType engine, bool type) const;
|
||||
|
||||
//! Creates buffer object from image
|
||||
amd::Memory* createBufferFromImage(
|
||||
amd::Memory& amdImage //! The parent image object(untiled images only)
|
||||
) const;
|
||||
|
||||
private:
|
||||
//! Address interval [start_, end_]; a new range starts out as 0..0
struct MemoryRange {
    uint64_t    start_;     //!< First address of the range
    uint64_t    end_;       //!< Last address of the range

    MemoryRange()
        : start_(0)
        , end_(0)
    {
    }
};
|
||||
|
||||
typedef std::map<const Pal::IGpuMemory*, GpuEvent> GpuEvents;
|
||||
|
||||
//! Finds total amount of necessary iterations
|
||||
inline void findIterations(
|
||||
const amd::NDRangeContainer& sizes, //!< Original workload sizes
|
||||
const amd::NDRange& local, //!< Local workgroup size
|
||||
amd::NDRange& groups, //!< Calculated workgroup sizes
|
||||
amd::NDRange& remainder, //!< Calculated remainder sizes
|
||||
size_t& extra //!< Amount of extra executions for remainder
|
||||
);
|
||||
|
||||
//! Allocates constant buffers
|
||||
bool allocConstantBuffers();
|
||||
|
||||
//! Releases stage write buffers
|
||||
void releaseXferWrite();
|
||||
|
||||
//! Allocate hsaQueueMem_
|
||||
bool allocHsaQueueMem();
|
||||
|
||||
//! Awaits a command batch with a waiting event
|
||||
bool awaitCompletion(
|
||||
CommandBatch* cb, //!< Command batch for to wait
|
||||
const amd::Event* waitingEvent = NULL //!< A waiting event
|
||||
);
|
||||
|
||||
//! Detects memory dependency for HSAIL kernels and flushes caches
|
||||
bool processMemObjectsHSA(
|
||||
const amd::Kernel& kernel, //!< AMD kernel object for execution
|
||||
const_address params, //!< Pointer to the param's store
|
||||
bool nativeMem, //!< Native memory objects
|
||||
std::vector<const Memory*>* memList //!< Memory list for KMD tracking
|
||||
);
|
||||
|
||||
//! Common function for fill memory used by both svm Fill and non-svm fill
|
||||
bool fillMemory(
|
||||
cl_command_type type, //!< the command type
|
||||
amd::Memory* amdMemory, //!< memory object to fill
|
||||
const void* pattern, //!< pattern to fill the memory
|
||||
size_t patternSize, //!< pattern size
|
||||
const amd::Coord3D& origin, //!< memory origin
|
||||
const amd::Coord3D& size //!< memory size for filling
|
||||
);
|
||||
|
||||
bool copyMemory(
|
||||
cl_command_type type, //!< the command type
|
||||
amd::Memory& srcMem, //!< source memory object
|
||||
amd::Memory& dstMem, //!< destination memory object
|
||||
bool entire, //!< flag of entire memory copy
|
||||
const amd::Coord3D& srcOrigin, //!< source memory origin
|
||||
const amd::Coord3D& dstOrigin, //!< destination memory object
|
||||
const amd::Coord3D& size, //!< copy size
|
||||
const amd::BufferRect& srcRect, //!< region of source for copy
|
||||
const amd::BufferRect& dstRect //!< region of destination for copy
|
||||
);
|
||||
|
||||
void buildKernelInfo(
|
||||
const HSAILKernel& hsaKernel, //!< hsa kernel
|
||||
hsa_kernel_dispatch_packet_t* aqlPkt, //!< aql packet for dispatch
|
||||
HwDbgKernelInfo& kernelInfo, //!< kernel info for the dispatch
|
||||
amd::Event* enqueueEvent //!< Event provided in the enqueue kernel command
|
||||
);
|
||||
|
||||
void assignDebugTrapHandler(
|
||||
const DebugToolInfo& dbgSetting, //!< debug settings
|
||||
HwDbgKernelInfo& kernelInfo //!< kernel info for the dispatch
|
||||
);
|
||||
|
||||
GpuEvents gpuEvents_; //!< GPU events
|
||||
|
||||
Device& gpuDevice_; //!< physical GPU device
|
||||
amd::Monitor execution_; //!< Lock to serialise access to all device objects
|
||||
uint index_; //!< The virtual device unique index
|
||||
|
||||
PrintfDbg* printfDbg_; //!< GPU printf implemenation
|
||||
PrintfDbgHSA* printfDbgHSA_; //!< HSAIL printf implemenation
|
||||
|
||||
TimeStampCache* tsCache_; //!< TimeStamp cache
|
||||
MemoryDependency memoryDependency_; //!< Memory dependency class
|
||||
|
||||
DmaFlushMgmt dmaFlushMgmt_; //!< DMA flush management
|
||||
|
||||
std::list<Memory*> xferWriteBuffers_; //!< Stage write buffers
|
||||
std::list<amd::Memory*> pinnedMems_;//!< Pinned memory list
|
||||
|
||||
typedef std::list<CommandBatch*> CommandBatchList;
|
||||
CommandBatchList cbList_; //!< List of command batches
|
||||
|
||||
uint hwRing_; //!< HW ring used on this virtual device
|
||||
|
||||
uint64_t readjustTimeGPU_; //!< Readjust time between GPU and CPU timestamps
|
||||
TimeStamp* currTs_; //!< current timestamp for command
|
||||
|
||||
AmdVQueueHeader* vqHeader_; //!< Sysmem copy for virtual queue header
|
||||
Memory* virtualQueue_; //!< Virtual device queue
|
||||
Memory* schedParams_; //!< The scheduler parameters
|
||||
uint schedParamIdx_; //!< Index in the scheduler parameters buffer
|
||||
uint deviceQueueSize_; //!< Device queue size
|
||||
uint maskGroups_; //!< The number of mask groups processed in the scheduler by one thread
|
||||
|
||||
Memory* hsaQueueMem_; //!< Memory for the amd_queue_t object
|
||||
Pal::ICmdAllocator* cmdAllocator_; //!< Command buffer allocator
|
||||
Queue* queues_[AllEngines]; //!< HW queues for all engines
|
||||
MemoryRange sdmaRange_; //!< SDMA memory range for write access
|
||||
};
|
||||
|
||||
/*@}*/} // namespace pal
|
||||
|
||||
#endif /*PALVIRTUAL_HPP_*/
|
||||
@@ -0,0 +1,354 @@
|
||||
//
|
||||
// Copyright (c) 2015 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
#include "device/pal/palkernel.hpp"
|
||||
#include "device/pal/palwavelimiter.hpp"
|
||||
#include "os/os.hpp"
|
||||
#include "utils/flags.hpp"
|
||||
|
||||
#include <cstdlib>
|
||||
using namespace std;
|
||||
|
||||
namespace pal {
|
||||
|
||||
uint WaveLimiter::MaxWave;
|
||||
uint WaveLimiter::WarmUpCount;
|
||||
uint WaveLimiter::RunCount;
|
||||
uint WLAlgorithmSmooth::AdaptCount;
|
||||
uint WLAlgorithmSmooth::AbandonThresh;
|
||||
uint WLAlgorithmSmooth::DscThresh;
|
||||
|
||||
//! Sets up the limiter for one kernel: reads the tuning flags into the shared
//! static limits, optionally opens the trace file and starts in WARMUP with
//! every wave counter at MaxWave.
WaveLimiter::WaveLimiter(
    HSAILKernel*    owner,
    uint            seqNum,
    bool            enable,
    bool            enableDump)
    : owner_(owner)
    , dumper_(owner_->name() + "_" + std::to_string(seqNum), enableDump)
{
    const Device* gpuDev = static_cast<const Device*>(&owner_->dev());
    Unimplemented();
    //auto attrib = gpuDev->getAttribs();
    auto hwInfo = gpuDev->hwInfo();

    // The CU-per-shader-array count is hard-coded to 8 here; the attribute
    // query it replaces is still commented out above.
    setIfNotDefault(SIMDPerSH_, GPU_WAVE_LIMIT_CU_PER_SH,
        /*attrib.numberOfCUsperShaderArray*/ 8 * hwInfo->simdPerCU_);

    // RunCount depends on MaxWave, so MaxWave has to be assigned first
    MaxWave = GPU_WAVE_LIMIT_MAX_WAVE;
    WarmUpCount = GPU_WAVE_LIMIT_WARMUP;
    RunCount = GPU_WAVE_LIMIT_RUN * MaxWave;

    state_ = WARMUP;

    if (!flagIsDefault(GPU_WAVE_LIMIT_TRACE)) {
        const std::string tracePath =
            std::string(GPU_WAVE_LIMIT_TRACE) + owner_->name() + ".txt";
        traceStream_.open(tracePath);
    }

    enable_ = enable;
    bestWave_ = MaxWave;
    currWaves_ = MaxWave;
    waves_ = MaxWave;
}
|
||||
|
||||
//! Closes the trace file if one was opened
WaveLimiter::~WaveLimiter() {
    if (!traceStream_.is_open()) {
        return;
    }
    traceStream_.close();
}
|
||||
|
||||
//! Returns the wave limit scaled by SIMDPerSH_ and records the limit
//! being handed out in currWaves_
uint WaveLimiter::getWavesPerSH() {
    currWaves_ = waves_;
    const uint wavesPerSH = waves_ * SIMDPerSH_;
    return wavesPerSH;
}
|
||||
|
||||
//! Builds the base limiter, derives the algorithm's static thresholds and
//! sizes the per-wave tables to MaxWave + 1 entries before clearing them
WLAlgorithmSmooth::WLAlgorithmSmooth(
    HSAILKernel*    owner,
    uint            seqNum,
    bool            enable,
    bool            enableDump)
    : WaveLimiter(owner, seqNum, enable, enableDump)
{
    // MaxWave was set by the base-class constructor
    AdaptCount = 2 * MaxWave + 1;
    AbandonThresh = GPU_WAVE_LIMIT_ABANDON;
    DscThresh = GPU_WAVE_LIMIT_DSC_THRESH;

    dynRunCount_ = RunCount;

    const auto tableSize = MaxWave + 1;
    measure_.resize(tableSize);
    reference_.resize(tableSize);
    trial_.resize(tableSize);
    ratio_.resize(tableSize);

    clearData();
}
|
||||
|
||||
//! Nothing to release here
WLAlgorithmSmooth::~WLAlgorithmSmooth() {}
|
||||
|
||||
//! Resets the adaptation bookkeeping: counters back to zero, the wave limit
//! back to MaxWave and all per-wave tables cleared
void WLAlgorithmSmooth::clearData() {
    // Scalars
    waves_ = MaxWave;
    countAll_ = 0;
    dataCount_ = 0;
    discontinuous_ = false;

    // Per-wave tables
    clear(measure_);
    clear(reference_);
    clear(trial_);
    clear(ratio_);
}
|
||||
|
||||
//! Records one timing sample taken during adaptation.
//!
//! Samples alternate: even-numbered ones (counting from 0) are measurements
//! at MaxWave, odd-numbered ones are trials at a reduced wave count. After
//! every measurement except the first, the two surrounding measurements are
//! averaged into a reference for the trial in between, the trial/reference
//! ratio (in percent) is stored and bestWave_ is updated.
//!
//! \param time  Duration of the sample; must be non-zero
void WLAlgorithmSmooth::updateData(ulong time) {
    auto count = dataCount_ - 1;
    assert(count < 2 * MaxWave + 1);
    assert(time > 0);
    assert(currWaves_ == waves_);
    if (count % 2 == 0) {
        assert(waves_ == MaxWave);
        auto pos = count / 2;
        measure_[pos] = time;
        if (pos > 0) {
            auto wave = MaxWave + 1 - pos;

            // Absolute difference of the two neighbouring measurements, taken
            // in the element type. The previous form cast both values to
            // 'long' before abs(), which truncates 64-bit timings on
            // platforms where 'long' is 32 bits wide.
            const auto previous = measure_[pos - 1];
            const auto current = measure_[pos];
            const auto delta = (previous > current) ? (previous - current)
                                                    : (current - previous);
            if (delta * 100 / current > DscThresh) {
                discontinuous_ = true;
            }

            reference_[wave] = (time + measure_[pos - 1]) / 2;
            ratio_[wave] = trial_[wave] * 100 / reference_[wave];
            if (ratio_[bestWave_] > ratio_[wave] && !discontinuous_) {
                bestWave_ = wave;
            }
        }
    } else {
        assert(waves_ == MaxWave - count / 2);
        trial_[waves_] = time;
    }
    outputTrace();
}
|
||||
|
||||
//! Appends the current limiter state and the measure/reference/ratio tables
//! to the trace file; does nothing when no trace file is open
void WLAlgorithmSmooth::outputTrace() {
    if (traceStream_.is_open()) {
        traceStream_ << "[WaveLimiter] " << owner_->name();
        traceStream_ << " state=" << state_;
        traceStream_ << " currWaves=" << currWaves_;
        traceStream_ << " waves=" << waves_;
        traceStream_ << " bestWave=" << bestWave_ << '\n';

        output(traceStream_, "\n measure = ", measure_);
        output(traceStream_, "\n reference = ", reference_);
        output(traceStream_, "\n ratio = ", ratio_);

        traceStream_ << "\n\n";
    }
}
|
||||
|
||||
|
||||
void WLAlgorithmSmooth::callback(ulong duration) {
|
||||
dumper_.addData(duration, currWaves_, static_cast<char>(state_));
|
||||
|
||||
if (!enable_) {
|
||||
return;
|
||||
}
|
||||
|
||||
countAll_++;
|
||||
|
||||
switch (state_) {
|
||||
case WARMUP:
|
||||
if (countAll_ < WarmUpCount) {
|
||||
return;
|
||||
}
|
||||
state_ = ADAPT;
|
||||
bestWave_ = MaxWave;
|
||||
clearData();
|
||||
return;
|
||||
case ADAPT:
|
||||
assert(duration > 0);
|
||||
if (waves_ == currWaves_) {
|
||||
dataCount_++;
|
||||
updateData(duration);
|
||||
waves_ = MaxWave + 1 - dataCount_ / 2;
|
||||
if (dataCount_ == 1 || (dataCount_ < AdaptCount &&
|
||||
!discontinuous_ && (dataCount_ % 2 == 0 ||
|
||||
ratio_[waves_] < AbandonThresh))) {
|
||||
if (dataCount_ % 2 == 1) {
|
||||
--waves_;
|
||||
} else {
|
||||
waves_ = MaxWave;
|
||||
}
|
||||
return;
|
||||
}
|
||||
waves_ = bestWave_;
|
||||
if (dataCount_ >= AdaptCount) {
|
||||
dynRunCount_ = RunCount;
|
||||
} else {
|
||||
dynRunCount_ = AdaptCount;
|
||||
}
|
||||
countAll_ = rand() % MaxWave;
|
||||
state_ = RUN;
|
||||
}
|
||||
return;
|
||||
case RUN:
|
||||
if (countAll_ < dynRunCount_) {
|
||||
return;
|
||||
}
|
||||
state_ = ADAPT;
|
||||
bestWave_ = MaxWave;
|
||||
clearData();
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
//! Construct a data dumper.
//!
//! \param kernelName  Used to build the output file name.
//! \param enable      When false the dumper stays inert and no file name is set.
WaveLimiter::DataDumper::DataDumper(const std::string &kernelName, bool enable)
    : enable_(enable) {
  if (!enable_) {
    return;
  }
  // File name = GPU_WAVE_LIMIT_DUMP value + kernel name + ".csv".
  std::string path(GPU_WAVE_LIMIT_DUMP);
  path += kernelName;
  path += ".csv";
  fileName_ = path;
}
|
||||
|
||||
//! On destruction an enabled dumper writes all recorded samples to fileName_,
//! one CSV row per sample: index, time, waves per SIMD, state.
WaveLimiter::DataDumper::~DataDumper() {
  if (enable_) {
    std::ofstream csv(fileName_);
    const size_t rows = time_.size();
    size_t row = 0;
    while (row != rows) {
      csv << row << ',' << time_[row] << ',' << wavePerSIMD_[row] << ','
          << static_cast<uint>(state_[row]) << '\n';
      ++row;
    }
    csv.close();
  }
}
|
||||
|
||||
//! Append one sample (execution time, waves per SIMD, limiter state) to the
//! in-memory record. Ignored when the dumper is disabled.
void WaveLimiter::DataDumper::addData(ulong time, uint wave, char state) {
  if (enable_) {
    time_.push_back(time);
    wavePerSIMD_.push_back(wave);
    state_.push_back(state);
  }
}
|
||||
|
||||
//! Construct the averaging limiter: forwards all arguments to WaveLimiter,
//! then starts with a zeroed measure_ table of MaxWave + 1 entries and a zero
//! execution counter.
WLAlgorithmAvrg::WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable,
                                 bool enableDump)
    : WaveLimiter(owner, seqNum, enable, enableDump) {
  countAll_ = 0;
  // Size the table and zero every slot in one step.
  measure_.assign(MaxWave + 1, 0);
}
|
||||
|
||||
//! Destructor. The body is empty: nothing is released explicitly here.
WLAlgorithmAvrg::~WLAlgorithmAvrg() = default;
|
||||
|
||||
//! Append the current limiter state and the measure table to the trace
//! stream. Does nothing when the trace stream is not open.
void WLAlgorithmAvrg::outputTrace() {
  if (traceStream_.is_open()) {
    traceStream_ << "[WaveLimiter] " << owner_->name() << " state=" << state_;
    traceStream_ << " currWaves=" << currWaves_ << " waves=" << waves_;
    traceStream_ << " bestWave=" << bestWave_ << '\n';
    output(traceStream_, "\n measure = ", measure_);
    traceStream_ << "\n\n";
  }
}
|
||||
|
||||
|
||||
//! Profiling callback for the averaging algorithm.
//!
//! While adapting, the execution time is accumulated into measure_[waves_] and
//! waves_ cycles downwards (wrapping from 0 back to MaxWave). Once more than
//! MaxWave * 5 runs have been counted, the setting with the smallest
//! accumulated time becomes both bestWave_ and waves_, and the limiter
//! switches to RUN, where the callback no longer changes anything.
//!
//! \param duration  Execution time of the run that just completed.
void WLAlgorithmAvrg::callback(ulong duration) {
  // The dumper records every run, even when adaptation is disabled.
  dumper_.addData(duration, currWaves_, static_cast<char>(state_));

  if (!enable_) {
    return;
  }

  ++countAll_;

  // WARMUP is left immediately and the same run is treated as an ADAPT sample.
  if (state_ == WARMUP) {
    state_ = ADAPT;
  }
  if (state_ != ADAPT) {
    return;
  }

  measure_[waves_] += duration;

  if (countAll_ <= MaxWave * 5) {
    // Still sampling: move on to the next lower setting.
    if (--waves_ == 0) {
      waves_ = MaxWave;
    }
    return;
  }

  // Sampling done: pick the setting with the smallest accumulated time,
  // starting from MaxWave and scanning 1 .. MaxWave - 1.
  uint best = MaxWave;
  for (uint candidate = 1; candidate < MaxWave; ++candidate) {
    if (measure_[candidate] < measure_[best]) {
      best = candidate;
    }
  }
  bestWave_ = best;
  waves_ = best;
  state_ = RUN;
}
|
||||
|
||||
//! Construct the manager for \p kernel.
//!
//! Adaptation starts disabled; dumping is enabled when GPU_WAVE_LIMIT_DUMP is
//! not at its default. fixed_ is computed as GPU_WAVES_PER_SIMD times the SIMD
//! count per shader array, where that count comes from
//! GPU_WAVE_LIMIT_CU_PER_SH or the fallback 8 * simdPerCU_ passed to
//! setIfNotDefault.
WaveLimiterManager::WaveLimiterManager(HSAILKernel* kernel)
    : owner_(kernel)
    , enable_(false)
    , enableDump_(!flagIsDefault(GPU_WAVE_LIMIT_DUMP)) {
  auto device = static_cast<const Device*>(&owner_->dev());
  Unimplemented();
  //auto attrib = device->getAttribs();
  auto info = device->hwInfo();

  // NOTE(review): the literal 8 stands in for the commented-out
  // attrib.numberOfCUsperShaderArray -- confirm once attribs are available.
  unsigned simdCount = 0;
  setIfNotDefault(simdCount, GPU_WAVE_LIMIT_CU_PER_SH,
      /*attrib.numberOfCUsperShaderArray*/ 8 * info->simdPerCU_);

  fixed_ = GPU_WAVES_PER_SIMD * simdCount;
}
|
||||
|
||||
//! Destroy every wave limiter stored in the per-virtual-device map.
WaveLimiterManager::~WaveLimiterManager() {
  for (auto entry = limiters_.begin(); entry != limiters_.end(); ++entry) {
    delete entry->second;
  }
}
|
||||
|
||||
//! Get the waves-per-shader-array value for a virtual device.
//!
//! \param vdev  Virtual device whose limiter should be consulted.
//! \return fixed_ when it is non-zero; otherwise 0 when adaptation is disabled
//!         or no limiter exists for \p vdev; otherwise the limiter's value.
uint WaveLimiterManager::getWavesPerSH(const device::VirtualDevice *vdev) const {
  // A fixed setting overrides everything else.
  if (fixed_ > 0) {
    return fixed_;
  }

  if (enable_) {
    const auto entry = limiters_.find(vdev);
    if (entry != limiters_.end()) {
      assert(entry->second != nullptr);
      return entry->second->getWavesPerSH();
    }
  }

  return 0;
}
|
||||
|
||||
//! Get the profiling callback (wave limiter) for a virtual device, creating
//! it on first use.
//!
//! \param vdev  Virtual device; must not be null (asserted).
//! \return nullptr when neither adaptation nor dumping is enabled, or when the
//!         limiter could not be allocated; otherwise the limiter for \p vdev.
amd::ProfilingCallback* WaveLimiterManager::getProfilingCallback(
    const device::VirtualDevice *vdev) {
  assert(vdev != nullptr);

  if (!(enable_ || enableDump_)) {
    return nullptr;
  }

  // Map lookup and insertion happen under the manager's monitor.
  amd::ScopedLock SL(monitor_);

  const auto known = limiters_.find(vdev);
  if (known != limiters_.end()) {
    return known->second;
  }

  // The current map size serves as the new limiter's sequence number.
  WLAlgorithmSmooth* created =
      new WLAlgorithmSmooth(owner_, limiters_.size(), enable_, enableDump_);
  if (created == nullptr) {
    enable_ = false;
    return nullptr;
  }

  limiters_[vdev] = created;
  return created;
}
|
||||
|
||||
void WaveLimiterManager::enable() {
|
||||
if (fixed_ > 0) {
|
||||
return;
|
||||
}
|
||||
auto gpuDev = static_cast<const Device*>(&owner_->dev());
|
||||
auto hwInfo = gpuDev->hwInfo();
|
||||
Unimplemented();
|
||||
// Enable it only for CI+, unless GPU_WAVE_LIMIT_ENABLE is set to 1
|
||||
// Disabled for SI due to bug #10817
|
||||
setIfNotDefault(enable_, GPU_WAVE_LIMIT_ENABLE,
|
||||
/*owner_->workGroupInfo()->limitWave_*/ false && gpuDev->settings().ciPlus_);
|
||||
}
|
||||
|
||||
} // namespace pal
|
||||
|
||||
@@ -0,0 +1,154 @@
|
||||
//
|
||||
// Copyright (c) 2008 Advanced Micro Devices, Inc. All rights reserved.
|
||||
//
|
||||
|
||||
#ifndef PALWAVELIMITER_HPP_
|
||||
#define PALWAVELIMITER_HPP_
|
||||
|
||||
#include "platform/command.hpp"
|
||||
#include "thread/thread.hpp"
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <fstream>
|
||||
#include <unordered_map>
|
||||
|
||||
//! \namespace pal PAL Device Implementation
|
||||
namespace pal {
|
||||
|
||||
class HSAILKernel;
|
||||
|
||||
// Adaptively limit the number of waves per SIMD based on kernel execution time
|
||||
class WaveLimiter: public amd::ProfilingCallback {
|
||||
public:
|
||||
explicit WaveLimiter(HSAILKernel*, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WaveLimiter();
|
||||
|
||||
//! Get waves per shader array to be used for kernel execution.
|
||||
uint getWavesPerSH();
|
||||
|
||||
protected:
|
||||
enum StateKind {
|
||||
WARMUP, ADAPT, RUN
|
||||
};
|
||||
|
||||
class DataDumper {
|
||||
public:
|
||||
explicit DataDumper(const std::string &kernelName, bool enable);
|
||||
~DataDumper();
|
||||
|
||||
//! Record execution time, waves/simd and state of wave limiter.
|
||||
void addData(ulong time, uint wave, char state);
|
||||
|
||||
//! Whether this data dumper is enabled.
|
||||
bool enabled() const { return enable_;}
|
||||
private:
|
||||
bool enable_;
|
||||
std::string fileName_;
|
||||
std::vector<ulong> time_;
|
||||
std::vector<uint> wavePerSIMD_;
|
||||
std::vector<char> state_;
|
||||
};
|
||||
|
||||
std::vector<ulong> measure_;
|
||||
bool enable_;
|
||||
uint SIMDPerSH_; // Number of SIMDs per SH
|
||||
uint waves_; // Waves per SIMD to be set
|
||||
uint bestWave_; // Optimal waves per SIMD
|
||||
uint countAll_; // Number of kernel executions
|
||||
StateKind state_;
|
||||
HSAILKernel *owner_;
|
||||
DataDumper dumper_;
|
||||
std::ofstream traceStream_;
|
||||
uint currWaves_; // Current waves per SIMD
|
||||
|
||||
static uint MaxWave; // Maximum number of waves per SIMD
|
||||
static uint WarmUpCount; // Number of kernel executions for warm up
|
||||
static uint RunCount; // Number of kernel executions for normal run
|
||||
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
virtual void callback(ulong duration)=0;
|
||||
|
||||
//! Output trace of measurement/adaptation.
|
||||
virtual void outputTrace()=0;
|
||||
|
||||
template<class T> void clear(T& A) {
|
||||
for (auto &I : A) {
|
||||
I = 0;
|
||||
}
|
||||
}
|
||||
template<class T> void output(std::ofstream &ofs, const std::string &prompt,
|
||||
T& A) {
|
||||
ofs << prompt;
|
||||
for (auto &I : A) {
|
||||
ofs << ' ' << static_cast<ulong>(I);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
class WLAlgorithmSmooth: public WaveLimiter {
|
||||
public:
|
||||
explicit WLAlgorithmSmooth(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WLAlgorithmSmooth();
|
||||
private:
|
||||
std::vector<ulong> reference_;
|
||||
std::vector<ulong> trial_;
|
||||
std::vector<ulong> ratio_;
|
||||
bool discontinuous_; // Measured data is discontinuous
|
||||
uint dynRunCount_;
|
||||
uint dataCount_;
|
||||
|
||||
static uint AdaptCount; // Number of kernel executions for adapting
|
||||
static uint AbandonThresh; // Threshold to abandon adaptation
|
||||
static uint DscThresh; // Threshold for identifying discontinuities
|
||||
|
||||
//! Update measurement data and optimal waves/simd with execution time.
|
||||
void updateData(ulong time);
|
||||
|
||||
//! Clear measurement data for the next adaptation.
|
||||
void clearData();
|
||||
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
void callback(ulong duration);
|
||||
|
||||
//! Output trace of measurement/adaptation.
|
||||
void outputTrace();
|
||||
};
|
||||
|
||||
class WLAlgorithmAvrg: public WaveLimiter {
|
||||
public:
|
||||
explicit WLAlgorithmAvrg(HSAILKernel* owner, uint seqNum, bool enable, bool enableDump);
|
||||
virtual ~WLAlgorithmAvrg();
|
||||
private:
|
||||
//! Call back from Event::recordProfilingInfo to get execution time.
|
||||
void callback(ulong duration);
|
||||
|
||||
//! Output trace of measurement/adaptation.
|
||||
void outputTrace();
|
||||
};
|
||||
|
||||
// Create wave limiter for each virtual device for a kernel and manages the wave limiters.
|
||||
class WaveLimiterManager {
|
||||
public:
|
||||
explicit WaveLimiterManager(HSAILKernel* owner);
|
||||
virtual ~WaveLimiterManager();
|
||||
|
||||
//! Get waves per shader array for a specific virtual device.
|
||||
uint getWavesPerSH(const device::VirtualDevice *) const;
|
||||
|
||||
//! Provide call back function for a specific virtual device.
|
||||
amd::ProfilingCallback* getProfilingCallback(const device::VirtualDevice *);
|
||||
|
||||
//! Enable wave limiter manager by kernel metadata and flags.
|
||||
void enable();
|
||||
private:
|
||||
HSAILKernel* owner_; //!< The kernel which owns this object
|
||||
std::unordered_map<const device::VirtualDevice *,
|
||||
WaveLimiter*> limiters_; //!< Maps virtual device to wave limiter
|
||||
bool enable_; //!< Whether the adaptation is enabled
|
||||
bool enableDump_; //!< Whether the data dumper is enabled
|
||||
uint fixed_; //!< The fixed waves/simd value if not zero
|
||||
amd::Monitor monitor_; //!< The mutex for updating the wave limiter map
|
||||
};
|
||||
}
|
||||
#endif
|
||||
@@ -137,6 +137,10 @@ class HeapObject
|
||||
public:
|
||||
void* operator new(size_t size);
|
||||
void operator delete(void* obj);
|
||||
void* operator new(size_t size, size_t extSize)
|
||||
{ return HeapObject::operator new (size + extSize); };
|
||||
void operator delete(void* obj, size_t extSize)
|
||||
{ HeapObject::operator delete (obj); }
|
||||
};
|
||||
|
||||
/*! \brief For all reference counted objects.
|
||||
@@ -154,6 +158,10 @@ public:
|
||||
|
||||
//! Allocate \p size bytes with the global operator new.
void* operator new(size_t size) {
  return ::operator new(size);
}
|
||||
//! Release \p p with the global operator delete.
void operator delete(void* p) {
  ::operator delete(p);
}
|
||||
void* operator new(size_t size, size_t extSize)
|
||||
{ return ReferenceCountedObject::operator new (size + extSize); };
|
||||
void operator delete(void* obj, size_t extSize)
|
||||
{ ReferenceCountedObject::operator delete (obj); }
|
||||
|
||||
//! Current value of the reference counter.
uint referenceCount() const {
  return referenceCount_;
}
|
||||
|
||||
|
||||
新增問題並參考
封鎖使用者